# # Unity ML-Agents Toolkit import logging from typing import Dict, List, Deque, Any, Optional, NamedTuple import os import tensorflow as tf import numpy as np from collections import deque, defaultdict from mlagents.envs import UnityException, AllBrainInfo, ActionInfoOutputs, BrainInfo from mlagents.trainers.buffer import Buffer from mlagents.trainers.tf_policy import Policy from mlagents.trainers.trainer import Trainer, UnityTrainerException from mlagents.trainers.components.reward_signals.reward_signal import RewardSignalResult from mlagents.envs import BrainParameters LOGGER = logging.getLogger("mlagents.trainers") RewardSignalResults = Dict[str, RewardSignalResult] class AllRewardsOutput(NamedTuple): """ This class stores all of the outputs of the reward signals, as well as the raw reward from the environment. """ reward_signals: RewardSignalResults environment: np.ndarray class RLTrainer(Trainer): """ This class is the base class for trainers that use Reward Signals. Contains methods for adding BrainInfos to the Buffer. """ def __init__(self, *args, **kwargs): super(RLTrainer, self).__init__(*args, **kwargs) # Make sure we have at least one reward_signal if not self.trainer_parameters["reward_signals"]: raise UnityTrainerException( "No reward signals were defined. At least one must be used with {}.".format( self.__class__.__name__ ) ) # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward # used for reporting only. We always want to report the environment reward to Tensorboard, regardless # of what reward signals are actually present. self.collected_rewards = {"environment": {}} self.training_buffer = Buffer() self.episode_steps = {} def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: """ Constructs a BrainInfo which contains the most recent previous experiences for all agents which correspond to the agents in a provided next_info. :BrainInfo next_info: A t+1 BrainInfo. :return: curr_info: Reconstructed BrainInfo to match agents of next_info. """ visual_observations: List[List[Any]] = [ [] for _ in next_info.visual_observations ] # TODO add types to brain.py methods vector_observations = [] text_observations = [] memories = [] rewards = [] local_dones = [] max_reacheds = [] agents = [] prev_vector_actions = [] prev_text_actions = [] action_masks = [] for agent_id in next_info.agents: agent_brain_info = self.training_buffer[agent_id].last_brain_info if agent_brain_info is None: agent_brain_info = next_info agent_index = agent_brain_info.agents.index(agent_id) for i in range(len(next_info.visual_observations)): visual_observations[i].append( agent_brain_info.visual_observations[i][agent_index] ) vector_observations.append( agent_brain_info.vector_observations[agent_index] ) text_observations.append(agent_brain_info.text_observations[agent_index]) if self.policy.use_recurrent: if len(agent_brain_info.memories) > 0: memories.append(agent_brain_info.memories[agent_index]) else: memories.append(self.policy.make_empty_memory(1)) rewards.append(agent_brain_info.rewards[agent_index]) local_dones.append(agent_brain_info.local_done[agent_index]) max_reacheds.append(agent_brain_info.max_reached[agent_index]) agents.append(agent_brain_info.agents[agent_index]) prev_vector_actions.append( agent_brain_info.previous_vector_actions[agent_index] ) prev_text_actions.append( agent_brain_info.previous_text_actions[agent_index] ) action_masks.append(agent_brain_info.action_masks[agent_index]) if self.policy.use_recurrent: memories = np.vstack(memories) curr_info = BrainInfo( visual_observations, vector_observations, text_observations, memories, rewards, agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds, action_masks, ) return curr_info def add_experiences( self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs: ActionInfoOutputs, ) -> None: """ Adds experiences to each agent's experience history. :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo. :param next_all_info: Dictionary of all current brains and corresponding BrainInfo. :param take_action_outputs: The outputs of the Policy's get_action method. """ self.trainer_metrics.start_experience_collection_timer() if take_action_outputs: self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean()) self.stats["Policy/Learning Rate"].append( take_action_outputs["learning_rate"] ) for name, signal in self.policy.reward_signals.items(): self.stats[signal.value_name].append( np.mean(take_action_outputs["value_heads"][name]) ) curr_info = curr_all_info[self.brain_name] next_info = next_all_info[self.brain_name] for agent_id in curr_info.agents: self.training_buffer[agent_id].last_brain_info = curr_info self.training_buffer[ agent_id ].last_take_action_outputs = take_action_outputs if curr_info.agents != next_info.agents: curr_to_use = self.construct_curr_info(next_info) else: curr_to_use = curr_info # Evaluate and store the reward signals tmp_reward_signal_outs = {} for name, signal in self.policy.reward_signals.items(): tmp_reward_signal_outs[name] = signal.evaluate(curr_to_use, next_info) # Store the environment reward tmp_environment = np.array(next_info.rewards) rewards_out = AllRewardsOutput( reward_signals=tmp_reward_signal_outs, environment=tmp_environment ) for agent_id in next_info.agents: stored_info = self.training_buffer[agent_id].last_brain_info stored_take_action_outputs = self.training_buffer[ agent_id ].last_take_action_outputs if stored_info is not None: idx = stored_info.agents.index(agent_id) next_idx = next_info.agents.index(agent_id) if not stored_info.local_done[idx]: for i, _ in enumerate(stored_info.visual_observations): self.training_buffer[agent_id]["visual_obs%d" % i].append( stored_info.visual_observations[i][idx] ) self.training_buffer[agent_id]["next_visual_obs%d" % i].append( next_info.visual_observations[i][next_idx] ) if self.policy.use_vec_obs: self.training_buffer[agent_id]["vector_obs"].append( stored_info.vector_observations[idx] ) self.training_buffer[agent_id]["next_vector_in"].append( next_info.vector_observations[next_idx] ) if self.policy.use_recurrent: if stored_info.memories.shape[1] == 0: stored_info.memories = np.zeros( (len(stored_info.agents), self.policy.m_size) ) self.training_buffer[agent_id]["memory"].append( stored_info.memories[idx] ) self.training_buffer[agent_id]["masks"].append(1.0) self.training_buffer[agent_id]["done"].append( next_info.local_done[next_idx] ) # Add the outputs of the last eval self.add_policy_outputs(stored_take_action_outputs, agent_id, idx) # Store action masks if neccessary if not self.policy.use_continuous_act: self.training_buffer[agent_id]["action_mask"].append( stored_info.action_masks[idx], padding_value=1 ) self.training_buffer[agent_id]["prev_action"].append( stored_info.previous_vector_actions[idx] ) values = stored_take_action_outputs["value_heads"] # Add the value outputs if needed self.add_rewards_outputs( rewards_out, values, agent_id, idx, next_idx ) for name, rewards in self.collected_rewards.items(): if agent_id not in rewards: rewards[agent_id] = 0 if name == "environment": # Report the reward from the environment rewards[agent_id] += rewards_out.environment[next_idx] else: # Report the reward signals rewards[agent_id] += rewards_out.reward_signals[ name ].scaled_reward[next_idx] if not next_info.local_done[next_idx]: if agent_id not in self.episode_steps: self.episode_steps[agent_id] = 0 self.episode_steps[agent_id] += 1 self.trainer_metrics.end_experience_collection_timer() def end_episode(self) -> None: """ A signal that the Episode has ended. The buffer must be reset. Get only called when the academy resets. """ self.training_buffer.reset_local_buffers() for agent_id in self.episode_steps: self.episode_steps[agent_id] = 0 for rewards in self.collected_rewards.values(): for agent_id in rewards: rewards[agent_id] = 0 def add_policy_outputs( self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int ) -> None: """ Takes the output of the last action and store it into the training buffer. We break this out from add_experiences since it is very highly dependent on the type of trainer. :param take_action_outputs: The outputs of the Policy's get_action method. :param agent_id: the Agent we're adding to. :param agent_idx: the index of the Agent agent_id """ raise UnityTrainerException( "The process_experiences method was not implemented." ) def add_rewards_outputs( self, rewards_out: AllRewardsOutput, values: Dict[str, np.ndarray], agent_id: str, agent_idx: int, agent_next_idx: int, ) -> None: """ Takes the value and evaluated rewards output of the last action and store it into the training buffer. We break this out from add_experiences since it is very highly dependent on the type of trainer. :param take_action_outputs: The outputs of the Policy's get_action method. :param rewards_dict: Dict of rewards after evaluation :param agent_id: the Agent we're adding to. :param agent_idx: the index of the Agent agent_id in the current brain info :param agent_next_idx: the index of the Agent agent_id in the next brain info """ raise UnityTrainerException( "The process_experiences method was not implemented." )