using System;
using System.Collections.Generic;
using UnityEngine;
using Barracuda;
using MLAgents.Sensors;
using MLAgents.Demonstrations;
using MLAgents.Policies;
namespace MLAgents
{
///
/// Struct that contains all the information for an Agent, including its
/// observations, actions and current status.
///
internal struct AgentInfo
{
///
/// Keeps track of the last vector action taken by the Brain.
///
public float[] storedVectorActions;
///
/// For discrete control, specifies the actions that the agent cannot take. Is true if
/// the action is masked.
///
public bool[] discreteActionMasks;
///
/// Current agent reward.
///
public float reward;
///
/// Whether the agent is done or not.
///
public bool done;
///
/// Whether the agent has reached its max step count for this episode.
///
public bool maxStepReached;
///
/// Episode identifier each agent receives at every reset. It is used
/// to separate between different agents in the environment.
///
public int episodeId;
}
///
/// Struct that contains the action information sent from the Brain to the
/// Agent.
///
internal struct AgentAction
{
public float[] vectorActions;
}
///
/// Agent MonoBehaviour class that is attached to a Unity GameObject, making it
/// an Agent. An agent produces observations and takes actions in the
/// environment. Observations are determined by the cameras attached
/// to the agent in addition to the vector observations implemented by the
/// user in .
/// On the other hand, actions are determined by decisions produced by a Policy.
/// Currently, this class is expected to be extended to implement the desired agent behavior.
///
///
/// Simply speaking, an agent roams through an environment and at each step
/// of the environment extracts its current observation, sends them to its
/// policy and in return receives an action. In practice,
/// however, an agent need not send its observation at every step since very
/// little may have changed between successive steps.
///
/// At any step, an agent may be considered done due to a variety of reasons:
/// - The agent reached an end state within its environment.
/// - The agent reached the maximum # of steps (i.e. timed out).
/// - The academy reached the maximum # of steps (forced agent to be done).
///
/// Here, an agent reaches an end state if it completes its task successfully
/// or somehow fails along the way. In the case where an agent is done before
/// the academy, it either resets and restarts, or just lingers until the
/// academy is done.
///
/// An important note regarding steps and episodes is due. Here, an agent step
/// corresponds to an academy step, which also corresponds to Unity
/// environment step (i.e. each FixedUpdate call). This is not the case for
/// episodes. The academy controls the global episode count and each agent
/// controls its own local episode count and can reset and start a new local
/// episode independently (based on its own experience). Thus an academy
/// (global) episode can be viewed as the upper-bound on an agents episode
/// length and that within a single global episode, an agent may have completed
/// multiple local episodes. Consequently, if an agent max step is
/// set to a value larger than the academy max steps value, then the academy
/// value takes precedence (since the agent max step will never be reached).
///
/// Lastly, note that at any step the policy to the agent is allowed to
/// change model with .
///
/// Implementation-wise, it is required that this class is extended and the
/// virtual methods overridden. For sample implementations of agent behavior,
/// see the Examples/ directory within this Unity project.
///
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/master/" +
"docs/Learning-Environment-Design-Agents.md")]
[Serializable]
[RequireComponent(typeof(BehaviorParameters))]
public class Agent : MonoBehaviour, ISerializationCallbackReceiver
{
IPolicy m_Brain;
BehaviorParameters m_PolicyFactory;
/// This code is here to make the upgrade path for users using maxStep
/// easier. We will hook into the Serialization code and make sure that
/// agentParameters.maxStep and this.maxStep are in sync.
[Serializable]
internal struct AgentParameters
{
public int maxStep;
}
public int TeamId {
get {
LazyInitialize();
return m_PolicyFactory.TeamId;
}
}
public string BehaviorName {
get {
LazyInitialize();
return m_PolicyFactory.behaviorName;
}
}
[SerializeField][HideInInspector]
internal AgentParameters agentParameters;
[SerializeField][HideInInspector]
internal bool hasUpgradedFromAgentParameters;
///
/// The maximum number of steps the agent takes before being done.
///
///
/// If set to 0, the agent can only be set to done programmatically (or
/// when the Academy is done).
/// If set to any positive integer, the agent will be set to done after
/// that many steps. Note that setting the max step to a value greater
/// than the academy max step value renders it useless.
///
[HideInInspector] public int maxStep;
/// Current Agent information (message sent to Brain).
AgentInfo m_Info;
/// Current Agent action (message sent from Brain).
AgentAction m_Action;
/// Represents the reward the agent accumulated during the current step.
/// It is reset to 0 at the beginning of every step.
/// Should be set to a positive value when the agent performs a "good"
/// action that we wish to reinforce/reward, and set to a negative value
/// when the agent performs a "bad" action that we wish to punish/deter.
/// Additionally, the magnitude of the reward should not exceed 1.0
float m_Reward;
/// Keeps track of the cumulative reward in this episode.
float m_CumulativeReward;
/// Whether or not the agent requests an action.
bool m_RequestAction;
/// Whether or not the agent requests a decision.
bool m_RequestDecision;
/// Keeps track of the number of steps taken by the agent in this episode.
/// Note that this value is different for each agent, and may not overlap
/// with the step counter in the Academy, since agents reset based on
/// their own experience.
int m_StepCount;
/// Episode identifier each agent receives. It is used
/// to separate between different agents in the environment.
/// This Id will be changed every time the Agent resets.
int m_EpisodeId;
/// Whether or not the Agent has been initialized already
bool m_Initialized;
/// Keeps track of the actions that are masked at each step.
DiscreteActionMasker m_ActionMasker;
///
/// Set of DemonstrationWriters that the Agent will write its step information to.
/// If you use a DemonstrationRecorder component, this will automatically register its DemonstrationWriter.
/// You can also add your own DemonstrationWriter by calling
/// DemonstrationRecorder.AddDemonstrationWriterToAgent()
///
internal ISet DemonstrationWriters = new HashSet();
///
/// List of sensors used to generate observations.
/// Currently generated from attached SensorComponents, and a legacy VectorSensor
///
internal List sensors;
///
/// VectorSensor which is written to by AddVectorObs
///
internal VectorSensor collectObservationsSensor;
void OnEnable()
{
LazyInitialize();
}
///
///
///
public void OnBeforeSerialize()
{
// Manages a serialization upgrade issue from v0.13 to v0.14 where maxStep moved
// from AgentParameters (since removed) to Agent
if (maxStep == 0 && maxStep != agentParameters.maxStep && !hasUpgradedFromAgentParameters)
{
maxStep = agentParameters.maxStep;
}
hasUpgradedFromAgentParameters = true;
}
///
///
///
public void OnAfterDeserialize()
{
// Manages a serialization upgrade issue from v0.13 to v0.14 where maxStep moved
// from AgentParameters (since removed) to Agent
if (maxStep == 0 && maxStep != agentParameters.maxStep && !hasUpgradedFromAgentParameters)
{
maxStep = agentParameters.maxStep;
}
hasUpgradedFromAgentParameters = true;
}
///
/// Initializes the agent. Can be safely called multiple times.
///
public void LazyInitialize()
{
if (m_Initialized)
{
return;
}
m_Initialized = true;
// Grab the "static" properties for the Agent.
m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
m_PolicyFactory = GetComponent();
m_Info = new AgentInfo();
m_Action = new AgentAction();
sensors = new List();
Academy.Instance.AgentIncrementStep += AgentIncrementStep;
Academy.Instance.AgentSendState += SendInfo;
Academy.Instance.DecideAction += DecideAction;
Academy.Instance.AgentAct += AgentStep;
Academy.Instance.AgentForceReset += _AgentReset;
m_Brain = m_PolicyFactory.GeneratePolicy(Heuristic);
ResetData();
InitializeAgent();
InitializeSensors();
}
///
/// Reason that the Agent is being considered "done"
///
enum DoneReason
{
///
/// The method was called.
///
DoneCalled,
///
/// The max steps for the Agent were reached.
///
MaxStepReached,
///
/// The Agent was disabled
///
Disabled,
}
void OnDisable()
{
DemonstrationWriters.Clear();
// If Academy.Dispose has already been called, we don't need to unregister with it.
// We don't want to even try, because this will lazily create a new Academy!
if (Academy.IsInitialized)
{
Academy.Instance.AgentIncrementStep -= AgentIncrementStep;
Academy.Instance.AgentSendState -= SendInfo;
Academy.Instance.DecideAction -= DecideAction;
Academy.Instance.AgentAct -= AgentStep;
Academy.Instance.AgentForceReset -= _AgentReset;
}
NotifyAgentDone(DoneReason.Disabled);
m_Brain?.Dispose();
m_Initialized = false;
}
void NotifyAgentDone(DoneReason doneReason)
{
m_Info.reward = m_Reward;
m_Info.done = true;
m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;
// Request the last decision with no callbacks
// We request a decision so Python knows the Agent is done immediately
m_Brain?.RequestDecision(m_Info, sensors);
// We also have to write any to any DemonstationStores so that they get the "done" flag.
foreach(var demoWriter in DemonstrationWriters)
{
demoWriter.Record(m_Info, sensors);
}
if (doneReason != DoneReason.Disabled)
{
// We don't want to udpate the reward stats when the Agent is disabled, because this will make
// the rewards look lower than they actually are during shutdown.
UpdateRewardStats();
}
// The Agent is done, so we give it a new episode Id
m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
m_Reward = 0f;
m_CumulativeReward = 0f;
m_RequestAction = false;
m_RequestDecision = false;
}
///
/// Updates the Model for the agent. Any model currently assigned to the
/// agent will be replaced with the provided one. If the arguments are
/// identical to the current parameters of the agent, the model will
/// remain unchanged.
///
/// The identifier of the behavior. This
/// will categorize the agent when training.
///
/// The model to use for inference.
/// Define on what device the model
/// will be run.
public void GiveModel(
string behaviorName,
NNModel model,
InferenceDevice inferenceDevice = InferenceDevice.CPU)
{
m_PolicyFactory.GiveModel(behaviorName, model, inferenceDevice);
m_Brain?.Dispose();
m_Brain = m_PolicyFactory.GeneratePolicy(Heuristic);
}
///
/// Returns the current step counter (within the current episode).
///
///
/// Current step count.
///
public int StepCount
{
get { return m_StepCount; }
}
///
/// Overrides the current step reward of the agent and updates the episode
/// reward accordingly.
///
/// The new value of the reward.
public void SetReward(float reward)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(reward, nameof(reward), nameof(SetReward));
#endif
m_CumulativeReward += (reward - m_Reward);
m_Reward = reward;
}
///
/// Increments the step and episode rewards by the provided value.
///
/// Incremental reward value.
public void AddReward(float increment)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(increment, nameof(increment), nameof(AddReward));
#endif
m_Reward += increment;
m_CumulativeReward += increment;
}
///
/// Retrieves the episode reward for the Agent.
///
/// The episode reward.
public float GetCumulativeReward()
{
return m_CumulativeReward;
}
void UpdateRewardStats()
{
var gaugeName = $"{m_PolicyFactory.behaviorName}.CumulativeReward";
TimerStack.Instance.SetGauge(gaugeName, GetCumulativeReward());
}
///
/// Sets the done flag to true.
///
public void Done()
{
NotifyAgentDone(DoneReason.DoneCalled);
_AgentReset();
}
///
/// Is called when the agent must request the brain for a new decision.
///
public void RequestDecision()
{
m_RequestDecision = true;
RequestAction();
}
///
/// Is called then the agent must perform a new action.
///
public void RequestAction()
{
m_RequestAction = true;
}
/// Helper function that resets all the data structures associated with
/// the agent. Typically used when the agent is being initialized or reset
/// at the end of an episode.
void ResetData()
{
var param = m_PolicyFactory.brainParameters;
m_ActionMasker = new DiscreteActionMasker(param);
// If we haven't initialized vectorActions, initialize to 0. This should only
// happen during the creation of the Agent. In subsequent episodes, vectorAction
// should stay the previous action before the Done(), so that it is properly recorded.
if (m_Action.vectorActions == null)
{
m_Action.vectorActions = new float[param.numActions];
m_Info.storedVectorActions = new float[param.numActions];
}
}
///
/// Initializes the agent, called once when the agent is enabled. Can be
/// left empty if there is no special, unique set-up behavior for the
/// agent.
///
///
/// One sample use is to store local references to other objects in the
/// scene which would facilitate computing this agents observation.
///
public virtual void InitializeAgent()
{
}
///
/// When the Agent uses Heuristics, it will call this method every time it
/// needs an action. This can be used for debugging or controlling the agent
/// with keyboard.
///
/// A float array corresponding to the next action of the Agent
///
public virtual float[] Heuristic()
{
Debug.LogWarning("Heuristic method called but not implemented. Returning placeholder actions.");
var param = m_PolicyFactory.brainParameters;
return new float[param.numActions];
}
///
/// Set up the list of ISensors on the Agent. By default, this will select any
/// SensorBase's attached to the Agent.
///
internal void InitializeSensors()
{
// Get all attached sensor components
SensorComponent[] attachedSensorComponents;
if (m_PolicyFactory.useChildSensors)
{
attachedSensorComponents = GetComponentsInChildren();
}
else
{
attachedSensorComponents = GetComponents();
}
sensors.Capacity += attachedSensorComponents.Length;
foreach (var component in attachedSensorComponents)
{
sensors.Add(component.CreateSensor());
}
// Support legacy CollectObservations
var param = m_PolicyFactory.brainParameters;
if (param.vectorObservationSize > 0)
{
collectObservationsSensor = new VectorSensor(param.vectorObservationSize);
if (param.numStackedVectorObservations > 1)
{
var stackingSensor = new StackingSensor(
collectObservationsSensor, param.numStackedVectorObservations);
sensors.Add(stackingSensor);
}
else
{
sensors.Add(collectObservationsSensor);
}
}
// Sort the Sensors by name to ensure determinism
sensors.Sort((x, y) => x.GetName().CompareTo(y.GetName()));
#if DEBUG
// Make sure the names are actually unique
for (var i = 0; i < sensors.Count - 1; i++)
{
Debug.Assert(
!sensors[i].GetName().Equals(sensors[i + 1].GetName()),
"Sensor names must be unique.");
}
#endif
}
///
/// Sends the Agent info to the linked Brain.
///
void SendInfoToBrain()
{
if (m_Brain == null)
{
return;
}
m_Info.storedVectorActions = m_Action.vectorActions;
m_ActionMasker.ResetMask();
UpdateSensors();
using (TimerStack.Instance.Scoped("CollectObservations"))
{
CollectObservations(collectObservationsSensor);
}
using (TimerStack.Instance.Scoped("CollectDiscreteActionMasks"))
{
if (m_PolicyFactory.brainParameters.vectorActionSpaceType == SpaceType.Discrete)
{
CollectDiscreteActionMasks(m_ActionMasker);
}
}
m_Info.discreteActionMasks = m_ActionMasker.GetMask();
m_Info.reward = m_Reward;
m_Info.done = false;
m_Info.maxStepReached = false;
m_Info.episodeId = m_EpisodeId;
m_Brain.RequestDecision(m_Info, sensors);
// If we have any DemonstrationWriters, write the AgentInfo and sensors to them.
foreach(var demoWriter in DemonstrationWriters)
{
demoWriter.Record(m_Info, sensors);
}
}
void UpdateSensors()
{
foreach (var sensor in sensors)
{
sensor.Update();
}
}
///
/// Collects the vector observations of the agent.
/// The agent observation describes the current environment from the
/// perspective of the agent.
///
///
/// The vector observations for the agent.
///
///
/// An agents observation is any environment information that helps
/// the Agent achieve its goal. For example, for a fighting Agent, its
/// observation could include distances to friends or enemies, or the
/// current level of ammunition at its disposal.
/// Recall that an Agent may attach vector or visual observations.
/// Vector observations are added by calling the provided helper methods
/// on the VectorSensor input:
/// -
/// -
/// -
/// -
/// -
/// -
/// -
/// -
/// Depending on your environment, any combination of these helpers can
/// be used. They just need to be used in the exact same order each time
/// this method is called and the resulting size of the vector observation
/// needs to match the vectorObservationSize attribute of the linked Brain.
/// Visual observations are implicitly added from the cameras attached to
/// the Agent.
///
public virtual void CollectObservations(VectorSensor sensor)
{
}
///
/// Collects the masks for discrete actions.
/// When using discrete actions, the agent will not perform the masked action.
///
///
/// The action masker for the agent.
///
///
/// When using Discrete Control, you can prevent the Agent from using a certain
/// action by masking it with
///
public virtual void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker)
{
}
///
/// Specifies the agent behavior at every step based on the provided
/// action.
///
///
/// Vector action. Note that for discrete actions, the provided array
/// will be of length 1.
///
public virtual void AgentAction(float[] vectorAction)
{
}
///
/// Specifies the agent behavior when being reset, which can be due to
/// the agent or Academy being done (i.e. completion of local or global
/// episode).
///
public virtual void AgentReset()
{
}
///
/// Returns the last action that was decided on by the Agent
///
///
/// The last action that was decided by the Agent (or null if no decision has been made)
///
public float[] GetAction()
{
return m_Action.vectorActions;
}
///
/// This method will forcefully reset the agent and will also reset the hasAlreadyReset flag.
/// This way, even if the agent was already in the process of reseting, it will be reset again
/// and will not send a Done flag at the next step.
///
void ForceReset()
{
_AgentReset();
}
///
/// An internal reset method that updates internal data structures in
/// addition to calling .
///
void _AgentReset()
{
ResetData();
m_StepCount = 0;
AgentReset();
}
///
/// Scales continuous action from [-1, 1] to arbitrary range.
///
///
///
///
///
protected static float ScaleAction(float rawAction, float min, float max)
{
var middle = (min + max) / 2;
var range = (max - min) / 2;
return rawAction * range + middle;
}
///
/// Signals the agent that it must sent its decision to the brain.
///
void SendInfo()
{
// If the Agent is done, it has just reset and thus requires a new decision
if (m_RequestDecision)
{
SendInfoToBrain();
m_Reward = 0f;
m_RequestDecision = false;
}
}
void AgentIncrementStep()
{
m_StepCount += 1;
}
/// Used by the brain to make the agent perform a step.
void AgentStep()
{
if ((m_RequestAction) && (m_Brain != null))
{
m_RequestAction = false;
AgentAction(m_Action.vectorActions);
}
if ((m_StepCount >= maxStep) && (maxStep > 0))
{
NotifyAgentDone(DoneReason.MaxStepReached);
_AgentReset();
}
}
void DecideAction()
{
m_Action.vectorActions = m_Brain?.DecideAction();
if (m_Action.vectorActions == null){
ResetData();
}
}
}
}