using System; using System.Collections.Generic; using System.Collections.ObjectModel; using UnityEngine; using Unity.Barracuda; using Unity.MLAgents.Actuators; using Unity.MLAgents.Sensors; using Unity.MLAgents.Sensors.Reflection; using Unity.MLAgents.Demonstrations; using Unity.MLAgents.Policies; using UnityEngine.Serialization; namespace Unity.MLAgents { ///

/// <summary>
/// Snapshot of everything that is reported about an Agent: the actions it last
/// took, its discrete action masks, its reward and its episode status.
/// </summary>
internal struct AgentInfo
{
    /// <summary>
    /// The most recent actions taken by the Brain.
    /// </summary>
    public ActionBuffers storedActions;

    /// <summary>
    /// For discrete control, marks the actions that the agent cannot take.
    /// An element of the mask array is true if the action is prohibited.
    /// </summary>
    public bool[] discreteActionMasks;

    /// <summary>
    /// The current agent reward.
    /// </summary>
    public float reward;

    /// <summary>
    /// Whether the agent is done or not.
    /// </summary>
    public bool done;

    /// <summary>
    /// Whether the agent has reached its max step count for this episode.
    /// </summary>
    public bool maxStepReached;

    /// <summary>
    /// Episode identifier each agent receives at every reset. It is used
    /// to separate between different agents in the environment.
    /// </summary>
    public int episodeId;

    /// <summary>
    /// Clears the stored actions in place.
    /// </summary>
    public void ClearActions() => storedActions.Clear();

    /// <summary>
    /// Copies every continuous and discrete action of <paramref name="actionBuffers"/>
    /// into <see cref="storedActions"/>, element by element.
    /// </summary>
    /// <param name="actionBuffers">The buffers to copy the actions from.</param>
    public void CopyActions(ActionBuffers actionBuffers)
    {
        // The segments are taken into locals first so the indexer can be assigned through them.
        var targetContinuous = storedActions.ContinuousActions;
        var sourceContinuous = actionBuffers.ContinuousActions;
        for (var index = 0; index < sourceContinuous.Length; index++)
        {
            targetContinuous[index] = sourceContinuous[index];
        }

        var targetDiscrete = storedActions.DiscreteActions;
        var sourceDiscrete = actionBuffers.DiscreteActions;
        for (var index = 0; index < sourceDiscrete.Length; index++)
        {
            targetDiscrete[index] = sourceDiscrete[index];
        }
    }
}

/// An agent is an actor that can observe its environment, decide on the /// best course of action using those observations, and execute those actions /// within the environment. ///

/// /// Use the Agent class as the subclass for implementing your own agents. Add /// your Agent implementation to a [GameObject] in the [Unity scene] that serves /// as the agent's environment. /// /// Agents in an environment operate in *steps*. At each step, an agent collects observations, /// passes them to its decision-making policy, and receives an action vector in response. /// /// Agents make observations using implementations. The ML-Agents /// API provides implementations for visual observations () /// raycast observations (), and arbitrary /// data observations (). You can add the /// and or /// components to an agent's [GameObject] to use /// those sensor types. You can implement the /// function in your Agent subclass to use a vector observation. The Agent class calls this /// function before it uses the observation vector to make a decision. (If you only use /// visual or raycast observations, you do not need to implement /// .) /// /// Assign a decision making policy to an agent using a /// component attached to the agent's [GameObject]. The setting /// determines how decisions are made: /// /// * : decisions are made by the external process, /// when connected. Otherwise, decisions are made using inference. If no inference model /// is specified in the BehaviorParameters component, then heuristic decision /// making is used. /// * : decisions are always made using the trained /// model specified in the component. /// * : when a decision is needed, the agent's /// function is called. Your implementation is responsible for /// providing the appropriate action. /// /// To trigger an agent decision automatically, you can attach a /// component to the Agent game object. You can also call the agent's /// function manually. You only need to call when the agent is /// in a position to act upon the decision. In many cases, this will be every [FixedUpdate] /// callback, but could be less frequent. 
For example, an agent that hops around its environment /// can only take an action when it touches the ground, so several frames might elapse between /// one decision and the need for the next. /// /// Use the function to implement the actions your agent can take, /// such as moving to reach a goal or interacting with its environment. /// /// When you call on an agent or the agent reaches its count, /// its current episode ends. You can reset the agent -- or remove it from the /// environment -- by implementing the function. An agent also /// becomes done when the resets the environment, which only happens when /// the receives a reset signal from an external process via the /// . /// /// The Agent class extends the Unity [MonoBehaviour] class. You can implement the /// standard [MonoBehaviour] functions as needed for your agent. Since an agent's /// observations and actions typically take place during the [FixedUpdate] phase, you should /// only use the [MonoBehaviour.Update] function for cosmetic purposes. If you override the [MonoBehaviour] /// methods, [OnEnable()] or [OnDisable()], always call the base Agent class implementations. /// /// You can implement the function to specify agent actions using /// your own heuristic algorithm. Implementing a heuristic function can be useful /// for debugging. For example, you can use keyboard input to select agent actions in /// order to manually control an agent's behavior. /// /// Note that you can change the inference model assigned to an agent at any step /// by calling . /// /// See [Agents] and [Reinforcement Learning in Unity] in the [Unity ML-Agents Toolkit manual] for /// more information on creating and training agents. /// /// For sample implementations of agent behavior, see the examples available in the /// [Unity ML-Agents Toolkit] on Github. 
/// /// [MonoBehaviour]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.html /// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html /// [Unity scene]: https://docs.unity3d.com/Manual/CreatingScenes.html /// [FixedUpdate]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.FixedUpdate.html /// [MonoBehaviour.Update]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.Update.html /// [OnEnable()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnEnable.html /// [OnDisable()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnDisable.html] /// [OnBeforeSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnBeforeSerialize.html /// [OnAfterSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnAfterSerialize.html /// [Agents]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md /// [Reinforcement Learning in Unity]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design.md /// [Unity ML-Agents Toolkit]: https://github.com/Unity-Technologies/ml-agents /// [Unity ML-Agents Toolkit manual]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Readme.md /// /// [HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/" + "docs/Learning-Environment-Design-Agents.md")] [Serializable] [RequireComponent(typeof(BehaviorParameters))] public partial class Agent : MonoBehaviour, ISerializationCallbackReceiver, IActionReceiver { IPolicy m_Brain; BehaviorParameters m_PolicyFactory; /// This code is here to make the upgrade path for users using MaxStep /// easier. We will hook into the Serialization code and make sure that /// agentParameters.maxStep and this.maxStep are in sync. 
// Legacy container that only holds the old serialized "maxStep" value. The
// serialization callbacks (OnBeforeSerialize / OnAfterDeserialize) copy it into MaxStep.
[Serializable] internal struct AgentParameters { public int maxStep; }
// Serialized, Inspector-hidden instance of the legacy parameters.
[SerializeField] [HideInInspector] internal AgentParameters agentParameters;
// Set to true by the serialization callbacks once the upgrade check has run,
// so the legacy value is not copied again afterwards.
[SerializeField] [HideInInspector] internal bool hasUpgradedFromAgentParameters;
///

/// <summary>
/// The maximum number of steps the agent takes before being done.
/// </summary>
/// <value>
/// The maximum steps for an agent to take before it resets; or 0 for
/// unlimited steps.
/// </value>
/// <remarks>
/// The max step value determines the maximum length of an agent's episodes.
/// Set to a positive integer to limit the episode length to that many steps.
/// Set to 0 for unlimited episode length.
///
/// When an episode ends and a new one begins, the Agent object's
/// <c>OnEpisodeBegin()</c> function is called. You can implement
/// <c>OnEpisodeBegin()</c> to reset the agent or remove it from the
/// environment. An agent's episode can also end if you call its
/// <see cref="EndEpisode"/> method or an external process resets the
/// environment through the <see cref="Academy"/>.
///
/// Consider limiting the number of steps in an episode to avoid wasting time during
/// training. If you set the max step value to a reasonable estimate of the time it should
/// take to complete a task, then agents that haven't succeeded in that time frame will
/// reset and start a new training episode rather than continue to fail.
/// </remarks>
/// <example>
/// To use a step limit when training while allowing agents to run without resetting
/// outside of training, you can set the max step to 0 in <see cref="Initialize"/>
/// if the <see cref="Academy"/> is not connected to an external process.
/// <code>
/// using Unity.MLAgents;
///
/// public class MyAgent : Agent
/// {
///     public override void Initialize()
///     {
///         if (!Academy.Instance.IsCommunicatorOn)
///         {
///             this.MaxStep = 0;
///         }
///     }
/// }
/// </code>
/// **Note:** in general, you should limit the differences between the code you execute
/// during training and the code you run during inference.
/// </example>
[FormerlySerializedAs("maxStep")] [HideInInspector] public int MaxStep;

/// Current Agent information (message sent to Brain).
AgentInfo m_Info;

/// Represents the reward the agent accumulated during the current step.
/// It is reset to 0 at the beginning of every step.
/// Should be set to a positive value when the agent performs a "good"
/// action that we wish to reinforce/reward, and set to a negative value
/// when the agent performs a "bad" action that we wish to punish/deter.
/// Additionally, the magnitude of the reward should not exceed 1.0.
/// Written by SetReward() and AddReward().
float m_Reward;

/// Keeps track of the cumulative reward in this episode.
float m_CumulativeReward;

/// Whether or not the agent requests an action. Set by RequestAction().
bool m_RequestAction;

/// Whether or not the agent requests a decision. Set by RequestDecision().
bool m_RequestDecision;

/// Keeps track of the number of steps taken by the agent in this episode.
/// Note that this value is different for each agent, and may not overlap
/// with the step counter in the Academy, since agents reset based on
/// their own experience.
int m_StepCount;

/// Number of times the Agent has completed an episode.
/// Not incremented when the Agent is marked done because it was disabled.
int m_CompletedEpisodes;

/// Episode identifier each agent receives. It is used
/// to separate between different agents in the environment.
/// This Id will be changed every time the Agent resets.
int m_EpisodeId;

/// Whether or not the Agent has been initialized already.
/// Set in LazyInitialize() and cleared in OnDisable().
bool m_Initialized;

/// Keeps track of the actions that are masked at each step.
DiscreteActionMasker m_ActionMasker;
///

/// <summary>
/// Set of DemonstrationWriters that the Agent will write its step information to.
/// If you use a DemonstrationRecorder component, this will automatically register its DemonstrationWriter.
/// You can also add your own DemonstrationWriter by calling
/// DemonstrationRecorder.AddDemonstrationWriterToAgent()
/// </summary>
internal ISet<DemonstrationWriter> DemonstrationWriters = new HashSet<DemonstrationWriter>();

/// <summary>
/// List of sensors used to generate observations.
/// Currently generated from attached SensorComponents, and a legacy VectorSensor.
/// Created in LazyInitialize() and filled in InitializeSensors().
/// </summary>
internal List<ISensor> sensors;

/// <summary>
/// VectorSensor that is handed to CollectObservations(). It is only created when
/// the behavior's vector observation size is greater than zero, and is null otherwise.
/// </summary>
internal VectorSensor collectObservationsSensor;

// Checkers wrapped around the calls to the user-implemented CollectObservations()
// and OnEpisodeBegin() callbacks.
private RecursionChecker m_CollectObservationsChecker = new RecursionChecker("CollectObservations");
private RecursionChecker m_OnEpisodeBeginChecker = new RecursionChecker("OnEpisodeBegin");

/// <summary>
/// Manager holding the IActuators that this Agent will delegate actions to.
/// Created in InitializeActuators().
/// </summary>
ActuatorManager m_ActuatorManager;

/// <summary>
/// VectorActuator which is always registered with the ActuatorManager, before any
/// actuator created from an attached ActuatorComponent. It is constructed with this
/// Agent as its action receiver in order to keep backward compatibility with the
/// current behavior of Agent.
/// </summary>
IActuator m_VectorActuator;

/// <summary>
/// This is used to avoid allocation of a float array every frame if users are still using the old
/// OnActionReceived method. Sized to the total number of actions of the VectorActuator.
/// </summary>
float[] m_LegacyActionCache;
///

/// <summary>
/// Called when the attached [GameObject] becomes enabled and active.
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </summary>
/// <remarks>
/// This function initializes the Agent instance, if it hasn't been initialized yet
/// (see <see cref="LazyInitialize"/>). Always call the base Agent class version of
/// this function if you implement `OnEnable()` in your own Agent subclasses.
/// </remarks>
/// <example>
/// <code>
/// protected override void OnEnable()
/// {
///     base.OnEnable();
///     // additional OnEnable logic...
/// }
/// </code>
/// </example>
protected virtual void OnEnable() => LazyInitialize();

/// <summary>
/// Called by Unity immediately before serializing this object.
/// </summary>
/// <remarks>
/// The Agent class uses OnBeforeSerialize() for internal housekeeping. Call the
/// base class implementation if you need your own custom serialization logic.
///
/// See [OnBeforeSerialize] for more information.
///
/// [OnBeforeSerialize]: https://docs.unity3d.com/ScriptReference/ISerializationCallbackReceiver.OnBeforeSerialize.html
/// </remarks>
/// <example>
/// <code>
/// public new void OnBeforeSerialize()
/// {
///     base.OnBeforeSerialize();
///     // additional serialization logic...
/// }
/// </code>
/// </example>
public void OnBeforeSerialize()
{
    // Manages a serialization upgrade issue from v0.13 to v0.14 where MaxStep moved
    // from AgentParameters (since removed) to Agent.
    var legacyMaxStep = agentParameters.maxStep;
    var shouldAdoptLegacyValue =
        !hasUpgradedFromAgentParameters && MaxStep == 0 && legacyMaxStep != 0;
    if (shouldAdoptLegacyValue)
    {
        MaxStep = legacyMaxStep;
    }

    // The check is only meaningful once; remember that it has been done.
    hasUpgradedFromAgentParameters = true;
}

/// <summary>
/// Called by Unity immediately after deserializing this object.
/// </summary>
/// <remarks>
/// The Agent class uses OnAfterDeserialize() for internal housekeeping. Call the
/// base class implementation if you need your own custom deserialization logic.
///
/// See [OnAfterDeserialize] for more information.
///
/// [OnAfterDeserialize]: https://docs.unity3d.com/ScriptReference/ISerializationCallbackReceiver.OnAfterDeserialize.html
/// </remarks>
/// <example>
/// <code>
/// public new void OnAfterDeserialize()
/// {
///     base.OnAfterDeserialize();
///     // additional deserialization logic...
/// }
/// </code>
/// </example>
public void OnAfterDeserialize()
{
    // Manages a serialization upgrade issue from v0.13 to v0.14 where MaxStep moved
    // from AgentParameters (since removed) to Agent.
    if (!hasUpgradedFromAgentParameters)
    {
        var legacyMaxStep = agentParameters.maxStep;
        // Only adopt the legacy value when MaxStep is still unset and the legacy one is not.
        if (MaxStep == 0 && legacyMaxStep != 0)
        {
            MaxStep = legacyMaxStep;
        }
    }

    hasUpgradedFromAgentParameters = true;
}

/// <summary>
/// Initializes the agent. Can be safely called multiple times.
/// </summary>
/// <remarks>
/// This function calls your <see cref="Initialize"/> implementation, if one exists.
/// Calls made while the agent is already initialized return immediately.
/// </remarks>
public void LazyInitialize()
{
    if (m_Initialized)
    {
        return;
    }
    m_Initialized = true;

    // Grab the "static" properties for the Agent.
    m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
    m_PolicyFactory = GetComponent<BehaviorParameters>();

    m_Info = new AgentInfo();
    sensors = new List<ISensor>();

    // Hook this agent into the Academy's step events; OnDisable() removes these handlers.
    Academy.Instance.AgentIncrementStep += AgentIncrementStep;
    Academy.Instance.AgentSendState += SendInfo;
    Academy.Instance.DecideAction += DecideAction;
    Academy.Instance.AgentAct += AgentStep;
    Academy.Instance.AgentForceReset += _AgentReset;

    // Actuators must exist before the policy is generated, because the policy is
    // built from the combined action spec of the ActuatorManager.
    using (TimerStack.Instance.Scoped("InitializeActuators"))
    {
        InitializeActuators();
    }

    m_Brain = m_PolicyFactory.GeneratePolicy(m_ActuatorManager.GetCombinedActionSpec(), Heuristic);
    ResetData();
    Initialize();

    using (TimerStack.Instance.Scoped("InitializeSensors"))
    {
        InitializeSensors();
    }

    m_Info.storedActions = new ActionBuffers(
        new float[m_ActuatorManager.NumContinuousActions],
        new int[m_ActuatorManager.NumDiscreteActions]
    );

    // The first time the Academy resets, all Agents in the scene will be
    // forced to reset through the <see cref="AgentForceReset"/> event.
    // To avoid the Agent resetting twice, the Agents will not begin their
    // episode when initializing until after the Academy had its first reset.
    if (Academy.Instance.TotalStepCount != 0)
    {
        using (m_OnEpisodeBeginChecker.Start())
        {
            OnEpisodeBegin();
        }
    }
}

/// <summary>
/// The reason that the Agent has been set to "done".
/// </summary>
enum DoneReason {
    /// <summary>
    /// The episode was ended manually by calling <see cref="EndEpisode"/>.
    /// </summary>
    DoneCalled,

    /// <summary>
    /// The max steps for the Agent were reached. Also used by
    /// <see cref="EpisodeInterrupted"/>.
    /// </summary>
    MaxStepReached,

    /// <summary>
    /// The Agent was disabled. Also used by <see cref="SetModel"/> before the policy
    /// is replaced. Episodes ended for this reason are not counted as completed.
    /// </summary>
    Disabled, }
///

/// <summary>
/// Called when the attached [GameObject] becomes disabled and inactive.
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </summary>
/// <remarks>
/// Always call the base Agent class version of this function if you implement `OnDisable()`
/// in your own Agent subclasses.
/// </remarks>
/// <example>
/// <code>
/// protected override void OnDisable()
/// {
///     base.OnDisable();
///     // additional OnDisable logic...
/// }
/// </code>
/// </example>
/// <seealso cref="OnEnable"/>
protected virtual void OnDisable()
{
    DemonstrationWriters.Clear();

    // If Academy.Dispose has already been called, we don't need to unregister with it.
    // We don't want to even try, because this will lazily create a new Academy!
    if (Academy.IsInitialized)
    {
        var academy = Academy.Instance;
        academy.AgentIncrementStep -= AgentIncrementStep;
        academy.AgentSendState -= SendInfo;
        academy.DecideAction -= DecideAction;
        academy.AgentAct -= AgentStep;
        academy.AgentForceReset -= _AgentReset;
        NotifyAgentDone(DoneReason.Disabled);
    }

    m_Brain?.Dispose();
    // Allows LazyInitialize() to run again the next time the agent is enabled.
    m_Initialized = false;
}

/// <summary>
/// Marks the agent as done, sends a final decision request to the policy so the
/// "done" flag is reported, records the final step to the demonstration writers,
/// and clears the per-episode reward and request state.
/// </summary>
/// <param name="doneReason">Why the agent is being marked as done.</param>
void NotifyAgentDone(DoneReason doneReason)
{
    // The Agent was already marked as Done and should not be notified again.
    if (m_Info.done)
    {
        return;
    }

    m_Info.done = true;
    m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;
    m_Info.reward = m_Reward;
    m_Info.episodeId = m_EpisodeId;

    if (collectObservationsSensor != null)
    {
        // Make sure the latest observations are being passed to training.
        collectObservationsSensor.Reset();
        using (m_CollectObservationsChecker.Start())
        {
            CollectObservations(collectObservationsSensor);
        }
    }

    // Request the last decision with no callbacks.
    // We request a decision so Python knows the Agent is done immediately.
    m_Brain?.RequestDecision(m_Info, sensors);
    ResetSensors();

    // The final step is also written to every DemonstrationWriter so that the
    // recorded demonstrations get the "done" flag.
    foreach (var writer in DemonstrationWriters)
    {
        writer.Record(m_Info, sensors);
    }

    // We don't want to update the reward stats when the Agent is disabled, because this will make
    // the rewards look lower than they actually are during shutdown.
    var countsAsCompletedEpisode = doneReason != DoneReason.Disabled;
    if (countsAsCompletedEpisode)
    {
        m_CompletedEpisodes++;
        UpdateRewardStats();
    }

    // Start the next episode from a clean slate.
    m_Reward = 0f;
    m_CumulativeReward = 0f;
    m_RequestAction = false;
    m_RequestDecision = false;
    m_Info.storedActions.Clear();
}

/// <summary>
/// Updates the Model assigned to this Agent instance.
/// </summary>
/// <remarks>
/// If the agent already has an assigned model, that model is replaced with the
/// provided one. However, if you call this function with arguments that are
/// identical to the current parameters of the agent, then no changes are made.
///
/// **Note:** the <paramref name="behaviorName"/> parameter is ignored when not training.
/// The <paramref name="model"/> and <paramref name="inferenceDevice"/> parameters
/// are ignored when not using inference.
/// </remarks>
/// <param name="behaviorName">The identifier of the behavior. This
/// will categorize the agent when training.</param>
/// <param name="model">The model to use for inference.</param>
/// <param name="inferenceDevice">Define the device on which the model
/// will be run.</param>
public void SetModel(
    string behaviorName,
    NNModel model,
    InferenceDevice inferenceDevice = InferenceDevice.CPU)
{
    var sameAsCurrent =
        behaviorName == m_PolicyFactory.BehaviorName &&
        model == m_PolicyFactory.Model &&
        inferenceDevice == m_PolicyFactory.InferenceDevice;
    if (sameAsCurrent)
    {
        // If everything is the same, don't make any changes.
        return;
    }

    // End the current episode (reported with the Disabled reason) before the policy is swapped.
    NotifyAgentDone(DoneReason.Disabled);

    m_PolicyFactory.Model = model;
    m_PolicyFactory.InferenceDevice = inferenceDevice;
    m_PolicyFactory.BehaviorName = behaviorName;
    ReloadPolicy();
}

/// <summary>
/// Disposes the current policy and generates a new one from the BehaviorParameters.
/// Does nothing when the agent has not been initialized yet.
/// </summary>
internal void ReloadPolicy()
{
    // If we haven't initialized yet, no need to make any changes now; they'll
    // happen in LazyInitialize later.
    if (m_Initialized)
    {
        m_Brain?.Dispose();
        m_Brain = m_PolicyFactory.GeneratePolicy(m_ActuatorManager.GetCombinedActionSpec(), Heuristic);
    }
}

/// <summary>
/// Returns the current step counter (within the current episode).
/// </summary>
/// <returns>
/// Current step count.
/// </returns>
public int StepCount => m_StepCount;

/// <summary>
/// Returns the number of episodes that the Agent has completed (either
/// <see cref="EndEpisode"/> was called, or maxSteps was reached).
/// </summary>
/// <returns>
/// Current episode count.
/// </returns>
public int CompletedEpisodes => m_CompletedEpisodes;

/// <summary>
/// Overrides the current step reward of the agent and updates the episode
/// reward accordingly.
/// </summary>
/// <remarks>
/// This function replaces any rewards given to the agent during the current step.
/// Use <see cref="AddReward(float)"/> to incrementally change the reward rather than
/// overriding it.
///
/// Typically, you assign rewards in the Agent subclass's action-handling
/// implementation after carrying out the received action and evaluating its success.
///
/// Rewards are used during reinforcement learning; they are ignored during inference.
///
/// See [Agents - Rewards] for general advice on implementing rewards and [Reward Signals]
/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// </remarks>
/// <param name="reward">The new value of the reward.</param>
public void SetReward(float reward)
{
#if DEBUG
    Utilities.DebugCheckNanAndInfinity(reward, nameof(reward), nameof(SetReward));
#endif
    // Shift the episode total by the difference between the new and the old step reward.
    m_CumulativeReward += reward - m_Reward;
    m_Reward = reward;
}

/// <summary>
/// Increments the step and episode rewards by the provided value.
/// </summary>
/// <remarks>
/// Use a positive reward to reinforce desired behavior. You can use a
/// negative reward to penalize mistakes. Use <see cref="SetReward(float)"/> to
/// set the reward assigned to the current step with a specific value rather than
/// increasing or decreasing it.
///
/// Typically, you assign rewards in the Agent subclass's action-handling
/// implementation after carrying out the received action and evaluating its success.
///
/// Rewards are used during reinforcement learning; they are ignored during inference.
///
/// See [Agents - Rewards] for general advice on implementing rewards and [Reward Signals]
/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// </remarks>
/// <param name="increment">Incremental reward value.</param>
public void AddReward(float increment)
{
#if DEBUG
    Utilities.DebugCheckNanAndInfinity(increment, nameof(increment), nameof(AddReward));
#endif
    // Both the episode total and the current step reward move by the same amount.
    m_CumulativeReward += increment;
    m_Reward += increment;
}

/// <summary>
/// Retrieves the episode reward for the Agent.
/// </summary>
/// <returns>The episode reward.</returns>
public float GetCumulativeReward() => m_CumulativeReward;

/// <summary>
/// Publishes the current episode reward as a gauge on the TimerStack, keyed by
/// the behavior name of this agent.
/// </summary>
void UpdateRewardStats()
{
    TimerStack.Instance.SetGauge(
        $"{m_PolicyFactory.BehaviorName}.CumulativeReward",
        GetCumulativeReward());
}

/// <summary>
/// Sets the done flag to true and resets the agent.
/// </summary>
/// <remarks>
/// This should be used when the episode can no longer continue, such as when the Agent
/// reaches the goal or fails at the task.
/// </remarks>
/// <seealso cref="EpisodeInterrupted"/>
public void EndEpisode() => EndEpisodeAndReset(DoneReason.DoneCalled);

/// <summary>
/// Indicate that the episode is over but not due to the "fault" of the Agent.
/// This has the same end result as calling <see cref="EndEpisode"/>, but has a
/// slightly different effect on training.
/// </summary>
/// <remarks>
/// This should be used when the episode could continue, but has gone on for
/// a sufficient number of steps. The episode is reported with the same reason
/// that is used when the max step count is reached.
/// </remarks>
/// <seealso cref="EndEpisode"/>
public void EpisodeInterrupted() => EndEpisodeAndReset(DoneReason.MaxStepReached);

/// <summary>
/// Internal method to end the episode and reset the Agent.
/// </summary>
/// <param name="reason">The reason the episode is ending; forwarded to NotifyAgentDone.</param>
void EndEpisodeAndReset(DoneReason reason)
{
    // Report the end of the episode first, then reset the agent for the next one.
    NotifyAgentDone(reason);
    _AgentReset();
}

/// <summary>
/// Requests a new decision for this agent.
/// </summary>
/// <remarks>
/// Call `RequestDecision()` whenever an agent needs a decision. You often
/// want to request a decision every environment step. However, if an agent
/// cannot use the decision every step, then you can request a decision less
/// frequently.
///
/// You can add a decision-requester component to the agent's
/// [GameObject] to drive the agent's decision making. When you use this component,
/// do not call `RequestDecision()` separately.
///
/// Note that this function calls <see cref="RequestAction"/>; you do not need to
/// call both functions at the same time.
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
public void RequestDecision()
{
    // A decision request always implies an action request as well.
    RequestAction();
    m_RequestDecision = true;
}

/// <summary>
/// Requests an action for this agent.
/// </summary>
/// <remarks>
/// Call `RequestAction()` to repeat the previous action returned by the agent's
/// most recent decision. A new decision is not requested. When you call this function,
/// the Agent instance invokes its action-handling callback with the
/// existing action vector.
///
/// You can use `RequestAction()` in situations where an agent must take an action
/// every update, but doesn't need to make a decision as often. For example, an
/// agent that moves through its environment might need to apply an action to keep
/// moving, but only needs to make a decision to change course or speed occasionally.
///
/// You can add a decision-requester component to the agent's
/// [GameObject] to drive the agent's decision making and action frequency. When you
/// use this component, do not call `RequestAction()` separately.
///
/// Note that <see cref="RequestDecision"/> calls `RequestAction()`; you do not need to
/// call both functions at the same time.
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
public void RequestAction() => m_RequestAction = true;

/// Helper function that resets all the data structures associated with
/// the agent. Typically used when the agent is being initialized or reset
/// at the end of an episode. Does nothing if no ActuatorManager exists yet.
void ResetData() => m_ActuatorManager?.ResetData();

/// <summary>
/// Implement `Initialize()` to perform one-time initialization or set up of the
/// Agent instance.
/// </summary>
/// <remarks>
/// `Initialize()` is called from <see cref="LazyInitialize"/> when the agent is
/// initialized. If, for example, the Agent object needs references to other
/// [GameObject]s in the scene, you can collect and store those references here.
///
/// Note that `OnEpisodeBegin()` is called at the start of each of
/// the agent's "episodes". You can use that function for items that need to be reset
/// for each episode.
///
/// The default implementation does nothing.
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
public virtual void Initialize()
{
}

/// <summary>
/// Implement `Heuristic()` to choose an action for this agent using a custom heuristic.
/// </summary>
/// <remarks>
/// Implement this function to provide custom decision making logic or to support manual
/// control of an agent using keyboard, mouse, or game controller input.
///
/// Your heuristic implementation can use any decision making logic you specify. Assign decision
/// values to the continuous and discrete action arrays of the <paramref name="actionsOut"/>
/// buffers passed to your function as a parameter.
/// The same array will be reused between steps. It is up to the user to initialize
/// the values on each call, for example by calling `Array.Clear(actionsOut, 0, actionsOut.Length);`.
/// Add values to the array at the same indexes as they are used in your
/// action-handling function, which receives this array and
/// implements the corresponding agent behavior. See [Actions] for more information
/// about agent actions.
/// Note : Do not create a new float array of action in the `Heuristic()` method,
/// as this will prevent writing floats to the original action array.
///
/// An agent calls this `Heuristic()` function to make a decision when you set its behavior
/// type to heuristic-only. The agent also calls this function if
/// you set its behavior type to the default when the Academy
/// is not connected to an external training process and you do not
/// assign a trained model to the agent.
///
/// To perform imitation learning, implement manual control of the agent in the `Heuristic()`
/// function so that you can record the demonstrations required for the imitation learning
/// algorithms. (Attach a [Demonstration Recorder] component to the agent's [GameObject] to
/// record the demonstration session to a file.)
///
/// Even when you don't plan to use heuristic decisions for an agent or imitation learning,
/// implementing a simple heuristic function can aid in debugging agent actions and interactions
/// with its environment.
///
/// The default implementation forwards to the obsolete float-array overload for backward
/// compatibility: for a continuous action space it passes the continuous array and clears
/// the discrete actions; for a discrete action space it passes a float copy of the discrete
/// array, copies the results back as integers and clears the continuous actions.
///
/// [Demonstration Recorder]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
/// <example>
/// The following example illustrates a `Heuristic()` function that provides WASD-style
/// keyboard control for an agent that can move in two dimensions as well as jump. See
/// [Input Manager] for more information about the built-in Unity input functions.
/// You can also use the [Input System package], which provides a more flexible and
/// configurable input system.
/// <code>
///     public override void Heuristic(in ActionBuffers actionsOut)
///     {
///         var continuousActionsOut = actionsOut.ContinuousActions;
///         continuousActionsOut[0] = Input.GetAxis("Horizontal");
///         continuousActionsOut[1] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
///         continuousActionsOut[2] = Input.GetAxis("Vertical");
///     }
/// </code>
/// [Input Manager]: https://docs.unity3d.com/Manual/class-InputManager.html
/// [Input System package]: https://docs.unity3d.com/Packages/com.unity.inputsystem@1.0/manual/index.html
/// </example>
/// <param name="actionsOut">The <see cref="ActionBuffers"/> which contain the continuous and
/// discrete action buffers to write to.</param>
public virtual void Heuristic(in ActionBuffers actionsOut)
{
    // Disable deprecation warnings so we can call the legacy overload.
#pragma warning disable CS0618

    // The default implementation of Heuristic calls the
    // obsolete version for backward compatibility.
    var actionSpaceType = m_PolicyFactory.BrainParameters.VectorActionSpaceType;
    if (actionSpaceType == SpaceType.Continuous)
    {
        Heuristic(actionsOut.ContinuousActions.Array);
        actionsOut.DiscreteActions.Clear();
    }
    else if (actionSpaceType == SpaceType.Discrete)
    {
        // The legacy overload works on floats, so round-trip the discrete actions through a float copy.
        var legacyBuffer = Array.ConvertAll(actionsOut.DiscreteActions.Array, value => (float)value);
        Heuristic(legacyBuffer);

        var discreteOut = actionsOut.DiscreteActions;
        for (var index = 0; index < actionsOut.DiscreteActions.Length; index++)
        {
            discreteOut[index] = (int)legacyBuffer[index];
        }
        actionsOut.ContinuousActions.Clear();
    }

#pragma warning restore CS0618
}

/// <summary>
/// Set up the list of ISensors on the Agent. By default, this will select any
/// SensorComponent's attached to the Agent.
/// </summary>
/// <remarks>
/// Sensors are gathered from three sources, in this order: sensors generated from
/// observable attributes (unless the BehaviorParameters say to ignore them), sensors
/// created by attached SensorComponents (optionally including children), and the legacy
/// vector sensor used by CollectObservations (only when the vector observation size is
/// greater than zero). The resulting list is then sorted by sensor name.
/// </remarks>
internal void InitializeSensors()
{
    if (m_PolicyFactory.ObservableAttributeHandling != ObservableAttributeOptions.Ignore)
    {
        var excludeInherited =
            m_PolicyFactory.ObservableAttributeHandling == ObservableAttributeOptions.ExcludeInherited;
        using (TimerStack.Instance.Scoped("CreateObservableSensors"))
        {
            var observableSensors = ObservableAttribute.CreateObservableSensors(this, excludeInherited);
            sensors.AddRange(observableSensors);
        }
    }

    // Get all attached sensor components
    SensorComponent[] attachedSensorComponents;
    if (m_PolicyFactory.UseChildSensors)
    {
        attachedSensorComponents = GetComponentsInChildren<SensorComponent>();
    }
    else
    {
        attachedSensorComponents = GetComponents<SensorComponent>();
    }

    sensors.Capacity += attachedSensorComponents.Length;
    foreach (var component in attachedSensorComponents)
    {
        sensors.Add(component.CreateSensor());
    }

    // Support legacy CollectObservations
    var param = m_PolicyFactory.BrainParameters;
    if (param.VectorObservationSize > 0)
    {
        collectObservationsSensor = new VectorSensor(param.VectorObservationSize);
        if (param.NumStackedVectorObservations > 1)
        {
            var stackingSensor = new StackingSensor(
                collectObservationsSensor, param.NumStackedVectorObservations);
            sensors.Add(stackingSensor);
        }
        else
        {
            sensors.Add(collectObservationsSensor);
        }
    }

    // Sort the Sensors by name to ensure determinism. The comparison uses the invariant
    // culture so that the resulting order does not depend on the culture settings of the
    // machine the environment runs on.
    sensors.Sort((x, y) => string.Compare(x.GetName(), y.GetName(), StringComparison.InvariantCulture));

#if DEBUG
    // Make sure the names are actually unique
    for (var i = 0; i < sensors.Count - 1; i++)
    {
        Debug.Assert(
            !sensors[i].GetName().Equals(sensors[i + 1].GetName()),
            "Sensor names must be unique.");
    }
#endif
}

/// <summary>
/// Creates the ActuatorManager and registers the legacy VectorActuator followed by
/// one actuator per attached ActuatorComponent (optionally including children).
/// </summary>
void InitializeActuators()
{
    ActuatorComponent[] attachedActuators;
    if (m_PolicyFactory.UseChildActuators)
    {
        attachedActuators = GetComponentsInChildren<ActuatorComponent>();
    }
    else
    {
        attachedActuators = GetComponents<ActuatorComponent>();
    }

    // Support legacy OnActionReceived
    // TODO don't set this up if the sizes are 0?
    var param = m_PolicyFactory.BrainParameters;
    m_VectorActuator = new VectorActuator(this, param.ActionSpec);
    // One extra slot for the legacy VectorActuator.
    m_ActuatorManager = new ActuatorManager(attachedActuators.Length + 1);
    m_LegacyActionCache = new float[m_VectorActuator.TotalNumberOfActions()];

    m_ActuatorManager.Add(m_VectorActuator);

    foreach (var actuatorComponent in attachedActuators)
    {
        m_ActuatorManager.Add(actuatorComponent.CreateActuator());
    }
}

/// <summary>
/// Sends the Agent info to the linked Brain.
/// </summary>
/// <exception cref="UnityAgentsException">
/// Thrown when the Agent has not been initialized.
/// </exception>
void SendInfoToBrain()
{
    if (!m_Initialized)
    {
        // The two literals are concatenated, so the first must end with a space
        // to keep the sentences apart in the final message.
        throw new UnityAgentsException("Call to SendInfoToBrain when Agent hasn't been initialized. " +
            "Please ensure that you are calling 'base.OnEnable()' if you have overridden OnEnable.");
    }

    if (m_Brain == null)
    {
        return;
    }

    // A finished agent reports cleared actions; otherwise report the actions
    // currently stored by the actuator manager.
    if (m_Info.done)
    {
        m_Info.ClearActions();
    }
    else
    {
        m_Info.CopyActions(m_ActuatorManager.StoredActions);
    }

    UpdateSensors();
    using (TimerStack.Instance.Scoped("CollectObservations"))
    {
        using (m_CollectObservationsChecker.Start())
        {
            CollectObservations(collectObservationsSensor);
        }
    }
    using (TimerStack.Instance.Scoped("WriteActionMask"))
    {
        m_ActuatorManager.WriteActionMask();
    }

    m_Info.discreteActionMasks = m_ActuatorManager.DiscreteActionMask?.GetMask();
    m_Info.reward = m_Reward;
    m_Info.done = false;
    m_Info.maxStepReached = false;
    m_Info.episodeId = m_EpisodeId;

    using (TimerStack.Instance.Scoped("RequestDecision"))
    {
        m_Brain.RequestDecision(m_Info, sensors);
    }

    // If we have any DemonstrationWriters, write the AgentInfo and sensors to them.
    foreach (var demoWriter in DemonstrationWriters)
    {
        demoWriter.Record(m_Info, sensors);
    }
}

/// <summary>
/// Calls Update() on every sensor in the Agent's sensor list.
/// </summary>
void UpdateSensors()
{
    foreach (var sensor in sensors)
    {
        sensor.Update();
    }
}

/// <summary>
/// Calls Reset() on every sensor in the Agent's sensor list.
/// </summary>
void ResetSensors()
{
    foreach (var sensor in sensors)
    {
        sensor.Reset();
    }
}

/// <summary>
/// Implement `CollectObservations()` to collect the vector observations of
/// the agent for the step. The agent observation describes the current
/// environment from the perspective of the agent.
/// </summary>
/// <param name="sensor">
/// The vector observations for the agent.
/// </param>
/// <remarks>
/// An agent's observation is any environment information that helps
/// the agent achieve its goal. For example, for a fighting agent, its
/// observation could include distances to friends or enemies, or the
/// current level of ammunition at its disposal.
///
/// You can use a combination of vector, visual, and raycast observations for an
/// agent. If you only use visual or raycast observations, you do not need to
/// implement a `CollectObservations()` function.
///
/// Add vector observations to the <paramref name="sensor"/> parameter passed to
/// this method by calling its helper methods (see the <see cref="VectorSensor"/>
/// API for the available overloads).
///
/// You can use any combination of these helper functions to build the agent's
/// vector of observations. You must build the vector in the same order
/// each time `CollectObservations()` is called and the length of the vector
/// must always be the same. In addition, the length of the observation must
/// match the <see cref="BrainParameters.VectorObservationSize"/>
/// attribute of the linked Brain, which is set in the Editor on the
/// **Behavior Parameters** component attached to the agent's [GameObject].
///
/// For more information about observations, see [Observations and Sensors].
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// [Observations and Sensors]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#observations-and-sensors
/// </remarks>
public virtual void CollectObservations(VectorSensor sensor)
{
}

/// <summary>
/// Returns a read-only view of the observations that were generated in
/// <see cref="CollectObservations(VectorSensor)"/>. This is mainly useful inside of a
/// <see cref="Heuristic(in ActionBuffers)"/> method to avoid recomputing the observations.
/// </summary>
/// <returns>A read-only view of the observations list.</returns>
public ReadOnlyCollection<float> GetObservations()
{
    // NOTE(review): collectObservationsSensor is only assigned in InitializeSensors when
    // BrainParameters.VectorObservationSize > 0; confirm callers never reach this
    // dereference for agents without vector observations.
    return collectObservationsSensor.GetObservations();
}

/// <summary>
/// Implement `WriteDiscreteActionMask()` to collect the masks for discrete
/// actions. When using discrete actions, the agent will not perform the masked
/// action.
/// </summary>
/// <param name="actionMask">
/// The action mask for the agent.
/// </param>
/// <remarks>
/// When using Discrete Control, you can prevent the Agent from using a certain
/// action by masking it through the <paramref name="actionMask"/>.
///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
public virtual void WriteDiscreteActionMask(IDiscreteActionMask actionMask)
{
    // Create the DiscreteActionMasker around actionMask only once; an already
    // existing masker is kept as-is.
    m_ActionMasker = m_ActionMasker ?? new DiscreteActionMasker(actionMask);

    // Disable deprecation warnings so we can call the legacy overload.
#pragma warning disable CS0618
    CollectDiscreteActionMasks(m_ActionMasker);
#pragma warning restore CS0618
}

/// <summary>
/// Implement `OnActionReceived()` to specify agent behavior at every step, based
/// on the provided action.
/// </summary>
/// <remarks>
/// An action is passed to this function in the form of an <see cref="ActionBuffers"/>.
/// Your implementation must use the array to direct the agent's behavior for the
/// current step.
///
/// You decide how many elements you need in the ActionBuffers to control your
/// agent and what each element means. For example, if you want to apply a
/// force to move an agent around the environment, you can arbitrarily pick
/// three values in ActionBuffers.ContinuousActions array to use as the force components.
/// During training, the agent's policy learns to set those particular elements of
/// the array to maximize the training rewards the agent receives. (Of course,
/// if you implement a <see cref="Heuristic(in ActionBuffers)"/> function, it must use the same
/// elements of the action array for the same purpose since there is no learning
/// involved.)
///
/// An Agent can use continuous and/or discrete actions. Configure this along with the size
/// of the action array, in the <see cref="BrainParameters"/> of the agent's associated
/// <see cref="BehaviorParameters"/> component.
///
/// When an agent uses continuous actions, the values in the ActionBuffers.ContinuousActions
/// array are floating point numbers. You should clamp the values to the range,
/// -1..1, to increase numerical stability during training.
///
/// When an agent uses discrete actions, the values in the ActionBuffers.DiscreteActions array
/// are integers that each represent a specific, discrete action. For example,
/// you could define a set of discrete actions such as:
///
/// <code>
/// 0 = Do nothing
/// 1 = Move one space left
/// 2 = Move one space right
/// 3 = Move one space up
/// 4 = Move one space down
/// </code>
///
/// When making a decision, the agent picks one of the five actions and puts the
/// corresponding integer value in the ActionBuffers.DiscreteActions array. For example, if the agent
/// decided to move left, the ActionBuffers.DiscreteActions parameter would be an array with
/// a single element with the value 1.
///
/// You can define multiple sets, or branches, of discrete actions to allow an
/// agent to perform simultaneous, independent actions. For example, you could
/// use one branch for movement and another branch for throwing a ball left, right,
/// up, or down, to allow the agent to do both in the same step.
///
/// The ActionBuffers.DiscreteActions array of an agent with discrete actions contains one
/// element for each branch. The value of each element is the integer representing the
/// chosen action for that branch. The agent always chooses one action for each branch.
///
/// When you use the discrete actions, you can prevent the training process
/// or the neural network model from choosing specific actions in a step by
/// implementing the <see cref="WriteDiscreteActionMask(IDiscreteActionMask)"/>
/// method. For example, if your agent is next to a wall, you could mask out any
/// actions that would result in the agent trying to move into the wall.
///
/// For more information about implementing agent actions see [Agents - Actions].
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_11_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="actions">
/// Struct containing the buffers of actions to be executed at this step.
/// </param>
public virtual void OnActionReceived(ActionBuffers actions)
{
    var spec = m_PolicyFactory.BrainParameters.ActionSpec;

    // For continuous and discrete actions together, we don't need to fall back to the legacy method
    var usesBothActionKinds = spec.NumContinuousActions > 0 && spec.NumDiscreteActions > 0;
    if (usesBothActionKinds)
    {
        // Nothing implemented.
        return;
    }

    // The legacy overload takes a float[]: pass the continuous backing array when
    // there are continuous actions, otherwise a float copy of the discrete actions.
    m_LegacyActionCache = actions.ContinuousActions.IsEmpty()
        ? Array.ConvertAll(actions.DiscreteActions.Array, x => (float)x)
        : actions.ContinuousActions.Array;

    // Disable deprecation warnings so we can call the legacy overload.
#pragma warning disable CS0618
    OnActionReceived(m_LegacyActionCache);
#pragma warning restore CS0618
}

/// <summary>
/// Implement `OnEpisodeBegin()` to set up an Agent instance at the beginning
/// of an episode.
/// </summary>
public virtual void OnEpisodeBegin()
{
}

/// <summary>
/// Gets the most recent ActionBuffer for this agent.
/// </summary>
/// <returns>The most recent ActionBuffer for this agent</returns>
public ActionBuffers GetStoredActionBuffers()
{
    // The actuator manager holds the most recently stored actions.
    var latestActions = m_ActuatorManager.StoredActions;
    return latestActions;
}

/// <summary>
/// An internal reset method that updates internal data structures in
/// addition to calling <see cref="OnEpisodeBegin"/>.
/// </summary>
void _AgentReset()
{
    ResetData();
    m_StepCount = 0;

    // OnEpisodeBegin runs inside the scope opened by m_OnEpisodeBeginChecker.
    using (m_OnEpisodeBeginChecker.Start())
    {
        OnEpisodeBegin();
    }
}

/// <summary>
/// Scales continuous action from [-1, 1] to arbitrary range.
/// </summary>
/// <param name="rawAction">The input action value.</param>
/// <param name="min">The minimum output value.</param>
/// <param name="max">The maximum output value.</param>
/// <returns>The <paramref name="rawAction"/> scaled from [-1,1] to
/// [<paramref name="min"/>, <paramref name="max"/>].</returns>
protected static float ScaleAction(float rawAction, float min, float max)
{
    // Half the width of the target interval, and the interval's centre point.
    var halfSpan = (max - min) / 2;
    var centre = (min + max) / 2;

    // -1 maps to min, 0 to the centre and +1 to max.
    return rawAction * halfSpan + centre;
}

/// <summary>
/// Signals the agent that it must send its decision to the brain.
/// </summary>
void SendInfo()
{
    // If the Agent is done, it has just reset and thus requires a new decision
    if (!m_RequestDecision)
    {
        return;
    }

    SendInfoToBrain();
    m_Reward = 0f;
    m_RequestDecision = false;
}

/// <summary>
/// Advances the Agent's step counter by one.
/// </summary>
void AgentIncrementStep()
{
    m_StepCount++;
}

/// Used by the brain to make the agent perform a step.
void AgentStep()
{
    // Execute the pending actions only when an action was requested and a brain is linked.
    var shouldAct = m_RequestAction && m_Brain != null;
    if (shouldAct)
    {
        m_RequestAction = false;
        m_ActuatorManager.ExecuteActions();
    }

    // The step limit only applies when MaxStep is positive.
    var stepLimitReached = m_StepCount >= MaxStep && MaxStep > 0;
    if (stepLimitReached)
    {
        NotifyAgentDone(DoneReason.MaxStepReached);
        _AgentReset();
    }
}

/// <summary>
/// Asks the brain for its decision and hands the resulting actions to the
/// agent info and to the actuator manager.
/// </summary>
void DecideAction()
{
    // No continuous backing array yet: reset the agent's data before deciding.
    if (m_ActuatorManager.StoredActions.ContinuousActions.Array == null)
    {
        ResetData();
    }

    // Without a brain, fall back to a default (empty) ActionBuffers.
    var decidedActions = m_Brain?.DecideAction() ?? new ActionBuffers();
    m_Info.CopyActions(decidedActions);
    m_ActuatorManager.UpdateActions(decidedActions);
}
}
}