Merge remote-tracking branch 'origin/master' into develop-BehaviorParams-public

4 年前 · fa5e7e6d
--- a/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
    Rigidbody m_BallRb;
    FloatPropertiesChannel m_ResetParams;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_BallRb = ball.GetComponent<Rigidbody>();
        m_ResetParams = Academy.Instance.FloatProperties;
        sensor.AddObservation(m_BallRb.velocity);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
        var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);
            Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
        {
            SetReward(-1f);
-            Done();
+            EndEpisode();
        }
        else
        {

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);
        gameObject.transform.Rotate(new Vector3(1, 0, 0), Random.Range(-10f, 10f));
--- a/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs
    Rigidbody m_BallRb;
    FloatPropertiesChannel m_ResetParams;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_BallRb = ball.GetComponent<Rigidbody>();
        m_ResetParams = Academy.Instance.FloatProperties;
        sensor.AddObservation((ball.transform.position - gameObject.transform.position));
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
        var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);
            Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
        {
            SetReward(-1f);
-            Done();
+            EndEpisode();
        }
        else
        {

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);
        gameObject.transform.Rotate(new Vector3(1, 0, 0), Random.Range(-10f, 10f));
--- a/Project/Assets/ML-Agents/Examples/Basic/Scripts/BasicController.cs
+++ b/Project/Assets/ML-Agents/Examples/Basic/Scripts/BasicController.cs
        if (m_Position == k_SmallGoalPosition)
        {
            m_Agent.AddReward(0.1f);
-            m_Agent.Done();
+            m_Agent.EndEpisode();
            ResetAgent();
        }

-            m_Agent.Done();
+            m_Agent.EndEpisode();
            ResetAgent();
        }
    }
--- a/Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs

    FloatPropertiesChannel m_ResetParams;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_Rb = gameObject.GetComponent<Rigidbody>();
        m_LookDir = Vector3.zero;
        sensor.AddObservation(target.transform.localPosition);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        for (var i = 0; i < vectorAction.Length; i++)
        {
        m_LookDir = new Vector3(x, y, z);
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        gameObject.transform.localPosition = new Vector3(
            (1 - 2 * Random.value) * 5, 2, (1 - 2 * Random.value) * 5);
        if (gameObject.transform.position.y < -1)
        {
            AddReward(-1);
-            Done();
+            EndEpisode();
            return;
        }

            AddReward(-1);
-            Done();
+            EndEpisode();
-            Done();
+            EndEpisode();
        }
    }

--- a/Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs
    Quaternion m_LookRotation;
    Matrix4x4 m_TargetDirMatrix;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_JdController = GetComponent<JointDriveController>();
        m_DirToTarget = target.position - body.position;
        target.position = newTargetPos + ground.position;
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        // The dictionary with all the body parts in it are in the jdController
        var bpDict = m_JdController.bodyPartsDict;
    /// <summary>
    /// Loop over body parts and reset them to initial conditions.
    /// </summary>
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        if (m_DirToTarget != Vector3.zero)
        {
--- a/Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs
    public bool useVectorObs;


-    public override void InitializeAgent()
+    public override void Initialize()
-        base.InitializeAgent();
        m_AgentRb = GetComponent<Rigidbody>();
        m_MyArea = area.GetComponent<FoodCollectorArea>();
        m_FoodCollecterSettings = FindObjectOfType<FoodCollectorSettings>();
        gameObject.GetComponentInChildren<Renderer>().material = normalMaterial;
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        MoveAgent(vectorAction);
    }
        return action;
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        Unfreeze();
        Unpoison();
--- a/Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo
+++ b/Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo
--- a/Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs
    const int k_Left = 3;
    const int k_Right = 4;

-    public override void InitializeAgent()
-    {
-    }
-
    public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker)
    {
        // Mask the necessary actions if selected by the user.
    }

    // to be implemented by the developer
-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        AddReward(-0.01f);
        var action = Mathf.FloorToInt(vectorAction[0]);
            if (hit.Where(col => col.gameObject.CompareTag("goal")).ToArray().Length == 1)
            {
                SetReward(1f);
-                Done();
+                EndEpisode();
-                Done();
+                EndEpisode();
            }
        }
    }
    }

    // to be implemented by the developer
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        area.AreaReset();
    }
--- a/Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs
    HallwaySettings m_HallwaySettings;
    int m_Selection;

-    public override void InitializeAgent()
+    public override void Initialize()
-        base.InitializeAgent();
        m_HallwaySettings = FindObjectOfType<HallwaySettings>();
        m_AgentRb = GetComponent<Rigidbody>();
        m_GroundRenderer = ground.GetComponent<Renderer>();
        m_AgentRb.AddForce(dirToGo * m_HallwaySettings.agentRunSpeed, ForceMode.VelocityChange);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        AddReward(-1f / maxStep);
        MoveAgent(vectorAction);
                SetReward(-0.1f);
                StartCoroutine(GoalScoredSwapGroundMaterial(m_HallwaySettings.failMaterial, 0.5f));
            }
-            Done();
+            EndEpisode();
        }
    }

        return new float[] { 0 };
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        var agentOffset = -15f;
        var blockOffset = 0f;
--- a/Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs
+++ b/Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs
        m_PushBlockSettings = FindObjectOfType<PushBlockSettings>();
    }

-    public override void InitializeAgent()
+    public override void Initialize()
-        base.InitializeAgent();
        goalDetect = block.GetComponent<GoalDetect>();
        goalDetect.agent = this;

        AddReward(5f);

        // Ending the episode causes OnEpisodeBegin() to be called automatically.
-        Done();
+        EndEpisode();

        // Swap ground material for a bit to indicate we scored.
        StartCoroutine(GoalScoredSwapGroundMaterial(m_PushBlockSettings.goalScoredMaterial, 0.5f));
    /// <summary>
    /// Called every step of the engine. Here the agent takes an action.
    /// </summary>
-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        // Move the agent using the action.
        MoveAgent(vectorAction);
    /// In the editor, if "Reset On Done" is checked then OnEpisodeBegin() will be
    /// called automatically anytime we call EndEpisode() in an agent script.
    /// </summary>
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        var rotation = Random.Range(0, 4);
        var rotationAngle = rotation * 90f;
--- a/Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs
    public GameObject areaSwitch;
    public bool useVectorObs;

-    public override void InitializeAgent()
+    public override void Initialize()
-        base.InitializeAgent();
        m_AgentRb = GetComponent<Rigidbody>();
        m_MyArea = area.GetComponent<PyramidArea>();
        m_SwitchLogic = areaSwitch.GetComponent<PyramidSwitch>();
        m_AgentRb.AddForce(dirToGo * 2f, ForceMode.VelocityChange);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        AddReward(-1f / maxStep);
        MoveAgent(vectorAction);
        return new float[] { 0 };
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        var enumerable = Enumerable.Range(0, 9).OrderBy(x => Guid.NewGuid()).Take(9);
        var items = enumerable.ToArray();
        if (collision.gameObject.CompareTag("goal"))
        {
            SetReward(2f);
-            Done();
+            EndEpisode();
        }
    }
 }
--- a/Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs
    /// Collect the rigidbodies of the reacher in order to reuse them for
    /// observations and actions.
    /// </summary>
-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_RbA = pendulumA.GetComponent<Rigidbody>();
        m_RbB = pendulumB.GetComponent<Rigidbody>();
    /// <summary>
    /// The agent's four actions correspond to torques on each of the two joints.
    /// </summary>
-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        m_GoalDegree += m_GoalSpeed;
        UpdateGoalPosition();
    /// <summary>
    /// Resets the position and velocity of the agent and the goal.
    /// </summary>
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        pendulumA.transform.position = new Vector3(0f, -4f, 0f) + transform.position;
        pendulumA.transform.rotation = Quaternion.Euler(180f, 0f, 0f);
--- a/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/GroundContact.cs
+++ b/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/GroundContact.cs

                if (agentDoneOnGroundContact)
                {
-                    agent.Done();
+                    agent.EndEpisode();
                }
            }
        }
--- a/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs
+++ b/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs
            var nnModel = GetModelForBehaviorName(name);
            Debug.Log($"Overriding behavior {name} for agent with model {nnModel?.name}");
            // This might give a null model; that's better because we'll fall back to the Heuristic
-            m_Agent.GiveModel($"Override_{name}", nnModel);
+            m_Agent.SetModel($"Override_{name}", nnModel);

        }
    }
--- a/Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs
+++ b/Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs
    BehaviorParameters m_BehaviorParameters;
    Vector3 m_Transform;

-    public override void InitializeAgent()
+    public override void Initialize()
-        base.InitializeAgent();
        m_BehaviorParameters = gameObject.GetComponent<BehaviorParameters>();
        if (m_BehaviorParameters.TeamId == (int)Team.Blue)
        {
            ForceMode.VelocityChange);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        // Existential penalty for strikers.
        AddReward(-1f / 3000f);
        }
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        if (team == Team.Purple)
        {
--- a/Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs
+++ b/Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs
            {
                ps.agentScript.AddReward(-1);
            }
-            ps.agentScript.Done();  //all agents need to be reset
+            ps.agentScript.EndEpisode();  //all agents need to be reset

            if (goalTextUI)
            {
--- a/Project/Assets/ML-Agents/Examples/Template/Scripts/TemplateAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Template/Scripts/TemplateAgent.cs
    {
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
    }
 }
--- a/Project/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs
+++ b/Project/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs

    void Reset()
    {
-        m_AgentA.Done();
-        m_AgentB.Done();
+        m_AgentA.EndEpisode();
+        m_AgentB.EndEpisode();
        m_Area.MatchReset();
        lastFloorHit = FloorHit.Service;
        net = false;
--- a/Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
    const string k_ScoreBoardAName = "ScoreA";
    const string k_ScoreBoardBName = "ScoreB";

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_AgentRb = GetComponent<Rigidbody>();
        m_BallRb = ball.GetComponent<Rigidbody>();
        sensor.AddObservation(m_InvertMult * gameObject.transform.rotation.z);
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        var moveX = Mathf.Clamp(vectorAction[0], -1f, 1f) * m_InvertMult;
        var moveY = Mathf.Clamp(vectorAction[1], -1f, 1f);
        return action;
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        m_InvertMult = invertX ? -1f : 1f;

--- a/Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs

    FloatPropertiesChannel m_ResetParams;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_JdController = GetComponent<JointDriveController>();
        m_JdController.SetupBodyPart(hips);
        }
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        var bpDict = m_JdController.bodyPartsDict;
        var i = -1;
    /// <summary>
    /// Loop over body parts and reset them to initial conditions.
    /// </summary>
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        if (m_DirToTarget != Vector3.zero)
        {
--- a/Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs
    Vector3 m_JumpTargetPos;
    Vector3 m_JumpStartingPos;

-    public override void InitializeAgent()
+    public override void Initialize()
    {
        m_WallJumpSettings = FindObjectOfType<WallJumpSettings>();
        m_Configuration = Random.Range(0, 5);
        jumpingTime -= Time.fixedDeltaTime;
    }

-    public override void AgentAction(float[] vectorAction)
+    public override void OnActionReceived(float[] vectorAction)
    {
        MoveAgent(vectorAction);
        if ((!Physics.Raycast(m_AgentRb.position, Vector3.down, 20))
-            Done();
+            EndEpisode();
            ResetBlock(m_ShortBlockRb);
            StartCoroutine(
                GoalScoredSwapGroundMaterial(m_WallJumpSettings.failMaterial, .5f));
        if (col.gameObject.CompareTag("goal") && DoGroundCheck(true))
        {
            SetReward(1f);
-            Done();
+            EndEpisode();
            StartCoroutine(
                GoalScoredSwapGroundMaterial(m_WallJumpSettings.goalScoredMaterial, 2));
        }
        blockRb.angularVelocity = Vector3.zero;
    }

-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        ResetBlock(m_ShortBlockRb);
        transform.localPosition = new Vector3(
                Academy.Instance.FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
                localScale.z);
            wall.transform.localScale = localScale;
-            GiveModel("SmallWallJump", noWallBrain);
+            SetModel("SmallWallJump", noWallBrain);
        }
        else if (config == 1)
        {
                localScale.z);
            wall.transform.localScale = localScale;
-            GiveModel("SmallWallJump", smallWallBrain);
+            SetModel("SmallWallJump", smallWallBrain);
        }
        else
        {
                height,
                localScale.z);
            wall.transform.localScale = localScale;
-            GiveModel("BigWallJump", bigWallBrain);
+            SetModel("BigWallJump", bigWallBrain);
        }
    }
 }
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
 - Multi-GPU training and the `--multi-gpu` option has been removed temporarily. (#3345)
 - All Sensor related code has been moved to the namespace `MLAgents.Sensors`.
 - All SideChannel related code has been moved to the namespace `MLAgents.SideChannels`.
+ - `BrainParameters` and `SpaceType` have been removed from the public API.
+ - `BehaviorParameters` have been removed from the public API.
+ - The following methods in the `Agent` class have been deprecated and will be removed in a later release:
+   - `InitializeAgent()` was renamed to `Initialize()`
+   - `AgentAction()` was renamed to `OnActionReceived()`
+   - `AgentReset()` was renamed to `OnEpisodeBegin()`
+   - `Done()` was renamed to `EndEpisode()`
+   - `GiveModel()` was renamed to `SetModel()`

 ### Minor Changes
 - Monitor.cs was moved to Examples. (#3372)
 - `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
 - The `IFloatProperties` interface has been removed.
 - Fix #3579.
+ - Fixed an issue when using GAIL with fewer than `batch_size` demonstrations. (#3591)

 ## [0.14.1-preview] - 2020-02-25

--- a/com.unity.ml-agents/Runtime/Agent.cs
+++ b/com.unity.ml-agents/Runtime/Agent.cs
            Academy.Instance.AgentForceReset += _AgentReset;
            m_Brain = m_PolicyFactory.GeneratePolicy(Heuristic);
            ResetData();
-            InitializeAgent();
+            Initialize();
            InitializeSensors();
        }

            m_RequestDecision = false;
        }

+        [Obsolete("GiveModel() has been deprecated, use SetModel() instead.")]
+        public void GiveModel(
+            string behaviorName,
+            NNModel model,
+            InferenceDevice inferenceDevice = InferenceDevice.CPU)
+        {
+            SetModel(behaviorName, model, inferenceDevice);
+        }
+
        /// <summary>
        /// Updates the Model for the agent. Any model currently assigned to the
        /// agent will be replaced with the provided one. If the arguments are
        /// <param name="model"> The model to use for inference.</param>
        /// <param name = "inferenceDevice"> Define on what device the model
        /// will be run.</param>
-        public void GiveModel(
+        public void SetModel(
            string behaviorName,
            NNModel model,
            InferenceDevice inferenceDevice = InferenceDevice.CPU)
            TimerStack.Instance.SetGauge(gaugeName, GetCumulativeReward());
        }

+        [Obsolete("Done() has been deprecated, use EndEpisode() instead.")]
+        public void Done()
+        {
+            EndEpisode();
+        }
+
-        public void Done()
+        public void EndEpisode()
        {
            NotifyAgentDone(DoneReason.DoneCalled);
            _AgentReset();
            }
        }

+        [Obsolete("InitializeAgent() has been deprecated, use Initialize() instead.")]
+        public virtual void InitializeAgent()
+        {
+        }
+
        /// <summary>
        /// Initializes the agent, called once when the agent is enabled. Can be
        /// left empty if there is no special, unique set-up behavior for the
        /// One sample use is to store local references to other objects in the
        /// scene which would facilitate computing this agents observation.
        /// </remarks>
-        public virtual void InitializeAgent()
+        public virtual void Initialize()
+#pragma warning disable 0618
+            InitializeAgent();
+#pragma warning restore 0618
        }

        /// <summary>
        {
        }

+        [Obsolete("AgentAction() has been deprecated, use OnActionReceived() instead.")]
+        public virtual void AgentAction(float[] vectorAction)
+        {
+        }
+
        /// <summary>
        /// Specifies the agent behavior at every step based on the provided
        /// action.
        /// will be of length 1.
        /// </param>
-        public virtual void AgentAction(float[] vectorAction)
+        public virtual void OnActionReceived(float[] vectorAction)
+        {
+            // Backward-compatibility shim: the default implementation forwards
+            // to the deprecated AgentAction() so that subclasses which still
+            // override the old method keep working. Forward the action that was
+            // actually received (the parameter) rather than re-reading
+            // m_Action.vectorActions, so a caller that supplies a different
+            // array is not silently ignored.
+#pragma warning disable 0618
+            AgentAction(vectorAction);
+#pragma warning restore 0618
+        }
+
+        [Obsolete("AgentReset() has been deprecated, use OnEpisodeBegin() instead.")]
+        public virtual void AgentReset()
        {
        }

        /// episode).
        /// </summary>
-        public virtual void AgentReset()
+        public virtual void OnEpisodeBegin()
+#pragma warning disable 0618
+            AgentReset();
+#pragma warning restore 0618
        }

        /// <summary>
        {
            ResetData();
            m_StepCount = 0;
-            AgentReset();
+            OnEpisodeBegin();
        }

        /// <summary>
            if ((m_RequestAction) && (m_Brain != null))
            {
                m_RequestAction = false;
-                AgentAction(m_Action.vectorActions);
+                OnActionReceived(m_Action.vectorActions);
+
            }

            if ((m_StepCount >= maxStep) && (maxStep > 0))
--- a/com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs
+++ b/com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs
        public TestSensor sensor1;
        public TestSensor sensor2;

-        public override void InitializeAgent()
+        public override void Initialize()
        {
            initializeAgentCalls += 1;

            sensor.AddObservation(0f);
        }

-        public override void AgentAction(float[] vectorAction)
+        public override void OnActionReceived(float[] vectorAction)
        {
            agentActionCalls += 1;
            agentActionCallsSinceLastReset += 1;
-        public override void AgentReset()
+        public override void OnEpisodeBegin()
        {
            agentResetCalls += 1;
            collectObservationsCallsSinceLastReset = 0;
                // Set agent 1 to done every 11 steps to test behavior
                if (i % 11 == 5)
                {
-                    agent1.Done();
+                    agent1.EndEpisode();
-                    agent2.Done();
+                    agent2.EndEpisode();
                    numberAgent2Reset += 1;
                    agent2StepSinceReset = 0;
                }
--- a/docs/Getting-Started-with-Balance-Ball.md
+++ b/docs/Getting-Started-with-Balance-Ball.md
 * **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
  determines how an Agent makes decisions. More on Behavior Parameters in
  the next section.
-* **Max Step** — Defines how many simulation steps can occur before the Agent
-  decides it is done. In 3D Balance Ball, an Agent restarts after 5000 steps.
+* **Max Step** — Defines how many simulation steps can occur before the Agent's
+  episode ends. In 3D Balance Ball, an Agent restarts after 5000 steps.
-Perhaps the more interesting aspect of an agents is the Agent subclass
-implementation. When you create an Agent, you must extend the base Agent class.
+When you create an Agent, you must extend the base Agent class.
-* agent.AgentReset() — Called when the Agent resets, including at the beginning
-  of a session. The Ball3DAgent class uses the reset function to reset the
+* `Agent.OnEpisodeBegin()` — Called when the Agent resets, including at the beginning
+  of the simulation. The Ball3DAgent class uses the reset function to reset the
-* agent.CollectObservations(VectorSensor sensor) — Called every simulation step. Responsible for
+* `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation step. Responsible for
-* agent.AgentAction() — Called every simulation step. Receives the action chosen
+* `Agent.OnActionReceived()` — Called every time the Agent receives an action to take. Receives the action chosen
-  small change in the agent cube's rotation at each step. The `AgentAction()` function
+  small change in the agent cube's rotation at each step. The `OnActionReceived()` method
-  negative reward for dropping the ball. An Agent is also marked as done when it
+  negative reward for dropping the ball. An Agent's episode is also ended when it
-* agent.Heuristic() - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
+* `Agent.Heuristic()` - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
  Parameters of the Agent, the Agent will use the `Heuristic()` method to generate
  the actions of the Agent. As such, the `Heuristic()` method returns an array of
  floats. In the case of the Ball 3D Agent, the `Heuristic()` method converts the
--- a/docs/Learning-Environment-Create-New.md
+++ b/docs/Learning-Environment-Create-New.md

 ### Initialization and Resetting the Agent

-When the Agent reaches its target, it marks itself done and its Agent reset
-function moves the target to a random location. In addition, if the Agent rolls
-off the platform, the reset function puts it back onto the floor.
+When the Agent reaches its target, its episode ends and the `OnEpisodeBegin()`
+method moves the target to a random location. In addition, if the Agent rolls
+off the platform, the `OnEpisodeBegin()` method puts it back onto the floor.

 To move the target GameObject, we need a reference to its Transform (which
 stores a GameObject's position, orientation and scale in the 3D world). To get
    }

    public Transform Target;
-    public override void AgentReset()
+    public override void OnEpisodeBegin()
    {
        if (this.transform.position.y < 0)
        {
 }
 ```

-The final part of the Agent code is the `Agent.AgentAction()` method, which
-receives the decision from the Brain and assigns the reward.
+The final part of the Agent code is the `Agent.OnActionReceived()` method, which
+receives the actions from the Brain and assigns the reward.
-`AgentAction()` function. The number of elements in this array is determined by
+`OnActionReceived()` function. The number of elements in this array is determined by
 the `Vector Action` `Space Type` and `Space Size` settings of the
 agent's Brain. The RollerAgent uses the continuous vector action space and needs
 two continuous control signals from the Brain. Thus, we will set the Brain

 ### Rewards

-Reinforcement learning requires rewards. Assign rewards in the `AgentAction()`
+Reinforcement learning requires rewards. Assign rewards in the `OnActionReceived()`
 function. The learning algorithm uses the rewards assigned to the Agent during
 the simulation and learning process to determine whether it is giving
 the Agent the optimal actions. You want to reward an Agent for completing the
 The RollerAgent calculates the distance to detect when it reaches the target.
 When it does, the code calls the `Agent.SetReward()` method to assign a
-reward of 1.0 and marks the agent as finished by calling the `Done()` method
+reward of 1.0 and marks the agent as finished by calling the `EndEpisode()` method
 on the Agent.

 ```csharp
 if (distanceToTarget < 1.42f)
 {
    SetReward(1.0f);
-    Done();
+    EndEpisode();
-Finally, if the Agent falls off the platform, set the Agent to done so that it can reset itself:
+Finally, if the Agent falls off the platform, end the episode so that it can reset itself:
-    Done();
+    EndEpisode();
-### AgentAction()
+### OnActionReceived()
-`AgentAction()` function looks like:
+`OnActionReceived()` function looks like:
-public override void AgentAction(float[] vectorAction)
+public override void OnActionReceived(float[] vectorAction)
 {
    // Actions, size = 2
    Vector3 controlSignal = Vector3.zero;
    if (distanceToTarget < 1.42f)
    {
        SetReward(1.0f);
-        Done();
+        EndEpisode();
-        Done();
+        EndEpisode();
    }

 }
--- a/docs/Learning-Environment-Design-Agents.md
+++ b/docs/Learning-Environment-Design-Agents.md

 An action is an instruction from the Policy that the agent carries out. The
 action is passed to the Agent as a parameter when the Academy invokes the
-agent's `AgentAction()` function. When you specify that the vector action space
+agent's `OnActionReceived()` function. When you specify that the vector action space
 is **Continuous**, the action parameter passed to the Agent is an array of
 control signals with length equal to the `Vector Action Space Size` property.
 When you specify a **Discrete** vector action space type, the action parameter
 values themselves mean. The training algorithm simply tries different values for
 the action list and observes the effect on the accumulated rewards over time and
 many training episodes. Thus, the only place actions are defined for an Agent is
-in the `AgentAction()` function. You simply specify the type of vector action
-space, and, for the continuous vector action space, the number of values, and
-then apply the received values appropriately (and consistently) in
-`ActionAct()`.
+in the `OnActionReceived()` function.

 For example, if you designed an agent to move in two dimensions, you could use
 either continuous or the discrete vector actions. In the continuous case, you
 ### Continuous Action Space

 When an Agent uses a Policy set to the **Continuous** vector action space, the
-action parameter passed to the Agent's `AgentAction()` function is an array with
+action parameter passed to the Agent's `OnActionReceived()` function is an array with
 length equal to the `Vector Action Space Size` property value.
 The individual values in the array have whatever meanings that you ascribe to
 them. If you assign an element in the array as the speed of an Agent, for
 These control values are applied as torques to the bodies making up the arm:

 ```csharp
-public override void AgentAction(float[] act)
+public override void OnActionReceived(float[] act)
 {
    float torque_x = Mathf.Clamp(act[0], -1, 1) * 100f;
    float torque_z = Mathf.Clamp(act[1], -1, 1) * 100f;
 ### Discrete Action Space

 When an Agent uses a **Discrete** vector action space, the
-action parameter passed to the Agent's `AgentAction()` function is an array
+action parameter passed to the Agent's `OnActionReceived()` function is an array
 containing indices. With the discrete vector action space, `Branches` is an
 array of integers, each value corresponds to the number of possibilities for
 each branch.
 agent be able to move __and__ jump concurrently. We define the first branch to
 have 5 possible actions (don't move, go left, go right, go backward, go forward)
 and the second one to have 2 possible actions (don't jump, jump). The
-AgentAction method would look something like:
+`OnActionReceived()` method would look something like:

 ```csharp
 // Get the action index for movement
 Agent's Heuristic to control the Agent while watching how it accumulates rewards.

 Allocate rewards to an Agent by calling the `AddReward()` method in the
-`AgentAction()` function. The reward assigned between each decision
+`OnActionReceived()` function. The reward assigned between each decision
 should be in the range [-1,1]. Values outside this range can lead to
 unstable training. The `reward` value is reset to zero when the agent receives a
 new decision. If there are multiple calls to `AddReward()` for a single agent

 ### Examples

-You can examine the `AgentAction()` functions defined in the [example
+You can examine the `OnActionReceived()` functions defined in the [example
 environments](Learning-Environment-Examples.md) to see how those projects
 allocate rewards.

 if (hitObjects.Where(col => col.gameObject.tag == "goal").ToArray().Length == 1)
 {
    AddReward(1.0f);
-    Done();
+    EndEpisode();
-    Done();
+    EndEpisode();
 }
 ```

    Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
    Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)
 {
-    Done();
+    EndEpisode();
    AddReward(-1f);
 }
 ```
 platform:

 ```csharp
-if (IsDone() == false)
-{
-    SetReward(0.1f);
-}
+
+SetReward(0.1f);
-// When ball falls mark Agent as done and give a negative penalty
+// When ball falls mark Agent as finished and give a negative penalty
-    Done();
+    EndEpisode();
+
 }
 ```

-Note that all of these environments make use of the `Done()` method, which manually
+Note that all of these environments make use of the `EndEpisode()` method, which manually
 terminates an episode when a termination condition is reached. This can be
 called independently of the `Max Step` property.

--- a/docs/Learning-Environment-Design.md
+++ b/docs/Learning-Environment-Design.md

 Training and simulation proceed in steps orchestrated by the ML-Agents Academy
 class. The Academy works with Agent objects in the scene to step
-through the simulation. When all Agents in the scene are _done_,
-one training episode is finished.
+through the simulation.

 During training, the external Python training process communicates with the
 Academy to run a series of episodes while it collects data and optimizes its
 The ML-Agents Academy class orchestrates the agent simulation loop as follows:

 1. Calls your Academy's `OnEnvironmentReset` delegate.
-2. Calls the `AgentReset()` function for each Agent in the scene.
+2. Calls the `OnEpisodeBegin()` function for each Agent in the scene.
-5. Calls the `AgentAction()` function for each Agent in the scene, passing in
-   the action chosen by the Agent's Policy. (This function is not called if the
-   Agent is done.)
-6. Calls the Agent's `AgentReset()` function if the Agent has reached its `Max
-   Step` count or has otherwise marked itself as `done`.
+5. Calls the `OnActionReceived()` function for each Agent in the scene, passing in
+   the action chosen by the Agent's Policy.
+6. Calls the Agent's `OnEpisodeBegin()` function if the Agent has reached its `Max
+   Step` count or has otherwise ended its episode by calling `EndEpisode()`.
-implement the above methods. The `Agent.CollectObservations(VectorSensor sensor)` and
-`Agent.AgentAction()` functions are required; the other methods are optional —
-whether you need to implement them or not depends on your specific scenario.
+implement the above methods. Whether you need to implement them or not depends on
+your specific scenario.

 **Note:** The API used by the Python training process to communicate with
 and control the Academy during training can be used for other purposes as well.
 have appropriate `Behavior Parameters`.

 To create an Agent, extend the Agent class and implement the essential
-`CollectObservations(VectorSensor sensor)` and `AgentAction()` methods:
+`CollectObservations(VectorSensor sensor)` and `OnActionReceived()` methods:
-* `AgentAction()` — Carries out the action chosen by the Agent's Policy and
+* `OnActionReceived()` — Carries out the action chosen by the Agent's Policy and
  assigns a reward to the current state.

 Your implementations of these functions determine how the Behavior Parameters
-manually set an Agent to done in your `AgentAction()` function when the Agent
-has finished (or irrevocably failed) its task by calling the `Done()` function.
+manually terminate an Agent episode in your `OnActionReceived()` function when the Agent
+has finished (or irrevocably failed) its task by calling the `EndEpisode()` function.
-Agent will consider itself done after it has taken that many steps. You can
-use the `Agent.AgentReset()` function to prepare the Agent to start again.
+Agent will consider the episode over after it has taken that many steps. You can
+use the `Agent.OnEpisodeBegin()` function to prepare the Agent to start again.

 See [Agents](Learning-Environment-Design-Agents.md) for detailed information
 about programming your own Agents.
 * The Academy must reset the scene to a valid starting point for each episode of
  training.
 * A training episode must have a definite end — either using `Max Steps` or by
-  each Agent setting itself to `done`.
+  each Agent ending its episode manually with `EndEpisode()`.
--- a/docs/Migrating.md
+++ b/docs/Migrating.md
 * `BrainParameters` and `SpaceType` have been removed from the public API
 * `BehaviorParameters` have been removed from the public API.
 * `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity.
+* The following methods in the `Agent` class have been renamed. The original method names will be removed in a later release:
+  * `InitializeAgent()` was renamed to `Initialize()`
+  * `AgentAction()` was renamed to `OnActionReceived()`
+  * `AgentReset()` was renamed to `OnEpisodeBegin()`
+  * `Done()` was renamed to `EndEpisode()`
+  * `GiveModel()` was renamed to `SetModel()`
 * The `IFloatProperties` interface has been removed.

 ### Steps to Migrate
 * If you call `RayPerceptionSensor.PerceiveStatic()` manually, add your inputs to a `RayPerceptionInput`. To get the previous float array output,
 iterate through `RayPerceptionOutput.rayOutputs` and call `RayPerceptionOutput.RayOutput.ToFloatArray()`.
-* Replace all calls to `Agent.GetStepCount()` with `Agent.StepCount`.
+* Replace all calls to `Agent.GetStepCount()` with `Agent.StepCount`.
+* We strongly recommend replacing the following methods with their new equivalent as they will be removed in a later release:
+  * `InitializeAgent()` to `Initialize()`
+  * `AgentAction()` to `OnActionReceived()`
+  * `AgentReset()` to `OnEpisodeBegin()`
+  * `Done()` to `EndEpisode()`
+  * `GiveModel()` to `SetModel()`
 * Replace `IFloatProperties` variables with `FloatPropertiesChannel` variables.

 ## Migrating from 0.13 to 0.14
--- a/docs/Python-API.md
+++ b/docs/Python-API.md
 allows you to interact directly with a Unity Environment (`mlagents_envs`) and
 an entry point to train (`mlagents-learn`) which allows you to train agents in
 Unity Environments using our implementations of reinforcement learning or
-imitation learning.
+imitation learning. This document describes how to use the `mlagents_envs` API.
+For information on using `mlagents-learn`, see [here](Training-ML-Agents.md).
-You can use the Python Low Level API to interact directly with your learning
-environment, and use it to develop new learning algorithms.
+The Python Low Level API can be used to interact directly with your Unity learning environment.
+As such, it can serve as the basis for developing and evaluating new learning algorithms.

 ## mlagents_envs

 Python-side communication happens through `UnityEnvironment` which is located in
 [`environment.py`](../ml-agents-envs/mlagents_envs/environment.py). To load
 a Unity environment from a built binary file, put the file in the same directory
-as `envs`. For example, if the filename of your Unity environment is 3DBall.app, in python, run:
+as `envs`. For example, if the filename of your Unity environment is `3DBall`, in Python, run:

 ```python
 from mlagents_envs.environment import UnityEnvironment
 `discrete_action_branches = (3,2,)`)


-### Modifying the environment from Python
-The Environment can be modified by using side channels to send data to the
-environment. When creating the environment, pass a list of side channels as
-`side_channels` argument to the constructor.
+### Communicating additional information with the Environment
+In addition to the means of communicating between Unity and Python described above,
+we also provide methods for sharing agent-agnostic information. These
+additional methods are referred to as side channels. ML-Agents includes two ready-made
+side channels, described below. It is also possible to create custom side channels to
+communicate any additional data between a Unity environment and Python. Instructions for
+creating custom side channels can be found [here](Custom-SideChannels.md).
+
+Side channels exist as separate classes which are instantiated, and then passed as a list to the `side_channels` argument of the constructor of the `UnityEnvironment` class.
+
+```python
+channel = MyChannel()
+
+env = UnityEnvironment(side_channels = [channel])
+```
-__Note__ : A side channel will only send/receive messages when `env.step` is
+__Note__ : A side channel will only send/receive messages when `env.step()` or `env.reset()` is
-An `EngineConfiguration` will allow you to modify the time scale and graphics quality of the Unity engine.
+The `EngineConfiguration` side channel allows you to modify the time-scale, resolution, and graphics quality of the environment. This can be useful for adjusting the environment to perform better during training, or be more interpretable during inference.
+
- * `set_configuration_parameters` with arguments
-   * width: Defines the width of the display. Default 80.
-   * height: Defines the height of the display. Default 80.
-   * quality_level: Defines the quality level of the simulation. Default 1.
-   * time_scale: Defines the multiplier for the deltatime in the simulation. If set to a higher value, time will pass faster in the simulation but the physics might break. Default 20.
-   *  target_frame_rate: Instructs simulation to try to render at a specified frame rate. Default -1.
+ * `set_configuration_parameters` which takes the following arguments:
+   * `width`: Defines the width of the display. Default 80.
+   * `height`: Defines the height of the display. Default 80.
+   * `quality_level`: Defines the quality level of the simulation. Default 1.
+   * `time_scale`: Defines the multiplier for the deltatime in the simulation. If set to a higher value, time will pass faster in the simulation but the physics may perform unpredictably. Default 20.
+   * `target_frame_rate`: Instructs simulation to try to render at a specified frame rate. Default -1.
-For example :
+For example, the following code would adjust the time-scale of the simulation to be 2x realtime.
+
 ```python
 from mlagents_envs.environment import UnityEnvironment
 from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
 ```

 #### FloatPropertiesChannel
-A `FloatPropertiesChannel` will allow you to get and set float properties
-in the environment. You can call get_property and set_property on the
-side channel to read and write properties.
+The `FloatPropertiesChannel` will allow you to get and set pre-defined numerical values in the environment. This can be useful for adjusting environment-specific settings, or for reading non-agent related information from the environment. You can call `get_property` and `set_property` on the side channel to read and write properties.
+
 `FloatPropertiesChannel` has three methods:

 * `set_property` Sets a property in the Unity Environment.
 channel.set_property("parameter_1", 2.0)

 i = env.reset()
+
+readout_value = channel.get_property("parameter_2")
 ...
 ```

 float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
 ```

-#### [Advanced] Create your own SideChannel
-
-You can create your own `SideChannel` in C# and Python and use it to communicate data between the two.
-
-##### Unity side
-The side channel will have to implement the `SideChannel` abstract class and the following method.
-
- * `OnMessageReceived(byte[] data)` : You must implement this method to specify what the side channel will be doing
- with the data received from Python. The data is a `byte[]` argument.
-
-The side channel must also assign a `ChannelId` property in the constructor. The `ChannelId` is a Guid
-(or UUID in Python) used to uniquely identify a side channel. This Guid must be the same on C# and Python.
-There can only be one side channel of a certain id during communication.
-
-To send a byte array from C# to Python, call the `base.QueueMessageToSend(data)` method inside the side channel.
-The `data` argument must be a `byte[]`.
-
-To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
-as only argument.
-
-##### Python side
-The side channel will have to implement the `SideChannel` abstract class. You must implement :
-
- * `on_message_received(self, data: bytes) -> None` : You must implement this method to specify what the
- side channel will be doing with the data received from Unity. The data is a `byte[]` argument.
-
-The side channel must also assign a `channel_id` property in the constructor. The `channel_id` is a UUID
-(referred in C# as Guid) used to uniquely identify a side channel. This number must be the same on C# and
-Python. There can only be one side channel of a certain id during communication.
-
-To assign the `channel_id` call the abstract class constructor with the appropriate `channel_id` as follows:
-
-```python
-super().__init__(my_channel_id)
-```
-
-To send a byte array from Python to C#, call the `super().queue_message_to_send(bytes_data)` method inside the
-side channel. The `bytes_data` argument must be a `bytes` object.
-
-To register a side channel on the Python side, pass the side channel as argument when creating the
-`UnityEnvironment` object. One of the arguments of the constructor (`side_channels`) is a list of side channels.
-
-##### Example implementation
-
-Here is a simple implementation of a Side Channel that will exchange strings between C# and Python
-(encoded as ascii).
-
-One the C# side :
-Here is an implementation of a `StringLogSideChannel` that will listed to the `UnityEngine.Debug.LogError` calls in
-the game :
-
-```csharp
-using UnityEngine;
-using MLAgents;
-using System.Text;
-using System;
-
-public class StringLogSideChannel : SideChannel
-{
-    public StringLogSideChannel()
-    {
-        ChannelId = new Guid("621f0a70-4f87-11ea-a6bf-784f4387d1f7");
-    }
-
-    public override void OnMessageReceived(byte[] data)
-    {
-        var receivedString = Encoding.ASCII.GetString(data);
-        Debug.Log("From Python : " + receivedString);
-    }
-
-    public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
-    {
-        if (type == LogType.Error)
-        {
-            var stringToSend = type.ToString() + ": " + logString + "\n" + stackTrace;
-            var encodedString = Encoding.ASCII.GetBytes(stringToSend);
-            base.QueueMessageToSend(encodedString);
-        }
-    }
-}
-```
-
-We also need to register this side channel to the Academy and to the `Application.logMessageReceived` events,
-so we write a simple MonoBehavior for this. (Do not forget to attach it to a GameObject in the scene).
-
-```csharp
-using UnityEngine;
-using MLAgents;
-
-
-public class RegisterStringLogSideChannel : MonoBehaviour
-{
-
-    StringLogSideChannel stringChannel;
-    public void Awake()
-    {
-        // We create the Side Channel
-        stringChannel = new StringLogSideChannel();
-
-        // When a Debug.Log message is created, we send it to the stringChannel
-        Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
-
-        // Just in case the Academy has not yet initialized
-        Academy.Instance.RegisterSideChannel(stringChannel);
-    }
-
-    public void OnDestroy()
-    {
-        // De-register the Debug.Log callback
-        Application.logMessageReceived -= stringChannel.SendDebugStatementToPython;
-        if (Academy.IsInitialized){
-            Academy.Instance.UnregisterSideChannel(stringChannel);
-        }
-    }
-
-    public void Update()
-    {
-        // Optional : If the space bar is pressed, raise an error !
-        if (Input.GetKeyDown(KeyCode.Space))
-        {
-            Debug.LogError("This is a fake error. Space bar was pressed in Unity.");
-        }
-    }
-}
-```
-
-And here is the script on the Python side. This script creates a new Side channel type (`StringLogChannel`) and
-launches a `UnityEnvironment` with that side channel.
-
-```python
-
-from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.side_channel.side_channel import SideChannel
-import numpy as np
-
-
-# Create the StringLogChannel class
-class StringLogChannel(SideChannel):
-
-    def __init__(self) -> None:
-        super().__init__(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))
+#### Custom side channels
-    def on_message_received(self, data: bytes) -> None:
-        """
-        Note :We must implement this method of the SideChannel interface to
-        receive messages from Unity
-        """
-        # We simply print the data received interpreted as ascii
-        print(data.decode("ascii"))
-
-    def send_string(self, data: str) -> None:
-        # Convert the string to ascii
-        bytes_data = data.encode("ascii")
-        # We call this method to queue the data we want to send
-        super().queue_message_to_send(bytes_data)
-
-# Create the channel
-string_log = StringLogChannel()
-
-# We start the communication with the Unity Editor and pass the string_log side channel as input
-env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
-env.reset()
-string_log.send_string("The environment was reset")
-
-group_name = env.get_agent_groups()[0]  # Get the first group_name
-for i in range(1000):
-    step_data = env.get_step_result(group_name)
-    n_agents = step_data.n_agents()  # Get the number of agents
-    # We send data to Unity : A string with the number of Agent at each
-    string_log.send_string(
-        "Step " + str(i) + " occurred with " + str(n_agents) + " agents."
-    )
-    env.step()  # Move the simulation forward
-
-env.close()
-```
-
-Now, if you run this script and press `Play` the Unity Editor when prompted, The console in the Unity Editor will
-display a message at every Python step. Additionally, if you press the Space Bar in the Unity Engine, a message will
-appear in the terminal.
+For information on how to make custom side channels for sending additional data types, see the documentation [here](Custom-SideChannels.md).
--- a/docs/Readme.md
+++ b/docs/Readme.md
  * [Using the Monitor](Feature-Monitor.md)
  * [Using the Video Recorder](https://github.com/Unity-Technologies/video-recorder)
  * [Using an Executable Environment](Learning-Environment-Executable.md)
+  * [Creating Custom Side Channels](Custom-SideChannels.md)

 ## Training

--- a/docs/Training-Curriculum-Learning.md
+++ b/docs/Training-Curriculum-Learning.md
  greater than number of thresholds.

 Once our curriculum is defined, we have to use the reset parameters we defined
-and modify the environment from the Agent's `AgentReset()` function. See
+and modify the environment from the Agent's `OnEpisodeBegin()` function. See
 [WallJumpAgent.cs](https://github.com/Unity-Technologies/ml-agents/blob/master/Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs)
 for an example.

--- a/ml-agents/mlagents/trainers/components/reward_signals/init.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/init.py

 from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.policy.tf_policy import TFPolicy
+from mlagents.trainers.buffer import AgentBuffer

 logger = logging.getLogger("mlagents.trainers")

        self.strength = strength
        self.stats_name_to_update_name: Dict[str, str] = {}

-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        """
        Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
        function drawn straight from a Buffer.
        )

    def prepare_update(
-        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
+        self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict
--- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
 from mlagents.trainers.policy.tf_policy import TFPolicy
+from mlagents.trainers.buffer import AgentBuffer


 class CuriosityRewardSignal(RewardSignal):
        }
        self.has_updated = False

-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.batch_size_ph: len(mini_batch["actions"]),
            self.policy.sequence_length_ph: self.policy.sequence_length,
        super().check_config(config_dict, param_keys)

    def prepare_update(
-        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
+        self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        Prepare for update and get feed_dict.
--- a/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
 import numpy as np

 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
+from mlagents.trainers.buffer import AgentBuffer


 class ExtrinsicRewardSignal(RewardSignal):
        param_keys = ["strength", "gamma"]
        super().check_config(config_dict, param_keys)

-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
        return RewardSignalResult(self.strength * env_rews, env_rews)
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from .model import GAILModel
 from mlagents.trainers.demo_loader import demo_to_buffer
+from mlagents.trainers.buffer import AgentBuffer


 class GAILRewardSignal(RewardSignal):
            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
        }

-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.batch_size_ph: len(mini_batch["actions"]),
            self.policy.sequence_length_ph: self.policy.sequence_length,
        super().check_config(config_dict, param_keys)

    def prepare_update(
-        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
+        self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        Prepare inputs for update. .
        """
-        max_num_experiences = min(
-            len(mini_batch["actions"]), self.demonstration_buffer.num_experiences
-        )
-        # If num_sequences is less, we need to shorten the input batch.
-        for key, element in mini_batch.items():
-            mini_batch[key] = element[:max_num_experiences]
-
-        # Get batch from demo buffer
+        # Get batch from demo buffer. Even if demo buffer is smaller, we sample with replacement
-            len(mini_batch["actions"]), 1
+            mini_batch.num_experiences, 1
        )

        feed_dict: Dict[tf.Tensor, Any] = {
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
        return update_stats

    def update_reward_signals(
-        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
+        self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
    ) -> Dict[str, float]:
        """
        Only update the reward signals.
        feed_dict: Dict[tf.Tensor, Any],
        update_dict: Dict[str, tf.Tensor],
        stats_needed: Dict[str, str],
-        reward_signal_minibatches: Mapping[str, Dict],
+        reward_signal_minibatches: Mapping[str, AgentBuffer],
        num_sequences: int,
    ) -> None:
        """
--- a/ml-agents/mlagents/trainers/tests/test_ppo.py
+++ b/ml-agents/mlagents/trainers/tests/test_ppo.py
 from mlagents.trainers.tests import mock_brain as mb
 from mlagents.trainers.tests.mock_brain import make_brain_parameters
 from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
+from mlagents.trainers.tests.test_reward_signals import (  # noqa: F401; pylint: disable=unused-variable
+    curiosity_dummy_config,
+    gail_dummy_config,
+)


@pytest.fixture
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
+    )
+
+
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+# We need to test this separately from test_reward_signals.py to ensure no interactions
+def test_ppo_optimizer_update_curiosity(
+    curiosity_dummy_config, dummy_config, rnn, visual, discrete  # noqa: F811
+):
+    # Test evaluate
+    tf.reset_default_graph()
+    dummy_config["reward_signals"].update(curiosity_dummy_config)
+    optimizer = _create_ppo_optimizer_ops_mock(
+        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
+    )
+    # Test update
+    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
+    # Mock out reward signal eval
+    update_buffer["advantages"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
+    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
+    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
+    optimizer.update(
+        update_buffer,
+        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
+    )
+
+
+# We need to test this separately from test_reward_signals.py to ensure no interactions
+def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
+    # Test evaluate
+    tf.reset_default_graph()
+    dummy_config["reward_signals"].update(gail_dummy_config)
+    optimizer = _create_ppo_optimizer_ops_mock(
+        dummy_config, use_rnn=False, use_discrete=False, use_visual=False
+    )
+    # Test update
+    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
+    # Mock out reward signal eval
+    update_buffer["advantages"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
+    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
+    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
+    optimizer.update(
+        update_buffer,
+        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
+    )
+
+    # Check if buffer size is too big
+    update_buffer = mb.simulate_rollout(3000, optimizer.policy.brain)
+    # Mock out reward signal eval
+    update_buffer["advantages"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
+    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
+    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
+    optimizer.update(
+        update_buffer,
+        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )


--- a/docs/Custom-SideChannels.md
+++ b/docs/Custom-SideChannels.md
+# Custom Side Channels
+
+You can create your own side channel in C# and Python and use it to communicate
+custom data structures between the two. This can be useful for situations in
+which the data to be sent is too complex or structured for the built-in
+`FloatPropertiesChannel`, or is not related to any specific agent, and therefore
+inappropriate as an agent observation.
+
+## Overview
+
+In order to use a side channel, it must be implemented as both Unity and Python classes.
+
+### Unity side
+The side channel will have to implement the `SideChannel` abstract class and the following method.
+
+ * `OnMessageReceived(byte[] data)` : You must implement this method to specify what the side channel will be doing
+ with the data received from Python. The data is a `byte[]` argument.
+
+The side channel must also assign a `ChannelId` property in the constructor. The `ChannelId` is a Guid
+(or UUID in Python) used to uniquely identify a side channel. This Guid must be the same on C# and Python.
+There can only be one side channel of a certain id during communication.
+
+To send a byte array from C# to Python, call the `base.QueueMessageToSend(data)` method inside the side channel.
+The `data` argument must be a `byte[]`.
+
+To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
+as the only argument.
+
+### Python side
+The side channel will have to implement the `SideChannel` abstract class. You must implement :
+
+ * `on_message_received(self, data: bytes) -> None` : You must implement this method to specify what the
+ side channel will be doing with the data received from Unity. The data is a `bytes` argument.
+
+The side channel must also assign a `channel_id` property in the constructor. The `channel_id` is a UUID
+(referred to in C# as Guid) used to uniquely identify a side channel. This number must be the same on C# and
+Python. There can only be one side channel of a certain id during communication.
+
+To assign the `channel_id` call the abstract class constructor with the appropriate `channel_id` as follows:
+
+```python
+super().__init__(my_channel_id)
+```
+
+To send a byte array from Python to C#, call the `super().queue_message_to_send(bytes_data)` method inside the
+side channel. The `bytes_data` argument must be a `bytes` object.
+
+To register a side channel on the Python side, pass the side channel as argument when creating the
+`UnityEnvironment` object. One of the arguments of the constructor (`side_channels`) is a list of side channels.
+
+## Example implementation
+
+Below is a simple implementation of a side channel that will exchange ASCII-encoded
+strings between a Unity environment and Python.
+
+### Example Unity C# code
+
+The first step is to create the `StringLogSideChannel` class within the Unity project.
+Here is an implementation of a `StringLogSideChannel` that will listen for messages
+from Python and print them to the Unity debug log, as well as send error messages
+from Unity to Python.
+
+```csharp
+using UnityEngine;
+using MLAgents;
+using MLAgents.SideChannels;
+using System.Text;
+using System;
+
+/// <summary>
+/// Side channel that exchanges ASCII-encoded strings with Python: messages
+/// received from Python are written to the Unity debug log, and Unity error
+/// log entries are queued to be sent to Python.
+/// </summary>
+public class StringLogSideChannel : SideChannel
+{
+    public StringLogSideChannel()
+    {
+        // This Guid must be the same as the UUID used by the Python side channel.
+        ChannelId = new Guid("621f0a70-4f87-11ea-a6bf-784f4387d1f7");
+    }
+
+    /// <summary>
+    /// Decodes the bytes received from Python as ASCII and logs the result.
+    /// </summary>
+    /// <param name="data">Raw message bytes received from Python.</param>
+    public override void OnMessageReceived(byte[] data)
+    {
+        var message = Encoding.ASCII.GetString(data);
+        Debug.Log("From Python : " + message);
+    }
+
+    /// <summary>
+    /// Log callback that queues error messages to be sent to Python.
+    /// Log entries of any other type are ignored.
+    /// </summary>
+    /// <param name="logString">The logged message.</param>
+    /// <param name="stackTrace">Stack trace associated with the message.</param>
+    /// <param name="type">Type of the log entry.</param>
+    public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
+    {
+        // Only errors are forwarded.
+        if (type != LogType.Error)
+        {
+            return;
+        }
+
+        var text = type.ToString() + ": " + logString + "\n" + stackTrace;
+        var payload = Encoding.ASCII.GetBytes(text);
+        base.QueueMessageToSend(payload);
+    }
+}
+```
+
+Once we have defined our custom side channel class, we need to ensure that it is
+instantiated and registered. This can typically be done wherever the logic of
+the side channel makes sense to be associated, for example on a MonoBehaviour
+object that might need to access data from the side channel. Here we show a
+simple MonoBehaviour object which instantiates and registers the new side
+channel. If you have not done it already, make sure that the MonoBehaviour
+which registers the side channel is attached to a gameobject which will
+be live in your Unity scene.
+
+```csharp
+using UnityEngine;
+using MLAgents;
+
+
+/// <summary>
+/// MonoBehaviour that creates a <see cref="StringLogSideChannel"/>, subscribes
+/// it to Unity log messages and registers it with the Academy.
+/// </summary>
+public class RegisterStringLogSideChannel : MonoBehaviour
+{
+    StringLogSideChannel m_StringChannel;
+
+    public void Awake()
+    {
+        // Create the side channel.
+        m_StringChannel = new StringLogSideChannel();
+
+        // Every log message received by the application is passed to the channel.
+        Application.logMessageReceived += m_StringChannel.SendDebugStatementToPython;
+
+        // The channel must be registered with the Academy.
+        Academy.Instance.RegisterSideChannel(m_StringChannel);
+    }
+
+    public void OnDestroy()
+    {
+        // Remove the log callback added in Awake.
+        Application.logMessageReceived -= m_StringChannel.SendDebugStatementToPython;
+
+        // Only unregister when the Academy is initialized.
+        if (!Academy.IsInitialized)
+        {
+            return;
+        }
+
+        Academy.Instance.UnregisterSideChannel(m_StringChannel);
+    }
+
+    public void Update()
+    {
+        // Optional: pressing the space bar raises a fake error.
+        if (!Input.GetKeyDown(KeyCode.Space))
+        {
+            return;
+        }
+
+        Debug.LogError("This is a fake error. Space bar was pressed in Unity.");
+    }
+}
+```
+
+### Example Python code
+
+Now that we have created the necessary Unity C# classes, we can create their Python counterparts.
+
+```python
+from mlagents_envs.environment import UnityEnvironment
+from mlagents_envs.side_channel.side_channel import SideChannel
+import numpy as np
+import uuid
+
+
+# Create the StringLogChannel class
+class StringLogChannel(SideChannel):
+
+    def __init__(self) -> None:
+        super().__init__(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))
+
+    def on_message_received(self, data: bytes) -> None:
+        """
+        Note :We must implement this method of the SideChannel interface to
+        receive messages from Unity
+        """
+        # We simply print the data received interpreted as ascii
+        print(data.decode("ascii"))
+
+    def send_string(self, data: str) -> None:
+        # Convert the string to ascii
+        bytes_data = data.encode("ascii")
+        # We call this method to queue the data we want to send
+        super().queue_message_to_send(bytes_data)
+```
+
+
+We can then instantiate the new side channel,
+launch a `UnityEnvironment` with that side channel active, and send a series of
+messages to the Unity environment from Python using it.
+
+```python
+# Create the channel
+string_log = StringLogChannel()
+
+# We start the communication with the Unity Editor and pass the string_log side channel as input
+env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
+env.reset()
+string_log.send_string("The environment was reset")
+
+group_name = env.get_agent_groups()[0]  # Get the first group_name
+for i in range(1000):
+    step_data = env.get_step_result(group_name)
+    n_agents = step_data.n_agents()  # Get the number of agents
+    # We send data to Unity : A string with the number of Agent at each
+    string_log.send_string(
+        "Step " + str(i) + " occurred with " + str(n_agents) + " agents."
+    )
+    env.step()  # Move the simulation forward
+
+env.close()
+```
+
+Now, if you run this script and press `Play` in the Unity Editor when prompted,
+the console in the Unity Editor will display a message at every Python step.
+Additionally, if you press the Space Bar in the Unity Engine, a message will
+appear in the terminal.