
Merge branch 'master' into develop-sac-apex

/develop/sac-apex
Ervin Teng, 4 years ago
Current commit
06fa3d39
113 changed files, with 1,991 additions and 966 deletions
  1. .pylintrc (2)
  2. .yamato/standalone-build-test.yml (4)
  3. .yamato/training-int-tests.yml (7)
  4. Dockerfile (8)
  5. Project/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs (30)
  6. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (2)
  7. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs (2)
  8. Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs (2)
  9. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs (5)
  10. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorSettings.cs (12)
  11. Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity (18)
  12. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (3)
  13. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs (2)
  14. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridSettings.cs (3)
  15. Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs (5)
  16. Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3)
  17. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs (3)
  18. Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs (3)
  19. Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs (2)
  20. Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (2)
  21. Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs (13)
  22. README.md (4)
  23. com.unity.ml-agents/CHANGELOG.md (22)
  24. com.unity.ml-agents/Runtime/Academy.cs (43)
  25. com.unity.ml-agents/Runtime/Agent.cs (3)
  26. com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs (13)
  27. com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs (184)
  28. com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs (5)
  29. com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs (20)
  30. com.unity.ml-agents/Runtime/SideChannels/EngineConfigurationChannel.cs (7)
  31. com.unity.ml-agents/Runtime/SideChannels/SideChannel.cs (2)
  32. com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (8)
  33. com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs (30)
  34. com.unity.ml-agents/Tests/Editor/SideChannelTests.cs (16)
  35. config/trainer_config.yaml (6)
  36. docs/Custom-SideChannels.md (10)
  37. docs/Getting-Started.md (23)
  38. docs/Installation.md (3)
  39. docs/Learning-Environment-Create-New.md (7)
  40. docs/Learning-Environment-Design-Agents.md (1)
  41. docs/Learning-Environment-Examples.md (1)
  42. docs/Learning-Environment-Executable.md (8)
  43. docs/Migrating.md (17)
  44. docs/Python-API.md (11)
  45. docs/Readme.md (1)
  46. docs/Training-Curriculum-Learning.md (4)
  47. docs/Training-Environment-Parameter-Randomization.md (6)
  48. docs/Training-ML-Agents.md (38)
  49. docs/Training-Self-Play.md (98)
  50. docs/Using-Docker.md (9)
  51. docs/Using-Tensorboard.md (7)
  52. gym-unity/README.md (2)
  53. gym-unity/gym_unity/envs/__init__.py (34)
  54. gym-unity/gym_unity/tests/test_gym.py (48)
  55. ml-agents-envs/mlagents_envs/communicator.py (2)
  56. ml-agents-envs/mlagents_envs/environment.py (117)
  57. ml-agents-envs/mlagents_envs/mock_communicator.py (3)
  58. ml-agents-envs/mlagents_envs/rpc_communicator.py (3)
  59. ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py (4)
  60. ml-agents-envs/mlagents_envs/side_channel/side_channel.py (4)
  61. ml-agents-envs/mlagents_envs/tests/test_envs.py (23)
  62. ml-agents/mlagents/model_serialization.py (5)
  63. ml-agents/mlagents/trainers/agent_processor.py (52)
  64. ml-agents/mlagents/trainers/behavior_id_utils.py (52)
  65. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (5)
  66. ml-agents/mlagents/trainers/curriculum.py (4)
  67. ml-agents/mlagents/trainers/distributions.py (38)
  68. ml-agents/mlagents/trainers/env_manager.py (15)
  69. ml-agents/mlagents/trainers/ghost/trainer.py (439)
  70. ml-agents/mlagents/trainers/learn.py (145)
  71. ml-agents/mlagents/trainers/meta_curriculum.py (4)
  72. ml-agents/mlagents/trainers/policy/nn_policy.py (1)
  73. ml-agents/mlagents/trainers/policy/tf_policy.py (34)
  74. ml-agents/mlagents/trainers/ppo/trainer.py (11)
  75. ml-agents/mlagents/trainers/sac/optimizer.py (5)
  76. ml-agents/mlagents/trainers/sac/trainer.py (11)
  77. ml-agents/mlagents/trainers/simple_env_manager.py (6)
  78. ml-agents/mlagents/trainers/stats.py (47)
  79. ml-agents/mlagents/trainers/subprocess_env_manager.py (124)
  80. ml-agents/mlagents/trainers/tests/simple_test_envs.py (80)
  81. ml-agents/mlagents/trainers/tests/test_agent_processor.py (43)
  82. ml-agents/mlagents/trainers/tests/test_distributions.py (10)
  83. ml-agents/mlagents/trainers/tests/test_ghost.py (36)
  84. ml-agents/mlagents/trainers/tests/test_learn.py (48)
  85. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (4)
  86. ml-agents/mlagents/trainers/tests/test_simple_rl.py (129)
  87. ml-agents/mlagents/trainers/tests/test_stats.py (30)
  88. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (55)
  89. ml-agents/mlagents/trainers/tests/test_trainer_util.py (22)
  90. ml-agents/mlagents/trainers/trainer/trainer.py (10)
  91. ml-agents/mlagents/trainers/trainer_controller.py (25)
  92. ml-agents/mlagents/trainers/trainer_util.py (41)
  93. ml-agents/setup.py (2)
  94. ml-agents/tests/yamato/training_int_tests.py (67)
  95. ml-agents/tests/yamato/yamato_utils.py (69)
  96. setup.cfg (1)
  97. .yamato/gym-interface-test.yml (32)
  98. .yamato/python-ll-api-test.yml (32)
  99. com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs (234)
  100. com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs.meta (11)

2
.pylintrc


# Appears to be https://github.com/PyCQA/pylint/issues/2981
W0201,
# Using the global statement
W0603,

4
.yamato/standalone-build-test.yml


- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
artifacts:
standalonebuild:
paths:
- "Project/testPlayer*/**"
{% endfor %}

7
.yamato/training-int-tests.yml


commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.training_int_tests
# Backwards-compatibility tests.
# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:

8
Dockerfile


WORKDIR /ml-agents
RUN pip install -e .
# port 5005 is the port used in in Editor training.
EXPOSE 5005
# Port 5004 is the port used in in Editor training.
# Environments will start from port 5005,
# so allow enough ports for several environments.
EXPOSE 5004-5050
ENTRYPOINT ["mlagents-learn"]
ENTRYPOINT ["xvfb-run", "--auto-servernum", "--server-args='-screen 0 640x480x24'", "mlagents-learn"]

30
Project/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs


{
public class StandaloneBuildTest
{
const string k_outputCommandLineFlag = "--mlagents-build-output-path";
const string k_sceneCommandLineFlag = "--mlagents-build-scene-path";
string[] scenes = { "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity" };
var buildResult = BuildPipeline.BuildPlayer(scenes, "testPlayer", BuildTarget.StandaloneOSX, BuildOptions.None);
// Read commandline arguments for options
var outputPath = "testPlayer";
var scenePath = "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity";
var args = Environment.GetCommandLineArgs();
for (var i = 0; i < args.Length - 1; i++)
{
if (args[i] == k_outputCommandLineFlag)
{
outputPath = args[i + 1];
Debug.Log($"Overriding output path to {outputPath}");
}
else if (args[i] == k_sceneCommandLineFlag)
{
scenePath = args[i + 1];
}
}
string[] scenes = { scenePath };
var buildResult = BuildPipeline.BuildPlayer(
scenes,
outputPath,
BuildTarget.StandaloneOSX,
BuildOptions.None
);
var isOk = buildResult.summary.result == BuildResult.Succeeded;
var error = "";
foreach (var stepInfo in buildResult.steps)

2
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

2
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs


public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

2
Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs


m_Rb = gameObject.GetComponent<Rigidbody>();
m_LookDir = Vector3.zero;
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

5
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


using UnityEngine;
using MLAgents;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class FoodCollectorAgent : Agent
{

public void SetLaserLengths()
{
m_LaserLength = Academy.Instance.FloatProperties.GetPropertyWithDefault("laser_length", 1.0f);
m_LaserLength = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("laser_length", 1.0f);
float agentScale = Academy.Instance.FloatProperties.GetPropertyWithDefault("agent_scale", 1.0f);
float agentScale = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("agent_scale", 1.0f);
gameObject.transform.localScale = new Vector3(agentScale, agentScale, agentScale);
}

12
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorSettings.cs


using UnityEngine;
using UnityEngine.UI;
using MLAgents;
using MLAgents.SideChannels;
public class FoodCollectorSettings : MonoBehaviour
{

public int totalScore;
public Text scoreText;
StatsSideChannel m_statsSideChannel;
m_statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
}
public void EnvironmentReset()

public void Update()
{
scoreText.text = $"Score: {totalScore}";
// Send stats via SideChannel so that they'll appear in TensorBoard.
// These values get averaged every summary_frequency steps, so we don't
// need to send every Update() call.
if ((Time.frameCount % 100)== 0)
{
m_statsSideChannel?.AddStat("TotalScore", totalScore);
}
}
}

18
Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.44971162, g: 0.49977726, b: 0.5756362, a: 1}
m_IndirectSpecularColor: {r: 0.44971168, g: 0.4997775, b: 0.57563686, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:

m_Father: {fileID: 363761400}
m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
m_AnchoredPosition: {x: -369.5, y: -62.2}
m_SizeDelta: {x: 160, y: 55.6}
m_AnchorMin: {x: 0, y: 1}
m_AnchorMax: {x: 0, y: 1}
m_AnchoredPosition: {x: 150, y: -230}
m_SizeDelta: {x: 160, y: 55.599976}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &918893360
MonoBehaviour:

m_Calls: []
m_FontData:
m_Font: {fileID: 10102, guid: 0000000000000000e000000000000000, type: 0}
m_FontSize: 20
m_FontSize: 22
m_FontStyle: 0
m_BestFit: 0
m_MinSize: 2

m_Father: {fileID: 363761400}
m_RootOrder: 2
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
m_AnchoredPosition: {x: -369.5, y: -197}
m_AnchorMin: {x: 0, y: 1}
m_AnchorMax: {x: 0, y: 1}
m_AnchoredPosition: {x: 150, y: -128}
m_SizeDelta: {x: 200, y: 152}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &1305247361

3
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs


using MLAgents;
using MLAgents.Sensors;
using UnityEngine.Serialization;
using MLAgents.SideChannels;
public class GridAgent : Agent
{

// Prevents the agent from picking an action that would make it collide with a wall
var positionX = (int)transform.position.x;
var positionZ = (int)transform.position.z;
var maxPosition = (int)Academy.Instance.FloatProperties.GetPropertyWithDefault("gridSize", 5f) - 1;
var maxPosition = (int)SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("gridSize", 5f) - 1;
if (positionX == 0)
{

2
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs


public void Start()
{
m_ResetParameters = Academy.Instance.FloatProperties;
m_ResetParameters = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
m_Objects = new[] { goalPref, pitPref };

3
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridSettings.cs


using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
public class GridSettings : MonoBehaviour
{

{
Academy.Instance.FloatProperties.RegisterCallback("gridSize", f =>
SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().RegisterCallback("gridSize", f =>
{
MainCamera.transform.position = new Vector3(-(f - 1) / 2f, f * 1.25f, -(f - 1) / 2f);
MainCamera.orthographicSize = (f + 5f) / 2f;

5
Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs


using System.Collections;
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
public class PushAgentBasic : Agent
{

public void SetGroundMaterialFriction()
{
var resetParams = Academy.Instance.FloatProperties;
var resetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var groundCollider = ground.GetComponent<Collider>();

public void SetBlockProperties()
{
var resetParams = Academy.Instance.FloatProperties;
var resetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var scale = resetParams.GetPropertyWithDefault("block_scale", 2);
//Set the scale of the block

3
Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs


using UnityEngine;
using MLAgents;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class ReacherAgent : Agent
{

public void SetResetParameters()
{
var fp = Academy.Instance.FloatProperties;
var fp = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
m_GoalSize = fp.GetPropertyWithDefault("goal_size", 5);
m_GoalSpeed = Random.Range(-1f, 1f) * fp.GetPropertyWithDefault("goal_speed", 1);
m_Deviation = fp.GetPropertyWithDefault("deviation", 0);

3
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs


using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
namespace MLAgentsExamples
{

Physics.defaultSolverIterations = solverIterations;
Physics.defaultSolverVelocityIterations = solverVelocityIterations;
Academy.Instance.FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public void OnDestroy()

3
Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs


using System.Collections;
using System.Collections.Generic;
using MLAgents;
using MLAgents.SideChannels;
using UnityEngine;
using UnityEngine.Serialization;

ballRb.velocity = Vector3.zero;
ballRb.angularVelocity = Vector3.zero;
var ballScale = Academy.Instance.FloatProperties.GetPropertyWithDefault("ball_scale", 0.015f);
var ballScale = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("ball_scale", 0.015f);
ballRb.transform.localScale = new Vector3(ballScale, ballScale, ballScale);
}
}

2
Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


m_BallRb = ball.GetComponent<Rigidbody>();
var canvas = GameObject.Find(k_CanvasName);
GameObject scoreBoard;
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
if (invertX)
{
scoreBoard = canvas.transform.Find(k_ScoreBoardBName).gameObject;

2
Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs


m_ChestRb = chest.GetComponent<Rigidbody>();
m_SpineRb = spine.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

13
Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs


using MLAgents;
using Barracuda;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class WallJumpAgent : Agent
{

Vector3 m_JumpTargetPos;
Vector3 m_JumpStartingPos;
FloatPropertiesChannel m_FloatProperties;
public override void Initialize()
{
m_WallJumpSettings = FindObjectOfType<WallJumpSettings>();

m_GroundMaterial = m_GroundRenderer.material;
spawnArea.SetActive(false);
m_FloatProperties = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
}
// Begin the jump sequence

{
localScale = new Vector3(
localScale.x,
Academy.Instance.FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
m_FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
localScale.z);
wall.transform.localScale = localScale;
SetModel("SmallWallJump", noWallBrain);

localScale = new Vector3(
localScale.x,
Academy.Instance.FloatProperties.GetPropertyWithDefault("small_wall_height", 4),
m_FloatProperties.GetPropertyWithDefault("small_wall_height", 4),
localScale.z);
wall.transform.localScale = localScale;
SetModel("SmallWallJump", smallWallBrain);

var min = Academy.Instance.FloatProperties.GetPropertyWithDefault("big_wall_min_height", 8);
var max = Academy.Instance.FloatProperties.GetPropertyWithDefault("big_wall_max_height", 8);
var min = m_FloatProperties.GetPropertyWithDefault("big_wall_min_height", 8);
var max = m_FloatProperties.GetPropertyWithDefault("big_wall_max_height", 8);
var height = min + Random.value * (max - min);
localScale = new Vector3(
localScale.x,

4
README.md


* Train using concurrent Unity environment instances
## Releases & Documentation
**Our latest, stable release is 0.15.0. Click
**Our latest, stable release is 0.15.1. Click
get started with the latest release of ML-Agents.**
The table below lists all our releases, including our `master` branch which is under active

| **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
|:-------:|:------:|:-------------:|:-------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) |
| **0.15.1** | **March 30, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.1.zip)** |
| **0.15.0** | **March 18, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.0.zip)** |
| **0.14.1** | February 26, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.1.zip) |
| **0.14.0** | February 13, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.0.zip) |

22
com.unity.ml-agents/CHANGELOG.md


## [Unreleased]
### Major Changes
- The `--load` and `--train` command-line flags have been deprecated. Training now happens by default, and
use `--resume` to resume training instead. (#3705)
- The Jupyter notebooks have been removed from the repository.
- Introduced the `SideChannelUtils` to register, unregister and access side channels.
- `Academy.FloatProperties` was removed, please use `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- Added a feature to allow sending stats from C# environments to TensorBoard (and other python StatsWriters). To do this from your code, use `SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)` (#3660)
- The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
- Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
- Running `mlagents-learn` with the same `--run-id` twice will no longer overwrite the existing files. (#3705)
- `StackingSensor` was changed from `internal` visibility to `public`
## [0.15.1-preview] - 2020-03-30
### Bug Fixes
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
- Fixed the reported entropy values for continuous actions (#3684)
- Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
- Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
- Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
- Fixed an issue where logging output was not visible; logging levels are now set consistently. (#3703)
## [0.15.0-preview] - 2020-03-18
### Major Changes

43
com.unity.ml-agents/Runtime/Academy.cs


/// </summary>
public static Academy Instance { get { return s_Lazy.Value; } }
/// <summary>
/// Collection of float properties (indexed by a string).
/// </summary>
public FloatPropertiesChannel FloatProperties;
// Fields not provided in the Inspector.
/// <summary>

}
/// <summary>
/// Registers SideChannel to the Academy to send and receive data with Python.
/// If IsCommunicatorOn is false, the SideChannel will not be registered.
/// </summary>
/// <param name="channel"> The side channel to be registered.</param>
public void RegisterSideChannel(SideChannel channel)
{
LazyInitialize();
Communicator?.RegisterSideChannel(channel);
}
/// <summary>
/// Unregisters SideChannel to the Academy. If the side channel was not registered,
/// nothing will happen.
/// </summary>
/// <param name="channel"> The side channel to be unregistered.</param>
public void UnregisterSideChannel(SideChannel channel)
{
Communicator?.UnregisterSideChannel(channel);
}
/// <summary>
/// Disable stepping of the Academy during the FixedUpdate phase. If this is called, the Academy must be
/// stepped manually by the user by calling Academy.EnvironmentStep().
/// </summary>

{
EnableAutomaticStepping();
var floatProperties = new FloatPropertiesChannel();
FloatProperties = floatProperties;
SideChannelUtils.RegisterSideChannel(new EngineConfigurationChannel());
SideChannelUtils.RegisterSideChannel(new FloatPropertiesChannel());
SideChannelUtils.RegisterSideChannel(new StatsSideChannel());
// Try to launch the communicator by using the arguments passed at launch
var port = ReadPortFromArgs();

if (Communicator != null)
{
Communicator.RegisterSideChannel(new EngineConfigurationChannel());
Communicator.RegisterSideChannel(floatProperties);
// We try to exchange the first message with Python. If this fails, it means
// no Python Process is ready to train the environment. In this case, the
//environment must use Inference.

DecideAction?.Invoke();
}
// If the communicator is not on, we need to clear the SideChannel sending queue
if (!IsCommunicatorOn)
{
SideChannelUtils.GetSideChannelMessage();
}
using (TimerStack.Instance.Scoped("AgentAct"))
{
AgentAct?.Invoke();

Communicator?.Dispose();
Communicator = null;
SideChannelUtils.UnregisterAllSideChannels();
if (m_ModelRunners != null)
{

// TODO - Pass worker ID or some other identifier,
// so that multiple envs won't overwrite each others stats.
TimerStack.Instance.SaveJsonTimers();
FloatProperties = null;
m_Initialized = false;
// Reset the Lazy instance
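To summarize the Academy and CHANGELOG changes above: the Academy now registers the EngineConfigurationChannel, FloatPropertiesChannel, and StatsSideChannel itself, and user code retrieves them through SideChannelUtils instead of `Academy.FloatProperties`. Below is a minimal sketch of the resulting usage pattern, assuming an ML-Agents project at this revision; the class and field names are illustrative and not part of this commit.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative MonoBehaviour: reads a reset parameter and reports a custom stat.
public class SideChannelUsageExample : MonoBehaviour
{
    FloatPropertiesChannel m_ResetParams;
    StatsSideChannel m_Stats;

    void Start()
    {
        // Both channels are registered by the Academy during initialization (see above),
        // so user code only needs to look them up.
        m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
        m_Stats = SideChannelUtils.GetSideChannel<StatsSideChannel>();
    }

    void Update()
    {
        var scale = m_ResetParams.GetPropertyWithDefault("agent_scale", 1.0f);
        // Stats are averaged on the Python side, so there is no need to send them every frame.
        if (Time.frameCount % 100 == 0)
        {
            m_Stats?.AddStat("AgentScale", scale);
        }
    }
}
```

Note that the example scripts in this commit (e.g. WallJumpAgent) cache the returned FloatPropertiesChannel in `Initialize()` rather than fetching it on every access.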

3
com.unity.ml-agents/Runtime/Agent.cs


void NotifyAgentDone(DoneReason doneReason)
{
m_Info.episodeId = m_EpisodeId;
m_Info.reward = m_Reward;
m_Info.done = true;
m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;

// If everything is the same, don't make any changes.
return;
}
NotifyAgentDone(DoneReason.Disabled);
m_PolicyFactory.model = model;
m_PolicyFactory.inferenceDevice = inferenceDevice;
m_PolicyFactory.behaviorName = behaviorName;

13
com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs


/// <param name="agentId">A key to identify which Agent actions to get.</param>
/// <returns></returns>
float[] GetActions(string key, int agentId);
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
void RegisterSideChannel(SideChannel sideChannel);
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
void UnregisterSideChannel(SideChannel sideChannel);
}
}

184
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


using MLAgents.Sensors;
using MLAgents.Policies;
using MLAgents.SideChannels;
using System.IO;
using Google.Protobuf;
namespace MLAgents

#endif
/// The communicator parameters sent at construction
CommunicatorInitParameters m_CommunicatorInitParameters;
Dictionary<Guid, SideChannel> m_SideChannels = new Dictionary<Guid, SideChannel>();
/// <summary>
/// Initializes a new instance of the RPCCommunicator class.

void UpdateEnvironmentWithInput(UnityRLInputProto rlInput)
{
ProcessSideChannelData(m_SideChannels, rlInput.SideChannel.ToArray());
SideChannelUtils.ProcessSideChannelData(rlInput.SideChannel.ToArray());
SendCommandEvent(rlInput.Command);
}

message.RlInitializationOutput = tempUnityRlInitializationOutput;
}
byte[] messageAggregated = GetSideChannelMessage(m_SideChannels);
byte[] messageAggregated = SideChannelUtils.GetSideChannelMessage();
message.RlOutput.SideChannel = ByteString.CopyFrom(messageAggregated);
var input = Exchange(message);

{
if (m_CurrentUnityRlOutput.AgentInfos.ContainsKey(behaviorName))
{
if (output == null)
if (m_CurrentUnityRlOutput.AgentInfos[behaviorName].CalculateSize() > 0)
output = new UnityRLInitializationOutputProto();
}
// Only send the BrainParameters if there is a non empty list of
// AgentInfos ready to be sent.
// This is to ensure that The Python side will always have a first
// observation when receiving the BrainParameters
if (output == null)
{
output = new UnityRLInitializationOutputProto();
}
var brainParameters = m_UnsentBrainKeys[behaviorName];
output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
var brainParameters = m_UnsentBrainKeys[behaviorName];
output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
}
}
}

{
m_SentBrainKeys.Add(brainProto.BrainName);
m_UnsentBrainKeys.Remove(brainProto.BrainName);
}
}
#endregion
#region Handling side channels
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
public void RegisterSideChannel(SideChannel sideChannel)
{
var channelId = sideChannel.ChannelId;
if (m_SideChannels.ContainsKey(channelId))
{
throw new UnityAgentsException(string.Format(
"A side channel with type index {0} is already registered. You cannot register multiple " +
"side channels of the same id.", channelId));
}
// Process any messages that we've already received for this channel ID.
var numMessages = m_CachedMessages.Count;
for (int i = 0; i < numMessages; i++)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (channelId == cachedMessage.ChannelId)
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannel.OnMessageReceived(incomingMsg);
}
}
else
{
m_CachedMessages.Enqueue(cachedMessage);
}
}
m_SideChannels.Add(channelId, sideChannel);
}
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
public void UnregisterSideChannel(SideChannel sideChannel)
{
if (m_SideChannels.ContainsKey(sideChannel.ChannelId))
{
m_SideChannels.Remove(sideChannel.ChannelId);
}
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a singe byte array.
/// </summary>
/// <param name="sideChannels"> A dictionary of channel type to channel.</param>
/// <returns></returns>
public static byte[] GetSideChannelMessage(Dictionary<Guid, SideChannel> sideChannels)
{
using (var memStream = new MemoryStream())
{
using (var binaryWriter = new BinaryWriter(memStream))
{
foreach (var sideChannel in sideChannels.Values)
{
var messageList = sideChannel.MessageQueue;
foreach (var message in messageList)
{
binaryWriter.Write(sideChannel.ChannelId.ToByteArray());
binaryWriter.Write(message.Count());
binaryWriter.Write(message);
}
sideChannel.MessageQueue.Clear();
}
return memStream.ToArray();
}
}
}
private struct CachedSideChannelMessage
{
public Guid ChannelId;
public byte[] Message;
}
private static Queue<CachedSideChannelMessage> m_CachedMessages = new Queue<CachedSideChannelMessage>();
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="sideChannels">A dictionary of channel type to channel.</param>
/// <param name="dataReceived">The byte array of data received from Python.</param>
public static void ProcessSideChannelData(Dictionary<Guid, SideChannel> sideChannels, byte[] dataReceived)
{
while (m_CachedMessages.Count != 0)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (sideChannels.ContainsKey(cachedMessage.ChannelId))
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannels[cachedMessage.ChannelId].OnMessageReceived(incomingMsg);
}
}
else
{
Debug.Log(string.Format(
"Unknown side channel data received. Channel Id is "
+ ": {0}", cachedMessage.ChannelId));
}
}
if (dataReceived.Length == 0)
{
return;
}
using (var memStream = new MemoryStream(dataReceived))
{
using (var binaryReader = new BinaryReader(memStream))
{
while (memStream.Position < memStream.Length)
{
Guid channelId = Guid.Empty;
byte[] message = null;
try
{
channelId = new Guid(binaryReader.ReadBytes(16));
var messageLength = binaryReader.ReadInt32();
message = binaryReader.ReadBytes(messageLength);
}
catch (Exception ex)
{
throw new UnityAgentsException(
"There was a problem reading a message in a SideChannel. Please make sure the " +
"version of MLAgents in Unity is compatible with the Python version. Original error : "
+ ex.Message);
}
if (sideChannels.ContainsKey(channelId))
{
using (var incomingMsg = new IncomingMessage(message))
{
sideChannels[channelId].OnMessageReceived(incomingMsg);
}
}
else
{
// Don't recognize this ID, but cache it in case the SideChannel that can handle
// it is registered before the next call to ProcessSideChannelData.
m_CachedMessages.Enqueue(new CachedSideChannelMessage
{
ChannelId = channelId,
Message = message
});
}
}
}
}
}

5
com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs


public void RequestDecision(AgentInfo info, List<ISensor> sensors)
{
StepSensors(sensors);
m_LastDecision = m_Heuristic.Invoke();
if (!info.done)
{
m_LastDecision = m_Heuristic.Invoke();
}
}
/// <inheritdoc />

20
com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs


/// For example, 4 stacked sets of observations would be output like
/// | t = now - 3 | t = now -3 | t = now - 2 | t = now |
/// Internally, a circular buffer of arrays is used. The m_CurrentIndex represents the most recent observation.
///
/// Currently, compressed and multidimensional observations are not supported.
internal class StackingSensor : ISensor
public class StackingSensor : ISensor
{
/// <summary>
/// The wrapped sensor.

WriteAdapter m_LocalAdapter = new WriteAdapter();
/// <summary>
///
/// Initializes the sensor.
/// </summary>
/// <param name="wrapped">The wrapped sensor.</param>
/// <param name="numStackedObservations">Number of stacked observations to keep.</param>

m_Name = $"StackingSensor_size{numStackedObservations}_{wrapped.GetName()}";
if (wrapped.GetCompressionType() != SensorCompressionType.None)
{
throw new UnityAgentsException("StackingSensor doesn't support compressed observations.'");
}
if (shape.Length != 1)
{
throw new UnityAgentsException("Only 1-D observations are supported by StackingSensor");
}
m_Shape = new int[shape.Length];
m_UnstackedObservationSize = wrapped.ObservationSize();

}
}
/// <inheritdoc/>
public int Write(WriteAdapter adapter)
{
// First, call the wrapped sensor's write method. Make sure to use our own adapter, not the passed one.

m_CurrentIndex = (m_CurrentIndex + 1) % m_NumStackedObservations;
}
/// <inheritdoc/>
/// <inheritdoc/>
/// <inheritdoc/>
/// <inheritdoc/>
public virtual SensorCompressionType GetCompressionType()
{
return SensorCompressionType.None;

7
com.unity.ml-agents/Runtime/SideChannels/EngineConfigurationChannel.cs


/// </summary>
public class EngineConfigurationChannel : SideChannel
{
private const string k_EngineConfigId = "e951342c-4f7e-11ea-b238-784f4387d1f7";
const string k_EngineConfigId = "e951342c-4f7e-11ea-b238-784f4387d1f7";
/// Initializes the side channel.
/// Initializes the side channel. The constructor is internal because only one instance is
/// supported at a time, and is created by the Academy.
public EngineConfigurationChannel()
internal EngineConfigurationChannel()
{
ChannelId = new Guid(k_EngineConfigId);
}

2
com.unity.ml-agents/Runtime/SideChannels/SideChannel.cs


using System.Collections.Generic;
using System;
using System.IO;
using System.Text;
namespace MLAgents.SideChannels
{

8
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


using System.Collections.Generic;
using MLAgents.Sensors;
using MLAgents.Policies;
using MLAgents.SideChannels;
namespace MLAgents.Tests
{

Assert.AreEqual(0, aca.EpisodeCount);
Assert.AreEqual(0, aca.StepCount);
Assert.AreEqual(0, aca.TotalStepCount);
Assert.AreNotEqual(null, aca.FloatProperties);
Assert.AreNotEqual(null, SideChannelUtils.GetSideChannel<FloatPropertiesChannel>());
// Check that Dispose is idempotent
aca.Dispose();

[Test]
public void TestAcademyDispose()
{
var floatProperties1 = Academy.Instance.FloatProperties;
var floatProperties1 = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var floatProperties2 = Academy.Instance.FloatProperties;
Academy.Instance.LazyInitialize();
var floatProperties2 = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
Academy.Instance.Dispose();
Assert.AreNotEqual(floatProperties1, floatProperties2);

30
com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs


}
}
// Simple SensorComponent that sets up a StackingSensor
class StackingComponent : SensorComponent
{
public SensorComponent wrappedComponent;
public int numStacks;
public override ISensor CreateSensor()
{
var wrappedSensor = wrappedComponent.CreateSensor();
return new StackingSensor(wrappedSensor, numStacks);
}
public override int[] GetObservationShape()
{
int[] shape = (int[]) wrappedComponent.GetObservationShape().Clone();
for (var i = 0; i < shape.Length; i++)
{
shape[i] *= numStacks;
}
return shape;
}
}
[Test]
public void CheckSetupAgent()

sensorComponent.sensorName = "ray3d";
sensorComponent.detectableTags = new List<string> { "Player", "Respawn" };
sensorComponent.raysPerDirection = 3;
// Make a StackingSensor that wraps the RayPerceptionSensorComponent3D
// This isn't necessarily practical, just to ensure that it can be done
var wrappingSensorComponent = gameObject.AddComponent<StackingComponent>();
wrappingSensorComponent.wrappedComponent = sensorComponent;
wrappingSensorComponent.numStacks = 3;
// ISensor isn't set up yet.
Assert.IsNull(sensorComponent.raySensor);

16
com.unity.ml-agents/Tests/Editor/SideChannelTests.cs


intSender.SendInt(5);
intSender.SendInt(6);
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
Assert.AreEqual(intReceiver.messagesReceived[0], 4);
Assert.AreEqual(intReceiver.messagesReceived[1], 5);

strSender.SendRawBytes(Encoding.ASCII.GetBytes(str1));
strSender.SendRawBytes(Encoding.ASCII.GetBytes(str2));
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
var messages = strReceiver.GetAndClearReceivedMessages();

tmp = propB.GetPropertyWithDefault(k2, 3.0f);
Assert.AreEqual(tmp, 1.0f);
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
tmp = propA.GetPropertyWithDefault(k2, 3.0f);
Assert.AreEqual(tmp, 1.0f);

Assert.AreEqual(wasCalled, 0);
fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
Assert.AreEqual(wasCalled, 1);
var keysA = propA.ListProperties();

6
config/trainer_config.yaml


time_horizon: 1000
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
Soccer:
normalize: false

num_layers: 2
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
CrawlerStatic:
normalize: true

10
docs/Custom-SideChannels.md


`base.QueueMessageToSend(msg)` method inside the side channel, and call the
`OutgoingMessage.Dispose()` method.
To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
To register a side channel on the Unity side, call `SideChannelUtils.RegisterSideChannel` with the side channel
as only argument.
### Python side

// When a Debug.Log message is created, we send it to the stringChannel
Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
// The channel must be registered with the Academy
Academy.Instance.RegisterSideChannel(stringChannel);
// The channel must be registered with the SideChannelUtils class
SideChannelUtils.RegisterSideChannel(stringChannel);
}
public void OnDestroy()

if (Academy.IsInitialized){
Academy.Instance.UnregisterSideChannel(stringChannel);
SideChannelUtils.UnregisterSideChannel(stringChannel);
}
}

string_log = StringLogChannel()
# We start the communication with the Unity Editor and pass the string_log side channel as input
env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
env = UnityEnvironment(side_channels=[string_log])
env.reset()
string_log.send_string("The environment was reset")
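Pulling the Unity-side fragments above into one place, here is a condensed sketch of the new registration flow. Only the `SideChannelUtils.RegisterSideChannel` / `UnregisterSideChannel` calls, the `Application.logMessageReceived` hookup, and the `Academy.IsInitialized` guard come from the doc; the channel class, its GUID, and the wrapper MonoBehaviour are placeholders for illustration.

```csharp
using System;
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Placeholder custom channel: forwards Unity error logs to Python and logs replies.
public class StringLogSideChannel : SideChannel
{
    public StringLogSideChannel()
    {
        // Illustrative GUID; it must match the channel id used on the Python side.
        ChannelId = new Guid("12345678-1234-1234-1234-123456789abc");
    }

    public override void OnMessageReceived(IncomingMessage msg)
    {
        Debug.Log("From Python: " + msg.ReadString());
    }

    // Signature matches Application.LogCallback so it can be added to logMessageReceived.
    public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
    {
        if (type == LogType.Error)
        {
            using (var msg = new OutgoingMessage())
            {
                msg.WriteString(stackTrace);
                QueueMessageToSend(msg);
            }
        }
    }
}

// Placeholder MonoBehaviour that owns the channel's lifetime.
public class RegisterStringLogSideChannel : MonoBehaviour
{
    StringLogSideChannel stringChannel;

    public void Awake()
    {
        stringChannel = new StringLogSideChannel();
        // When a Debug.Log message is created, we send it to the stringChannel.
        Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
        // The channel must be registered with the SideChannelUtils class.
        SideChannelUtils.RegisterSideChannel(stringChannel);
    }

    public void OnDestroy()
    {
        Application.logMessageReceived -= stringChannel.SendDebugStatementToPython;
        if (Academy.IsInitialized)
        {
            SideChannelUtils.UnregisterSideChannel(stringChannel);
        }
    }
}
```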

23
docs/Getting-Started.md


Depending on your version of Unity, it may be necessary to change the **Scripting Runtime Version** of your project. This can be done as follows:
1. Launch Unity
2. On the Projects dialog, choose the **Open** option at the top of the window.
1. Launch Unity Hub
2. On the Projects dialog, choose the **Add** option at the top of the window.
3. Using the file dialog that opens, locate the `Project` folder
within the ML-Agents toolkit project and click **Open**.
4. Go to **Edit** > **Project Settings** > **Player**

2. Navigate to the folder where you cloned the ML-Agents toolkit repository.
**Note**: If you followed the default [installation](Installation.md), then
you should be able to run `mlagents-learn` from any directory.
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier> --train`
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier>`
training runs
- `--train` tells `mlagents-learn` to run a training session (rather
than inference)
training runs. Make sure to use one that hasn't been used already!
mlagents-learn config/trainer_config.yaml --run-id=firstRun --train
mlagents-learn config/trainer_config.yaml --run-id=firstRun
```
5. When the message _"Start training by pressing the Play button in the Unity

**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.
The `--train` flag tells the ML-Agents toolkit to run in training mode.
The `--time-scale=100` sets the `Time.TimeScale` value in Unity.
**Note**: You can train using an executable rather than the Editor. To do so,

command-line prompt. If you close the window manually, the `.nn` file
containing the trained model is not exported into the ml-agents folder.
You can press Ctrl+C to stop the training, and your trained model will be at
`models/<run-identifier>/<behavior_name>.nn` where
If you've quit the training early using Ctrl+C and want to resume training, run the
same command again, appending the `--resume` flag:
```sh
mlagents-learn config/trainer_config.yaml --run-id=firstRun --resume
```
Your trained model will be at `models/<run-identifier>/<behavior_name>.nn` where
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding to the model.
(**Note:** There is a known bug on Windows that causes the saving of the model to
fail when you early terminate the training, it's recommended to wait until Step

3
docs/Installation.md


By installing the `mlagents` package, the dependencies listed in the
[setup.py file](../ml-agents/setup.py) are also installed. These include
[TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) and
[Jupyter](Background-Jupyter.md).
[TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support).
#### Advanced: Installing for Development

7
docs/Learning-Environment-Create-New.md


includes a convenient Monitor class that you can use to easily display Agent
status information in the Game window.
One additional test you can perform is to first ensure that your environment and
the Python API work as expected using the `notebooks/getting-started.ipynb`
[Jupyter notebook](Background-Jupyter.md). Within the notebook, be sure to set
`env_name` to the name of the environment file you specify when building this
environment.
## Training the Environment

To train in the editor, run the following Python command from a Terminal or Console
window before pressing play:
mlagents-learn config/config.yaml --run-id=RollerBall-1 --train
mlagents-learn config/config.yaml --run-id=RollerBall-1
(where `config.yaml` is a copy of `trainer_config.yaml` that you have edited
to change the `batch_size` and `buffer_size` hyperparameters for your trainer.)

1
docs/Learning-Environment-Design-Agents.md


```csharp
normalizedValue = (currentValue - minValue)/(maxValue - minValue)
```
:warning: For vectors, you should apply the above formula to each component (x, y, and z). Note that this is *not* the same as using the `Vector3.normalized` property or `Vector3.Normalize()` method in Unity (and similar for `Vector2`).
Rotations and angles should also be normalized. For angles between 0 and 360
degrees, you can use the following formulas:

1
docs/Learning-Environment-Examples.md


* Goal:
* Get the ball into the opponent's goal while preventing
the ball from entering own goal.
* Goalie:
* Agents: The environment contains four agents, with the same
Behavior Parameters : Soccer.
* Agent Reward Function (dependent):

8
docs/Learning-Environment-Executable.md


followed the default [installation](Installation.md), then navigate to the
`ml-agents/` folder.
3. Run
`mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train`
`mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>`
Where:
* `<trainer-config-file>` is the file path of the trainer configuration yaml
* `<env_name>` is the name and path to the executable you exported from Unity

* And the `--train` tells `mlagents-learn` to run a training session (rather
than inference)
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun --train
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run --train
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run
▄▄▄▓▓▓▓

17
docs/Migrating.md


## Migrating from 0.15 to latest
### Important changes
* The `--load` and `--train` command-line flags have been deprecated and replaced with `--resume` and `--inference`.
* Running with the same `--run-id` twice will now throw an error.
* The `play_against_current_self_ratio` self-play trainer hyperparameter has been renamed to `play_against_latest_model_ratio`
* Replace the `--load` flag with `--resume` when calling `mlagents-learn`, and don't use the `--train` flag as training
will happen by default. To run without training, use `--inference`.
* To force-overwrite files from a pre-existing run, add the `--force` command-line flag.
* The Jupyter notebooks have been removed from the repository.
* `Academy.FloatProperties` was removed.
* `Academy.RegisterSideChannel` and `Academy.UnregisterSideChannel` were removed.
### Steps to Migrate
* Replace `Academy.FloatProperties` with `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()`.
* Replace `Academy.RegisterSideChannel` with `SideChannelUtils.RegisterSideChannel()`.
* Replace `Academy.UnregisterSideChannel` with `SideChannelUtils.UnregisterSideChannel`.
## Migrating from 0.14 to 0.15

* The interface for SideChannels was changed:
* In C#, `OnMessageReceived` now takes a `IncomingMessage` argument, and `QueueMessageToSend` takes an `OutgoingMessage` argument.
* In python, `on_message_received` now takes a `IncomingMessage` argument, and `queue_message_to_send` takes an `OutgoingMessage` argument.
* Automatic stepping for Academy is now controlled from the AutomaticSteppingEnabled property.
### Steps to Migrate
* Add the `using MLAgents.Sensors;` in addition to `using MLAgents;` on top of your Agent's script.

* We strongly recommend replacing the following methods with their new equivalent as they will be removed in a later release:
* `InitializeAgent()` to `Initialize()`
* `AgentAction()` to `OnActionReceived()`
* `AgentReset()` to `OnEpsiodeBegin()`
* `AgentReset()` to `OnEpisodeBegin()`
* Replace calls to Academy.EnableAutomaticStepping()/DisableAutomaticStepping() with Academy.AutomaticSteppingEnabled = true/false.
## Migrating from 0.13 to 0.14
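As a concrete illustration of the first two migration steps above, here is a before/after sketch for an agent that reads a reset parameter. The API calls are the ones listed in the migration steps; the class, field, parameter name, and the use made of the value are illustrative.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative agent showing the FloatProperties / side channel registration migration.
public class MigratedAgent : Agent
{
    FloatPropertiesChannel m_ResetParams;

    public override void Initialize()
    {
        // Before (0.15.x): m_ResetParams = Academy.Instance.FloatProperties;
        // Before (0.15.x): Academy.Instance.RegisterSideChannel(someCustomChannel);
        // After this change:
        m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
        // SideChannelUtils.RegisterSideChannel(someCustomChannel);
        SetResetParameters();
    }

    void SetResetParameters()
    {
        // "scale" is an illustrative reset parameter name.
        var scale = m_ResetParams.GetPropertyWithDefault("scale", 1.0f);
        transform.localScale = new Vector3(scale, scale, scale);
    }
}
```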

11
docs/Python-API.md


The ML-Agents Toolkit Low Level API is a Python API for controlling the simulation
loop of an environment or game built with Unity. This API is used by the
training algorithms inside the ML-Agent Toolkit, but you can also write your own
Python programs using this API. Go [here](../notebooks/getting-started.ipynb)
for a Jupyter Notebook walking through the functionality of the API.
Python programs using this API.
The key objects in the Python API include:

```python
from mlagents_envs.environment import UnityEnvironment
env = UnityEnvironment(file_name="3DBall", base_port=5005, seed=1, side_channels=[])
env = UnityEnvironment(file_name="3DBall", seed=1, side_channels=[])
```
- `file_name` is the name of the environment binary (located in the root

channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port = UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels = [channel])
env = UnityEnvironment(side_channels=[channel])
channel.set_configuration_parameters(time_scale = 2.0)

channel = FloatPropertiesChannel()
env = UnityEnvironment(base_port = UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels = [channel])
env = UnityEnvironment(side_channels=[channel])
channel.set_property("parameter_1", 2.0)

Once a property has been modified in Python, you can access it in C# after the next call to `step` as follows:
```csharp
var sharedProperties = Academy.Instance.FloatProperties;
var sharedProperties = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
```

1
docs/Readme.md


## Installation & Set-up
* [Installation](Installation.md)
* [Background: Jupyter Notebooks](Background-Jupyter.md)
* [Using Virtual Environment](Using-Virtual-Environment.md)
## Getting Started

4
docs/Training-Curriculum-Learning.md


In order to define the curricula, the first step is to decide which parameters of
the environment will vary. In the case of the Wall Jump environment,
the height of the wall is what varies. We define this as a `Shared Float Property`
that can be accessed in `Academy.Instance.FloatProperties`, and by doing
that can be accessed in `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()`, and by doing
so it becomes adjustable via the Python API.
Rather than adjusting it by hand, we will create a YAML file which
describes the structure of the curricula. Within it, we can specify which

to train agents in the Wall Jump environment with curriculum learning, we can run:
```sh
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall_jump.yaml --run-id=wall-jump-curriculum --train
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall_jump.yaml --run-id=wall-jump-curriculum
```
We can then keep track of the current lessons and progresses via TensorBoard.
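For reference, a short sketch of consuming such a curriculum parameter on the C# side, following the `RegisterCallback` pattern that appears in the GridSettings and ProjectSettingsOverrides hunks above; the parameter name and the reaction to it are illustrative.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative listener: react whenever the curriculum (or Python API) updates a float parameter.
public class CurriculumParameterListener : MonoBehaviour
{
    public void Awake()
    {
        SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()
            .RegisterCallback("big_wall_max_height", height =>
            {
                Debug.Log($"Curriculum set big_wall_max_height to {height}");
            });
    }
}
```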

6
docs/Training-Environment-Parameter-Randomization.md


To enable variations in the environments, we implemented `Environment Parameters`.
`Environment Parameters` are `Academy.Instance.FloatProperties` that can be read when setting
`Environment Parameters` are values in the `FloatPropertiesChannel` that can be read when setting
up the environment. We
also included different sampling methods and the ability to create new kinds of
sampling methods for each `Environment Parameter`. In the 3D ball environment example displayed

environment with a new sample of `Environment Parameters`.
* `Environment Parameter` - Name of the `Environment Parameter` like `mass`, `gravity` and `scale`. This should match the name
specified in the `FloatProperties` of the environment being trained. If a parameter specified in the file doesn't exist in the
specified in the `FloatPropertiesChannel` of the environment being trained. If a parameter specified in the file doesn't exist in the
environment, then this parameter will be ignored. Within each `Environment Parameter`
* `sampler-type` - Specify the sampler type to use for the `Environment Parameter`.

```sh
mlagents-learn config/trainer_config.yaml --sampler=config/3dball_randomize.yaml
--run-id=3D-Ball-randomize --train
--run-id=3D-Ball-randomize
```
We can observe progress and metrics via Tensorboard.

38
docs/Training-ML-Agents.md


The basic command for training is:
```sh
mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train
mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>
```
where

environment you built in step 1:
```sh
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1 --train
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1
```
During a training session, the training program prints out and saves updates at

`models/cob_1/CatsOnBicycles_cob_1.nn`.
While this example used the default training hyperparameters, you can edit the
[training_config.yaml file](#training-config-file) with a text editor to set
[trainer_config.yaml file](#training-config-file) with a text editor to set
To interrupt training and save the current progress, hit Ctrl+C once and wait for the
model to be saved out.
### Loading an Existing Model
If you've quit training early using Ctrl+C, you can resume the training run by running
`mlagents-learn` again, specifying the same `<run-identifier>` and appending the `--resume` flag
to the command.
You can also use this mode to run inference of an already-trained model in Python.
Append both the `--resume` and `--inference` to do this. Note that if you want to run
inference in Unity, you should use the
[Unity Inference Engine](Getting-started#Running-a-pre-trained-model).
If you've already trained a model using the specified `<run-identifier>` and `--resume` is not
specified, you will not be able to continue with training. Use `--force` to force ML-Agents to
overwrite the existing data.
### Command Line Training Options
In addition to passing the path of the Unity executable containing your training

training. Defaults to 0.
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
collect experiences from when training. Defaults to 1.
* `--run-id=<path>`: Specifies an identifier for each training run. This
* `--run-id=<run-identifier>`: Specifies an identifier for each training run. This
identifier is used to name the subdirectories in which the trained model and
summary statistics are saved as well as the saved model itself. The default id
is "ppo". If you use TensorBoard to view the training statistics, always set a

will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs
given to each instance from 0 to `num_envs - 1`. Default is 5005. __Note:__ When
training using the Editor rather than an executable, the base port will be ignored.
* `--train`: Specifies whether to train model or only run in inference mode.
When training, **always** use the `--train` option.
* `--load`: If set, the training code loads an already trained model to
* `--inference`: Specifies whether to only run in inference mode. Omit to train the model.
To load an existing model, specify a run-id and combine with `--resume`.
* `--resume`: If set, the training code loads an already trained model to
training). When not set (the default), the neural network weights are randomly
initialized and an existing model is not loaded.
training). This option only works when the models exist, and have the same behavior names
as the current agents in your scene.
* `--force`: Attempting to train a model with a run-id that has been used before will
throw an error. Use `--force` to force-overwrite this run-id's summary and model data.
* `--no-graphics`: Specify this option to run the Unity executable in
`-batchmode` and doesn't initialize the graphics driver. Use this only if your
training doesn't involve visual observations (reading from Pixels). See

98
docs/Training-Self-Play.md


# Training with Self-Play
ML-Agents provides the functionality to train symmetric, adversarial games with [Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are *equal* in form and function. In reinforcement learning,
this means both agents have the same observation and action spaces.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of itself
to provide a more stable, stationary learning environment. This is compared
to competing against its current self in every episode, which is a constantly changing opponent.
ML-Agents provides the functionality to train both symmetric and asymmetric adversarial games with
[Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are equal in form, function and objective. Examples of symmetric games
are our Tennis and Soccer example environments. In reinforcement learning, this means both agents have the same observation and
action spaces and learn from the same reward function and so *they can share the same policy*. In asymmetric games,
this is not the case. An example of an asymmetric game is Hide and Seek. Agents in these
types of games do not always have the same observation or action spaces and so sharing policy networks is not
necessarily ideal.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of its opponent
(which could be itself as in symmetric games) to provide a more stable, stationary learning environment. This is compared
to competing against the current, best opponent in every episode, which is constantly changing (because it's learning).
However, from the perspective of an individual agent, these scenarios appear to have non-stationary dynamics because the opponent is often changing.
This can cause significant issues in the experience replay mechanism used by SAC. Thus, we recommend that users use PPO. For further reading on
this issue in particular, see the paper [Stabilising Experience Replay for Deep Multi-Agent Reinforcement Learning](https://arxiv.org/pdf/1702.08887.pdf).
For more general information on training with ML-Agents, see [Training ML-Agents](Training-ML-Agents.md).
For more algorithm specific instruction, please see the documentation for [PPO](Training-PPO.md) or [SAC](Training-SAC.md).

See the trainer configuration and agent prefabs for our Tennis environment for an example.
***Team ID must be a non-negative integer (0 or greater).***
In symmetric games, since all agents (even on opposing teams) will share the same policy, they should have the same 'Behavior Name' in their
Behavior Parameters Script. In asymmetric games, they should have a different Behavior Name in their Behavior Parameters script.
Note, in asymmetric games, the agents must have both different Behavior Names *and* different team IDs! Then, specify the trainer configuration
for each Behavior Name in your scene as you would normally, and remember to include the self-play hyperparameter hierarchy!
For examples of how to use this feature, you can see the trainer configurations and agent prefabs for our Tennis and Soccer environments.
Tennis and Soccer provide examples of symmetric games. To train an asymmetric game, specify trainer configurations for each of your behavior names
and include the self-play hyperparameter hierarchy in both.
## Best Practices Training with Self-Play

Training against a set of slowly or unchanging adversaries with low diversity
results in a more stable learning process than training against a set of quickly
changing adversaries with high diversity. With this context, this guide discusses
the exposed self-play hyperparameters and intuitions for tuning them.
## Hyperparameters

### Save Steps
The `save_steps` parameter corresponds to the number of *trainer steps* between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
### Team Change
The `team_change` parameter corresponds to the number of *trainer_steps* between switching the learning team.
This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team
becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar
performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents
per team switch.
A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents,
the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies
and so the agent may fail against the next batch of opponents.
The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we
recommend setting this value as a function of the `save_steps` parameter discussed previously.
Recommended Range : 4x-10x where x=`save_steps`
The `swap_steps` parameter corresponds to the number of *trainer steps* between swapping the opponents policy with a different snapshot. As in the `save_steps` discussion, note that trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
The `swap_steps` parameter corresponds to the number of *ghost steps* (not trainer steps) between swapping the opponents policy with a different snapshot.
A 'ghost step' refers to a step taken by an agent *that is following a fixed policy and not learning*. The reason for this distinction is that in asymmetric games,
we may have teams with an unequal number of agents, e.g. a 2v1 scenario. The team with two agents collects
twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number
of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if
a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents`
agents during `team_change` total steps is:
```
swap_steps = (num_agents / num_opponent_agents) * (team_change / x)
```
As an example, in a 2v1 scenario, if we want the swap to occur `x=4` times during `team_change=200000` steps,
the `swap_steps` for the team of one agent is:
```
swap_steps = (1 / 2) * (200000 / 4) = 25000
```
The `swap_steps` for the team of two agents is:
```
swap_steps = (2 / 1) * (200000 / 4) = 100000
```
Note, with equal team sizes, the first term is equal to 1 and `swap_steps` can be calculated by just dividing the total steps by the desired number of swaps.
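For clarity, here is a minimal Python sketch of this arithmetic. The helper name `compute_swap_steps` is illustrative only and is not part of the ML-Agents API:

```python
def compute_swap_steps(num_agents, num_opponent_agents, team_change, desired_swaps):
    """Compute swap_steps so that a team sees `desired_swaps` opponent swaps per team change."""
    return (num_agents / num_opponent_agents) * (team_change / desired_swaps)

# The 2v1 example above: x=4 swaps during team_change=200000 steps.
print(compute_swap_steps(1, 2, 200000, 4))  # 25000.0 for the single-agent team
print(compute_swap_steps(2, 1, 200000, 4))  # 100000.0 for the two-agent team
```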
### Play against current self ratio
### Play against latest model ratio
The `play_against_current_self_ratio` parameter corresponds to the probability
an agent will play against its ***current*** self. With probability
1 - `play_against_current_self_ratio`, the agent will play against a snapshot of itself
from a past iteration.
The `play_against_latest_model_ratio` parameter corresponds to the probability
an agent will play against the latest opponent policy. With probability
1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its
opponent from a past iteration.
A larger value of `play_against_current_self_ratio` indicates that an agent will be playing against itself more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
Recommended Range : 0.0 - 1.0
Range : 0.0 - 1.0
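A rough sketch of how such a ratio is typically applied when selecting an opponent, mirroring the `np.random.uniform()` check in the ghost trainer later in this changeset; the snapshot-sampling detail is illustrative only:

```python
import numpy as np

play_against_latest_model_ratio = 0.5
if np.random.uniform() < (1 - play_against_latest_model_ratio):
    opponent = "a past snapshot sampled from the saved window"
else:
    opponent = "the latest opponent policy"
print(opponent)
```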
### Window

In adversarial games, the cumulative environment reward may not be a meaningful metric by which to track learning progress. This is because cumulative reward is entirely dependent on the skill of the opponent. An agent at a particular skill level will get more or less reward against a worse or better agent, respectively.
We provide an implementation of the ELO rating system, a method for calculating the relative skill level between two players from a given population in a zero-sum game. For more information on ELO, please see [the ELO wiki](https://en.wikipedia.org/wiki/Elo_rating_system).
In a proper training run, the ELO of the agent should steadily increase. The absolute value of the ELO is less important than the change in ELO over training iterations.
Note, this implementation will support any number of teams, but ELO is only applicable to games with two teams. It is ongoing work to implement
a reliable metric for measuring progress in scenarios with three or more teams. These scenarios can still train, though as of now, reward and qualitative observations
are the only metrics by which we can judge performance.
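For intuition, here is a minimal sketch of the expected-score calculation behind such a rating update, mirroring the form of the `compute_elo_rating_changes` helper that appears later in this changeset; the ratings used are illustrative only:

```python
def expected_score(rating1, rating2):
    """Expected score of player 1 against player 2 under the ELO model."""
    r1 = pow(10, rating1 / 400)
    r2 = pow(10, rating2 / 400)
    return r1 / (r1 + r2)

# A win (result = 1.0) against an equally rated opponent:
result = 1.0
change = result - expected_score(1200.0, 1200.0)
print(change)  # 0.5 -- the winner's ELO rises and the opponent's falls by the same amount
```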

9
docs/Using-Docker.md


-p 5005:5005 \
-p 6006:6006 \
<image-name>:latest \
--docker-target-name=unity-volume \
<trainer-config-file> \
--env=<environment-name> \
--train \

- `source`: Reference to the path in your host OS where you will store the Unity
executable.
- `target`: Tells Docker to mount the `source` path as a disk with this name.
- `docker-target-name`: Tells the ML-Agents Python package the name of the
disk where it can read the Unity executable and store the graph. **This should
therefore be identical to `target`.**
- `trainer-config-file`, `train`, `run-id`: ML-Agents arguments passed to
`mlagents-learn`. `trainer-config-file` is the filename of the trainer config
file, `train` trains the algorithm, and `run-id` is used to tag each

-p 5005:5005 \
-p 6006:6006 \
balance.ball.v0.1:latest 3DBall \
--docker-target-name=unity-volume \
trainer_config.yaml \
--env=3DBall \
/unity-volume/trainer_config.yaml \
--env=/unity-volume/3DBall \
--train \
--run-id=3dball_first_trial
```

7
docs/Using-Tensorboard.md


taken between two observations.
* `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
## Custom Metrics from C#
To get custom metrics from a C# environment into TensorBoard, you can use the StatsSideChannel:
```csharp
var statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
statsSideChannel.AddStat("MyMetric", 1.0);
```

2
gym-unity/README.md


The returned environment `env` will function as a gym.
For more on using the gym interface, see our
[Jupyter Notebook tutorial](../notebooks/getting-started-gym.ipynb).
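Below is a minimal usage sketch, assuming a built environment at `./3DBall` and the `UnityEnv` constructor flags shown in the tests elsewhere in this changeset; the path and flag values are illustrative only:

```python
from gym_unity.envs import UnityEnv

# Illustrative only: the executable path and flags depend on your build.
env = UnityEnv("./3DBall", worker_id=0, use_visual=False)
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```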
## Limitations

34
gym-unity/gym_unity/envs/__init__.py


import logging
import itertools
import numpy as np
from typing import Any, Dict, List, Optional, Tuple, Union

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs import logging_util
class UnityGymException(error.Error):

pass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gym_unity")
logger = logging_util.get_logger(__name__)
logging_util.set_log_level(logging_util.INFO)
GymSingleStepResult = Tuple[np.ndarray, float, bool, Dict]
GymMultiStepResult = Tuple[List[np.ndarray], List[float], List[bool], Dict]

:param no_graphics: Whether to run the Unity simulator in no-graphics mode
:param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
"""
base_port = 5005
base_port = UnityEnvironment.BASE_ENVIRONMENT_PORT
if environment_filename is None:
base_port = UnityEnvironment.DEFAULT_EDITOR_PORT

def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
n_extra_agents = step_result.n_agents() - self._n_agents
if n_extra_agents < 0 or n_extra_agents > self._n_agents:
if n_extra_agents < 0:
# or too many requested a decision
raise UnityGymException(
"The number of agents in the scene does not match the expected number."
)

# only cares about the ordering.
for index, agent_id in enumerate(step_result.agent_id):
if not self._previous_step_result.contains_agent(agent_id):
if step_result.done[index]:
# If the Agent is already done (e.g. it ended its episode twice in one step)
# Don't try to register it here.
continue
# Register this agent, and get the reward of the previous agent that
# was in its index, so that we can return it to the gym.
last_reward = self.agent_mapper.register_new_agent_id(agent_id)

"""
Declare the agent done with the corresponding final reward.
"""
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
if agent_id in self._agent_id_to_gym_index:
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
else:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
"""

self._gym_id_order = list(agent_ids)
def mark_agent_done(self, agent_id: int, reward: float) -> None:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
try:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
except ValueError:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
original_index = self._gym_id_order.index(-1)

48
gym-unity/gym_unity/tests/test_gym.py


assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_new_agent_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=3)
mock_step.agent_id = np.array(range(5))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=True)
received_step_result = create_mock_vector_step_result(num_agents=7)
received_step_result.agent_id = np.array(range(7))
# agent #3 (id = 2) is Done
# so is the "new" agent (id = 5)
done = [False] * 7
done[2] = True
done[5] = True
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id):
assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_single_agent_multiple_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=1)
mock_step.agent_id = np.array(range(1))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=False)
received_step_result = create_mock_vector_step_result(num_agents=3)
received_step_result.agent_id = np.array(range(3))
# original agent (id = 0) is Done
# so is the "new" agent (id = 1)
done = [True, True, False]
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([2], sanitized_result.agent_id):
assert expected_agent_id == agent_id
# Helper methods

# Mark some agents as done with their last rewards.
mapper.mark_agent_done(1001, 42.0)
mapper.mark_agent_done(1004, 1337.0)
# Make sure we can handle an unknown agent id being marked done.
# This can happen when an agent ends an episode on the same step it starts.
mapper.mark_agent_done(9999, -1.0)
# Now add new agents, and get the rewards of the agent they replaced.
old_reward1 = mapper.register_new_agent_id(2001)

2
ml-agents-envs/mlagents_envs/communicator.py


"""
Python side of the communication. Must be used in pair with the right Unity Communicator equivalent.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
"""
def initialize(self, inputs: UnityInputProto) -> UnityOutputProto:

117
ml-agents-envs/mlagents_envs/environment.py


import atexit
import glob
import uuid
import logging
import numpy as np
import os
import subprocess

from mlagents_envs.logging_util import get_logger
from mlagents_envs.side_channel.side_channel import SideChannel, IncomingMessage
from mlagents_envs.base_env import (

import struct
logger = logging.getLogger("mlagents_envs")
logger = get_logger(__name__)
class UnityEnvironment(BaseEnv):

# isn't specified, this port will be used.
DEFAULT_EDITOR_PORT = 5004
# Default base port for environments. Each environment will be offset from this
# by its worker_id.
BASE_ENVIRONMENT_PORT = 5005
# Command line argument used to pass the port to the executable environment.
PORT_COMMAND_LINE_ARG = "--mlagents-port"

worker_id: int = 0,
base_port: int = 5005,
base_port: Optional[int] = None,
docker_training: bool = False,
no_graphics: bool = False,
timeout_wait: int = 60,
args: Optional[List[str]] = None,

:string file_name: Name of Unity environment binary.
:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
:bool docker_training: Informs this class whether the process is being run within a container.
If no environment is specified (i.e. file_name is None), the DEFAULT_EDITOR_PORT will be used.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:bool no_graphics: Whether to run the Unity simulator in no-graphics mode
:int timeout_wait: Time (in seconds) to wait for connection from environment.
:list args: Additional Unity command line arguments

atexit.register(self._close)
# If base port is not specified, use BASE_ENVIRONMENT_PORT if we have
# an environment, otherwise DEFAULT_EDITOR_PORT
if base_port is None:
base_port = (
self.BASE_ENVIRONMENT_PORT if file_name else self.DEFAULT_EDITOR_PORT
)
self.port = base_port + worker_id
self._buffer_size = 12000
# If true, this means the environment was successfully loaded

"the worker-id must be 0 in order to connect with the Editor."
)
if file_name is not None:
self.executable_launcher(file_name, docker_training, no_graphics, args)
self.executable_launcher(file_name, no_graphics, args)
else:
logger.info(
f"Listening on port {self.port}. "

aca_output = self.send_academy_parameters(rl_init_parameters_in)
aca_params = aca_output.rl_initialization_output
except UnityTimeOutException:
self._close()
self._close(0)
self._close()
self._close(0)
raise UnityEnvironmentException(
f"The communication API version is not compatible between Unity and python. "
f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_communicator_version}.\n "

launch_string = candidates[0]
return launch_string
def executable_launcher(self, file_name, docker_training, no_graphics, args):
def executable_launcher(self, file_name, no_graphics, args):
self._close()
self._close(0)
raise UnityEnvironmentException(
f"Couldn't launch the {file_name} environment. Provided filename does not match any environments."
)

if not docker_training:
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [
UnityEnvironment.PORT_COMMAND_LINE_ARG,
str(self.port),
]
subprocess_args += args
try:
self.proc1 = subprocess.Popen(
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
# but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
)
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
else:
# Comments for future maintenance:
# xvfb-run is a wrapper around Xvfb, a virtual xserver where all
# rendering is done to virtual memory. It automatically creates a
# new virtual server, picking a server number via `auto-servernum`.
# The server is passed the arguments using `server-args`; we are telling
# Xvfb to create screen number 0 with width 640, height 480 and depth 24 bits.
# Note that 640 X 480 are the default width and height. The main reason for
# us to add this is because we'd like to change the depth from the default
# of 8 bits to 24.
# Unfortunately, this means that we will need to pass the arguments through
# a shell which is why we set `shell=True`. Now, this adds its own
# complications. E.g. SIGINT can bounce off the shell and not get propagated
# to the child processes. This is why we add `exec`, so that the shell gets
# launched, the arguments are passed to `xvfb-run`. `exec` replaces the shell
# we created with `xvfb`.
#
docker_ls = (
f"exec xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'"
f" {launch_string} {UnityEnvironment.PORT_COMMAND_LINE_ARG} {self.port}"
)
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
subprocess_args += args
try:
docker_ls,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
# but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
def _update_group_specs(self, output: UnityOutputProto) -> None:
init_output = output.rl_initialization_output

else:
raise UnityEnvironmentException("No Unity environment is loaded.")
def _close(self):
def _close(self, timeout: Optional[int] = None) -> None:
"""
Close the communicator and environment subprocess (if necessary).
:int timeout: [Optional] Number of seconds to wait for the environment to shut down before
force-killing it. Defaults to `self.timeout_wait`.
"""
if timeout is None:
timeout = self.timeout_wait
self.proc1.wait(timeout=self.timeout_wait)
self.proc1.wait(timeout=timeout)
signal_name = self.returncode_to_signal_name(self.proc1.returncode)
signal_name = f" ({signal_name})" if signal_name else ""
return_info = f"Environment shut down with return code {self.proc1.returncode}{signal_name}."

3
ml-agents-envs/mlagents_envs/mock_communicator.py


):
"""
Python side of the grpc communication. Python is the client and Unity the server
:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
"""
super().__init__()
self.is_discrete = discrete_action

3
ml-agents-envs/mlagents_envs/rpc_communicator.py


:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:int timeout_wait: Timeout (in seconds) to wait for a response before exiting.
"""
super().__init__(worker_id, base_port)
self.port = base_port + worker_id

4
ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py


from typing import List
import struct
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger(__name__)
logger = get_logger(__name__)
class OutgoingMessage:

4
ml-agents-envs/mlagents_envs/side_channel/side_channel.py


from abc import ABC, abstractmethod
from typing import List
import uuid
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger(__name__)
logger = get_logger(__name__)
class SideChannel(ABC):

23
ml-agents-envs/mlagents_envs/tests/test_envs.py


env.close()
@pytest.mark.parametrize(
"base_port,file_name,expected",
[
# Non-None base port value will always be used
(6001, "foo.exe", 6001),
# No port specified and environment specified, so use BASE_ENVIRONMENT_PORT
(None, "foo.exe", UnityEnvironment.BASE_ENVIRONMENT_PORT),
# No port specified and no environment, so use DEFAULT_EDITOR_PORT
(None, None, UnityEnvironment.DEFAULT_EDITOR_PORT),
],
)
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_port_defaults(
mock_communicator, mock_launcher, base_port, file_name, expected
):
mock_communicator.return_value = MockCommunicator(
discrete_action=False, visual_inputs=0
)
env = UnityEnvironment(file_name=file_name, worker_id=0, base_port=base_port)
assert expected == env.port
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_reset(mock_communicator, mock_launcher):

5
ml-agents/mlagents/model_serialization.py


from distutils.util import strtobool
import os
import logging
from typing import Any, List, Set, NamedTuple
from distutils.version import LooseVersion

from tensorflow.python.platform import gfile
from tensorflow.python.framework import graph_util
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):

logger = get_logger(__name__)
logger = logging.getLogger("mlagents.trainers")
POSSIBLE_INPUT_NODES = frozenset(
[

52
ml-agents/mlagents/trainers/agent_processor.py


import sys
from typing import List, Dict, TypeVar, Generic, Tuple, Set
from typing import List, Dict, TypeVar, Generic, Tuple, Any
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy

for _entropy in take_action_outputs["entropy"]:
self.stats_reporter.add_stat("Policy/Entropy", _entropy)
terminated_agents: Set[str] = set()
# Make unique agent_ids that are global across workers
action_global_agent_ids = [
get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids

stored_take_action_outputs = self.last_take_action_outputs.get(
global_id, None
)
if stored_agent_step is not None and stored_take_action_outputs is not None:
# We know the step is from the same worker, so use the local agent id.
obs = stored_agent_step.obs

traj_queue.put(trajectory)
self.experience_buffers[global_id] = []
if curr_agent_step.done:
# Record episode length for agents which have had at least
# 1 step. Done after reset ignored.
terminated_agents.add(global_id)
elif not curr_agent_step.done:
self.episode_steps[global_id] += 1

batched_step_result.agent_id_to_index[_id],
)
for terminated_id in terminated_agents:
self._clean_agent_data(terminated_id)
# Delete all done agents, regardless of whether they had a 0-length episode.
if curr_agent_step.done:
self._clean_agent_data(global_id)
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

"""
Removes the data for an Agent.
"""
del self.experience_buffers[global_id]
del self.last_take_action_outputs[global_id]
del self.last_step_result[global_id]
del self.episode_steps[global_id]
del self.episode_rewards[global_id]
self._safe_delete(self.experience_buffers, global_id)
self._safe_delete(self.last_take_action_outputs, global_id)
self._safe_delete(self.last_step_result, global_id)
self._safe_delete(self.episode_steps, global_id)
self._safe_delete(self.episode_rewards, global_id)
def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None:
"""
Safely removes data from a dictionary. If the key is not found,
do nothing.
"""
if key in my_dictionary:
del my_dictionary[key]
def publish_trajectory_queue(
self, trajectory_queue: "AgentManagerQueue[Trajectory]"
) -> None:

self.behavior_id
)
self.publish_trajectory_queue(self.trajectory_queue)
def record_environment_stats(
self, env_stats: Dict[str, Tuple[float, StatsAggregationMethod]], worker_id: int
) -> None:
"""
Pass stats from the environment to the StatsReporter.
Depending on the StatsAggregationMethod, either StatsReporter.add_stat or StatsReporter.set_stat is used.
The worker_id is used to determine whether StatsReporter.set_stat should be used.
:param env_stats:
:param worker_id:
:return:
"""
for stat_name, (val, agg_type) in env_stats.items():
if agg_type == StatsAggregationMethod.AVERAGE:
self.stats_reporter.add_stat(stat_name, val)
elif agg_type == StatsAggregationMethod.MOST_RECENT:
# In order to prevent conflicts between multiple environments,
# only stats from the first environment are recorded.
if worker_id == 0:
self.stats_reporter.set_stat(stat_name, val)

52
ml-agents/mlagents/trainers/behavior_id_utils.py


from typing import Dict, NamedTuple
from typing import NamedTuple
from urllib.parse import urlparse, parse_qs
name_behavior_id: str
"""
BehaviorIdentifiers is a named tuple of the identifiers that uniquely distinguish
an agent encountered in the trainer_controller. The named tuple consists of the
fully qualified behavior name, the brain name (which corresponds to the trainer
in the trainer controller) and the team id. In the future, this can be extended
to support further identifiers.
"""
behavior_id: str
behavior_ids: Dict[str, int]
team_id: int
Parses a name_behavior_id of the form name?team=0&param1=i&...
Parses a name_behavior_id of the form name?team=0
This allows you to access the brain name and distinguishing identifiers
without parsing more than once.
This allows you to access the brain name and team id of an agent
ids: Dict[str, int] = {}
if "?" in name_behavior_id:
name, identifiers = name_behavior_id.rsplit("?", 1)
if "&" in identifiers:
list_of_identifiers = identifiers.split("&")
else:
list_of_identifiers = [identifiers]
for identifier in list_of_identifiers:
key, value = identifier.split("=")
ids[key] = int(value)
else:
name = name_behavior_id
parsed = urlparse(name_behavior_id)
name = parsed.path
ids = parse_qs(parsed.query)
team_id: int = 0
if "team" in ids:
team_id = int(ids["team"][0])
name_behavior_id=name_behavior_id, brain_name=name, behavior_ids=ids
behavior_id=name_behavior_id, brain_name=name, team_id=team_id
def create_name_behavior_id(name: str, team_id: int) -> str:
"""
Reconstructs fully qualified behavior name from name and team_id
:param name: brain name
:param team_id: team ID
:return: name_behavior_id
"""
return name + "?team=" + str(team_id)
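# Illustrative usage sketch (not part of the diff): the parsing below only mirrors the
# urlparse/parse_qs logic above; the brain name "Striker" is a made-up example.
from urllib.parse import urlparse, parse_qs
name_behavior_id = "Striker?team=1"
parsed = urlparse(name_behavior_id)
brain_name = parsed.path                      # "Striker"
query = parse_qs(parsed.query)                # {"team": ["1"]}
team_id = int(query["team"][0]) if "team" in query else 0
# Reconstruct the fully qualified name the same way create_name_behavior_id does.
assert brain_name + "?team=" + str(team_id) == name_behavior_id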

5
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


import logging
from typing import Any, Dict, List
from collections import namedtuple
import numpy as np

from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
RewardSignalResult = namedtuple(
"RewardSignalResult", ["scaled_reward", "unscaled_reward"]

4
ml-agents/mlagents/trainers/curriculum.py


from .exception import CurriculumConfigError, CurriculumLoadingError
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class Curriculum:

38
ml-agents/mlagents/trainers/distributions.py


act_size: List[int],
reparameterize: bool = False,
tanh_squash: bool = False,
condition_sigma: bool = True,
log_sigma_min: float = -20,
log_sigma_max: float = 2,
):

:param log_sigma_max: Maximum log standard deviation to clip by.
"""
encoded = self._create_mu_log_sigma(
logits, act_size, log_sigma_min, log_sigma_max
logits,
act_size,
log_sigma_min,
log_sigma_max,
condition_sigma=condition_sigma,
)
self._sampled_policy = self._create_sampled_policy(encoded)
if not reparameterize:

act_size: List[int],
log_sigma_min: float,
log_sigma_max: float,
condition_sigma: bool,
) -> "GaussianDistribution.MuSigmaTensors":
mu = tf.layers.dense(

reuse=tf.AUTO_REUSE,
)
# Policy-dependent log_sigma_sq
log_sigma = tf.layers.dense(
logits,
act_size[0],
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
if condition_sigma:
# Policy-dependent log_sigma_sq
log_sigma = tf.layers.dense(
logits,
act_size[0],
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
else:
log_sigma = tf.get_variable(
"log_std",
[act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
sigma = tf.exp(log_sigma)
return self.MuSigmaTensors(mu, log_sigma, sigma)

self, encoded: "GaussianDistribution.MuSigmaTensors"
) -> tf.Tensor:
single_dim_entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
)
# Make entropy the right shape
return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy

Adjust probabilities for squashed sample before output
"""
probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
return probs
adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON)
return adjusted_probs
@property
def total_log_probs(self) -> tf.Tensor:

15
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import logging
from typing import List, Dict, NamedTuple, Iterable
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class EnvironmentStep(NamedTuple):

environment_stats: Dict[str, Tuple[float, StatsAggregationMethod]]
@property
def name_behavior_ids(self) -> Iterable[AgentGroup]:

def empty(worker_id: int) -> "EnvironmentStep":
return EnvironmentStep({}, worker_id, {})
return EnvironmentStep({}, worker_id, {}, {})
class EnvManager(ABC):

step_info.brain_name_to_action_info.get(
name_behavior_id, ActionInfo.empty()
),
)
self.agent_managers[name_behavior_id].record_environment_stats(
step_info.environment_stats, step_info.worker_id
)
return len(step_infos)

439
ml-agents/mlagents/trainers/ghost/trainer.py


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (Ghost Trainer)
from typing import Deque, Dict, List, Any, cast
from typing import Deque, Dict, List, cast
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.stats import StatsPropertyType
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.behavior_id_utils import (
BehaviorIdentifiers,
create_name_behavior_id,
)
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
"""
The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism.
In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is
"ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps
a standard RL trainer which trains the learning team and ensures that only the trajectories collected
by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used
as the fixed policies when the team is not learning. Like the other trainers, the GhostTrainer is 1:1 with
brain_names and is responsible for one or more teams. Note, a GhostTrainer can have only one team in
asymmetric games where there is only one team with a particular behavior, e.g. Hide and Seek.
The GhostController manages high level coordination between multiple ghost trainers. The learning team id
is cycled throughout a training run.
"""
self, trainer, brain_name, reward_buff_cap, trainer_parameters, training, run_id
self,
trainer,
brain_name,
controller,
reward_buff_cap,
trainer_parameters,
training,
run_id,
Responsible for collecting experiences and training trainer model via self_play.
Creates a GhostTrainer.
:param controller: GhostController that coordinates all ghost trainers and calculates ELO
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param training: Whether the trainer is set for training.

)
self.trainer = trainer
self.controller = controller
self.internal_policy_queues: List[AgentManagerQueue[Policy]] = []
self.internal_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.ignored_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.learning_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._team_to_name_to_policy_queue: Dict[
int, Dict[str, AgentManagerQueue[Policy]]
] = {}
self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}
# assign ghost's stats collection to wrapped trainer's
self._stats_reporter = self.trainer.stats_reporter

self_play_parameters = trainer_parameters["self_play"]
self.window = self_play_parameters.get("window", 10)
self.play_against_current_self_ratio = self_play_parameters.get(
"play_against_current_self_ratio", 0.5
self.play_against_latest_model_ratio = self_play_parameters.get(
"play_against_latest_model_ratio", 0.5
if (
self.play_against_latest_model_ratio > 1.0
or self.play_against_latest_model_ratio < 0.0
):
logger.warning(
"The play_against_latest_model_ratio is not between 0 and 1."
)
self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \
opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \
asymmetric game.".format(
self.brain_name
)
)
# Counts the number of steps taken by the ghost policies. Snapshot swapping
# depends on this counter whereas snapshot saving and team switching depend
# on the wrapped trainer's step count. This ensures that all teams train for the same number of trainer
# steps.
self.ghost_step: int = 0
# A list of dicts from brain name to a single snapshot for this trainer's policies
self.policy_snapshots: List[Dict[str, List[float]]] = []
# A dict from brain name to the current snapshot of this trainer's policies
self.current_policy_snapshot: Dict[str, List[float]] = {}
self.snapshot_counter: int = 0
self.policy_snapshots: List[Any] = []
self.snapshot_counter: int = 0
self.learning_behavior_name: str = None
self.current_policy_snapshot = None
self.last_save = 0
self.last_swap = 0
# wrapped_training_team and learning team need to be separate
# in the situation where new agents are created or destroyed
# after the learning team switches. These agents need to be added
# to trainers properly.
self._learning_team: int = None
self.wrapped_trainer_team: int = None
self.last_save: int = 0
self.last_swap: int = 0
self.last_team_change: int = 0
self.current_elo: float = self.initial_elo
self.policy_elos: List[float] = [self.initial_elo] * (
self.window + 1
) # for learning policy

def get_step(self) -> int:
"""
Returns the number of steps the trainer has performed
:return: the step count of the trainer
Returns the number of steps the wrapped trainer has performed
:return: the step count of the wrapped trainer
"""
return self.trainer.get_step

"""
return self.trainer.reward_buffer
@property
def current_elo(self) -> float:
"""
Gets ELO of current policy which is always last in the list
:return: ELO of current policy
"""
return self.policy_elos[-1]
def change_current_elo(self, change: float) -> None:
"""
Changes elo of current policy which is always last in the list
:param change: Amount to change current elo by
"""
self.policy_elos[-1] += change
def get_opponent_elo(self) -> float:
"""
Get elo of current opponent policy
:return: ELO of current opponent policy
"""
return self.policy_elos[self.current_opponent]
def change_opponent_elo(self, change: float) -> None:
"""
Changes elo of current opponent policy
:param change: Amount to change current opponent elo by
"""
self.policy_elos[self.current_opponent] -= change
if trajectory.done_reached and not trajectory.max_step_reached:
# Assumption is that final reward is 1/.5/0 for win/draw/loss
"""
Determines the final result of an episode and asks the GhostController
to calculate the ELO change. The GhostController changes the ELO
of the opponent policy since this may be in a different GhostTrainer
i.e. in asymmetric games. We assume the last reward determines the winner.
:param trajectory: Trajectory.
"""
if trajectory.done_reached:
# Assumption is that final reward is >0/0/<0 for win/draw/loss
final_reward = trajectory.steps[-1].reward
result = 0.5
if final_reward > 0:

change = compute_elo_rating_changes(
self.current_elo, self.policy_elos[self.current_opponent], result
change = self.controller.compute_elo_rating_changes(
self.current_elo, result
self.current_elo += change
self.policy_elos[self.current_opponent] -= change
opponents = np.array(self.policy_elos, dtype=np.float32)
self.change_current_elo(change)
self._stats_reporter.add_stat(
"Self-play/Mean Opponent ELO", opponents.mean()
)
self._stats_reporter.add_stat("Self-play/Std Opponent ELO", opponents.std())
for traj_queue, internal_traj_queue in zip(
self.trajectory_queues, self.internal_trajectory_queues
):
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
for trajectory_queue in self.trajectory_queues:
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self._learning_team:
# With a future multiagent trainer, this will be indexed by 'role'
internal_trajectory_queue = self._internal_trajectory_queues[
parsed_behavior_id.brain_name
]
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# adds to wrapped trainers queue
internal_trajectory_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
else:
# Dump trajectories from non-learning policy
try:
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# count ghost steps
self.ghost_step += len(t.steps)
except AgentManagerQueue.Empty:
pass
self.trainer.advance()
if self.get_step - self.last_team_change > self.steps_to_train_team:
self.controller.change_training_team(self.get_step)
self.last_team_change = self.get_step
self.trainer.advance()
next_learning_team = self.controller.get_learning_team
for internal_q in self.internal_policy_queues:
# Get policies that correspond to the policy queue in question
# CASE 1: Current learning team is managed by this GhostTrainer.
# If the learning team changes, the following loop over queues will push the
# new policy into the policy queue for the new learning agent if
# that policy is managed by this GhostTrainer. Otherwise, it will save the current snapshot.
# CASE 2: Current learning team is managed by a different GhostTrainer.
# If the learning team changes to a team managed by this GhostTrainer, this loop
# will push the current_snapshot into the correct queue. Otherwise,
# it will continue skipping and swap_snapshot will continue to handle
# pushing fixed snapshots
# Case 3: No team change. The if statement just continues to push the policy
# into the correct queue (or not if not learning team).
for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
policy = cast(TFPolicy, internal_q.get_nowait())
self.current_policy_snapshot = policy.get_weights()
self.learning_policy_queues[internal_q.behavior_id].put(policy)
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
self.current_policy_snapshot[brain_name] = policy.get_weights()
if next_learning_team in self._team_to_name_to_policy_queue:
name_to_policy_queue = self._team_to_name_to_policy_queue[
next_learning_team
]
if brain_name in name_to_policy_queue:
behavior_id = create_name_behavior_id(
brain_name, next_learning_team
)
policy = self.get_policy(behavior_id)
policy.load_weights(self.current_policy_snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
# Note save and swap should be on different step counters.
# We don't want to save unless the policy is learning.
self._save_snapshot(self.trainer.policy)
self._save_snapshot()
if self.get_step - self.last_swap > self.steps_between_swap:
if (
self._learning_team != next_learning_team
or self.ghost_step - self.last_swap > self.steps_between_swap
):
self._learning_team = next_learning_team
self.last_swap = self.get_step
# Dump trajectories from non-learning policy
for traj_queue in self.ignored_trajectory_queues:
try:
for _ in range(traj_queue.maxlen):
traj_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
self.last_swap = self.ghost_step
"""
Forwarding call to wrapped trainers end_episode
"""
"""
Forwarding call to wrapped trainers save_model
"""
self.trainer.export_model(name_behavior_id)
"""
Forwarding call to wrapped trainers export_model.
First loads the current snapshot.
"""
parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
brain_name = parsed_behavior_id.brain_name
policy = self.trainer.get_policy(brain_name)
policy.load_weights(self.current_policy_snapshot[brain_name])
self.trainer.export_model(brain_name)
"""
Creates policy with the wrapped trainer's create_policy function
"""
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
Adds policy to trainer. For the first policy added, add a trainer
to the policy and set the learning behavior name to name_behavior_id.
Adds policy to trainer. The first policy encountered sets the wrapped
trainer team. This is to ensure that all agents from the same multi-agent
team are grouped. All policies associated with this team are added to the
wrapped trainer to be trained.
name_behavior_id = parsed_behavior_id.behavior_id
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)
# First policy encountered
if not self.learning_behavior_name:
weights = policy.get_weights()
self.current_policy_snapshot = weights
self.trainer.add_policy(name_behavior_id, policy)
self._save_snapshot(policy) # Need to save after trainer initializes policy
self.learning_behavior_name = name_behavior_id
behavior_id_parsed = BehaviorIdentifiers.from_name_behavior_id(
self.learning_behavior_name
)
team_id = behavior_id_parsed.behavior_ids["team"]
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM, team_id)
else:
# for saving/swapping snapshots
policy.init_load_weights()
self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
# for saving/swapping snapshots
policy.init_load_weights()
# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
self.current_policy_snapshot[
parsed_behavior_id.brain_name
] = policy.get_weights()
self._save_snapshot() # Need to save after trainer initializes policy
self.trainer.add_policy(parsed_behavior_id, policy)
self._learning_team = self.controller.get_learning_team
self.wrapped_trainer_team = team_id
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name
:return: Policy associated with name_behavior_id
"""
def _save_snapshot(self, policy: TFPolicy) -> None:
weights = policy.get_weights()
try:
self.policy_snapshots[self.snapshot_counter] = weights
except IndexError:
self.policy_snapshots.append(weights)
def _save_snapshot(self) -> None:
"""
Saves a snapshot of the current weights of the policy and maintains the policy_snapshots
according to the window size
"""
for brain_name in self.current_policy_snapshot:
current_snapshot_for_brain_name = self.current_policy_snapshot[brain_name]
try:
self.policy_snapshots[self.snapshot_counter][
brain_name
] = current_snapshot_for_brain_name
except IndexError:
self.policy_snapshots.append(
{brain_name: current_snapshot_for_brain_name}
)
for q in self.policy_queues:
name_behavior_id = q.behavior_id
# here is the place for a sampling protocol
if name_behavior_id == self.learning_behavior_name:
"""
Swaps the appropriate weight to the policy and pushes it to respective policy queues
"""
for team_id in self._team_to_name_to_policy_queue:
if team_id == self._learning_team:
elif np.random.uniform() < (1 - self.play_against_current_self_ratio):
elif np.random.uniform() < (1 - self.play_against_latest_model_ratio):
self.policy_elos[-1] = self.current_elo
logger.debug(
"Step {}: Swapping snapshot {} to id {} with {} learning".format(
self.get_step, x, name_behavior_id, self.learning_behavior_name
name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
for brain_name in self._team_to_name_to_policy_queue[team_id]:
behavior_id = create_name_behavior_id(brain_name, team_id)
policy = self.get_policy(behavior_id)
policy.load_weights(snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
logger.debug(
"Step {}: Swapping snapshot {} to id {} with team {} learning".format(
self.ghost_step, x, behavior_id, self._learning_team
)
)
policy = self.get_policy(name_behavior_id)
policy.load_weights(snapshot)
q.put(policy)
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer
makes a policy update. Creates an internal policy queue for the wrapped
trainer to push to. The GhostTrainer pushes all policies to the env.
if policy_queue.behavior_id == self.learning_behavior_name:
parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
try:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][
parsed_behavior_id.brain_name
] = policy_queue
except KeyError:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id] = {
parsed_behavior_id.brain_name: policy_queue
}
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
policy_queue.behavior_id
parsed_behavior_id.brain_name
self.internal_policy_queues.append(internal_policy_queue)
self.learning_policy_queues[policy_queue.behavior_id] = policy_queue
self._internal_policy_queues[
parsed_behavior_id.brain_name
] = internal_policy_queue
self.trainer.publish_policy_queue(internal_policy_queue)
def subscribe_trajectory_queue(

Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from.
Adds a trajectory queue for every member of the team to the list of queues for the trainer
to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from
the learning team. The wrapped trainer subscribes to this queue.
if trajectory_queue.behavior_id == self.learning_behavior_name:
super().subscribe_trajectory_queue(trajectory_queue)
super().subscribe_trajectory_queue(trajectory_queue)
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
] = AgentManagerQueue(trajectory_queue.behavior_id)
] = AgentManagerQueue(parsed_behavior_id.brain_name)
self.internal_trajectory_queues.append(internal_trajectory_queue)
self._internal_trajectory_queues[
parsed_behavior_id.brain_name
] = internal_trajectory_queue
else:
self.ignored_trajectory_queues.append(trajectory_queue)
# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
# ELO calculation
def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
r1 = pow(10, rating1 / 400)
r2 = pow(10, rating2 / 400)
summed = r1 + r2
e1 = r1 / summed
change = result - e1
return change

145
ml-agents/mlagents/trainers/learn.py


# # Unity ML-Agents Toolkit
import logging
import glob
import shutil
import numpy as np
import json

from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import load_config, TrainerFactory
from mlagents.trainers.trainer_util import (
load_config,
TrainerFactory,
handle_existing_directories,
)
from mlagents.trainers.stats import (
TensorboardWriter,
CSVWriter,

from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.timers import hierarchical_timer, get_timer_tree
from mlagents.logging_util import create_logger
from mlagents_envs import logging_util
logger = logging_util.get_logger(__name__)
def _create_parser():

default=False,
dest="load_model",
action="store_true",
help="Whether to load the model or randomly initialize",
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(
"--resume",
default=False,
dest="resume",
action="store_true",
help="Resumes training from a checkpoint. Specify a --run-id to use this option.",
)
argparser.add_argument(
"--force",
default=False,
dest="force",
action="store_true",
help="Force-overwrite existing models and summaries for a run-id that has been used "
"before.",
)
argparser.add_argument(
"--run-id",

default=False,
dest="train_model",
action="store_true",
help="Whether to train model, or only run inference",
help=argparse.SUPPRESS,
)
argparser.add_argument(
"--inference",
default=False,
dest="inference",
action="store_true",
help="Run in Python inference mode (don't train). Use with --resume to load a model trained with an "
"existing run-id.",
default=5005,
default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
type=int,
help="Base port for environment communication",
)

type=int,
help="Number of parallel environments to use for training",
)
argparser.add_argument(
"--docker-target-name",
default=None,
dest="docker_target_name",
help="Docker volume to store training-specific files",
)
argparser.add_argument(
"--no-graphics",

env_path: Optional[str] = parser.get_default("env_path")
run_id: str = parser.get_default("run_id")
load_model: bool = parser.get_default("load_model")
resume: bool = parser.get_default("resume")
force: bool = parser.get_default("force")
inference: bool = parser.get_default("inference")
save_freq: int = parser.get_default("save_freq")
keep_checkpoints: int = parser.get_default("keep_checkpoints")
base_port: int = parser.get_default("base_port")

no_graphics: bool = parser.get_default("no_graphics")
multi_gpu: bool = parser.get_default("multi_gpu")
sampler_config: Optional[Dict] = None
docker_target_name: Optional[str] = parser.get_default("docker_target_name")
env_args: Optional[List[str]] = parser.get_default("env_args")
cpu: bool = parser.get_default("cpu")
width: int = parser.get_default("width")

configs loaded from files.
"""
argparse_args = vars(args)
docker_target_name = argparse_args["docker_target_name"]
if docker_target_name is not None:
trainer_config_path = f"/{docker_target_name}/{trainer_config_path}"
if curriculum_config_path is not None:
curriculum_config_path = (
f"/{docker_target_name}/{curriculum_config_path}"
)
argparse_args["trainer_config"] = load_config(trainer_config_path)
if curriculum_config_path is not None:
argparse_args["curriculum_config"] = load_config(curriculum_config_path)

)
# Keep deprecated --load working, TODO: remove
argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]
# Since argparse accepts file paths in the config options which don't exist in CommandLineOptions,
# these keys will need to be deleted to use the **/splat operator below.
argparse_args.pop("sampler_file_path")

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
# Recognize and use docker volume if one is passed as an argument
if not options.docker_target_name:
model_path = f"./models/{options.run_id}"
summaries_dir = "./summaries"
else:
model_path = f"/{options.docker_target_name}/models/{options.run_id}"
summaries_dir = f"/{options.docker_target_name}/summaries"
model_path = f"./models/{options.run_id}"
summaries_dir = "./summaries"
port = options.base_port
# Configure CSV, Tensorboard Writers and StatsReporter

"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(summaries_dir)
handle_existing_directories(
model_path, summaries_dir, options.resume, options.force
)
tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
port = UnityEnvironment.DEFAULT_EDITOR_PORT
env_factory = create_environment_factory(
options.env_path,
options.docker_target_name,
options.no_graphics,
run_seed,
port,
options.env_args,
options.env_path, options.no_graphics, run_seed, port, options.env_args
)
engine_config = EngineConfig(
options.width,

options.run_id,
model_path,
options.keep_checkpoints,
options.train_model,
options.load_model,
not options.inference,
options.resume,
run_seed,
maybe_meta_curriculum,
options.multi_gpu,

options.run_id,
options.save_freq,
maybe_meta_curriculum,
options.train_model,
not options.inference,
run_seed,
sampler_manager,
resampling_interval,

with open(timing_path, "w") as f:
json.dump(get_timer_tree(), f, indent=4)
except FileNotFoundError:
logging.warning(
logger.warning(
f"Unable to save to {timing_path}. Make sure the directory exists"
)

return meta_curriculum
def prepare_for_docker_run(docker_target_name, env_path):
for f in glob.glob(
"/{docker_target_name}/*".format(docker_target_name=docker_target_name)
):
if env_path in f:
try:
b = os.path.basename(f)
if os.path.isdir(f):
shutil.copytree(f, "/ml-agents/{b}".format(b=b))
else:
src_f = "/{docker_target_name}/{b}".format(
docker_target_name=docker_target_name, b=b
)
dst_f = "/ml-agents/{b}".format(b=b)
shutil.copyfile(src_f, dst_f)
os.chmod(dst_f, 0o775) # Make executable
except Exception as e:
logging.getLogger("mlagents.trainers").info(e)
env_path = "/ml-agents/{env_path}".format(env_path=env_path)
return env_path
docker_target_name: Optional[str],
no_graphics: bool,
seed: int,
start_port: int,

raise UnityEnvironmentException(
f"Couldn't launch the {env_path} environment. Provided filename does not match any environments."
)
docker_training = docker_target_name is not None
if docker_training and env_path is not None:
# Comments for future maintenance:
# Some OS/VM instances (e.g. the COS GCP image) mount filesystems
# with the COS flag, which prevents execution of the Unity scene;
# to get around this, we copy the executable into the
# container.
# Navigate in docker path and find env_path and copy it.
env_path = prepare_for_docker_run(docker_target_name, env_path)
def create_unity_environment(
worker_id: int, side_channels: List[SideChannel]

file_name=env_path,
worker_id=worker_id,
seed=env_seed,
docker_training=docker_training,
no_graphics=no_graphics,
base_port=start_port,
args=env_args,

print(get_version_string())
if options.debug:
log_level = logging.DEBUG
log_level = logging_util.DEBUG
log_level = logging.INFO
log_level = logging_util.INFO
trainer_logger = create_logger("mlagents.trainers", log_level)
logging_util.set_log_level(log_level)
logger.debug("Configuration for this run:")
logger.debug(json.dumps(options._asdict(), indent=4))
trainer_logger.debug("Configuration for this run:")
trainer_logger.debug(json.dumps(options._asdict(), indent=4))
# Options deprecation warnings
if options.load_model:
logger.warning(
"The --load option has been deprecated. Please use the --resume option instead."
)
if options.train_model:
logger.warning(
"The --train option has been deprecated. Train mode is now the default. Use "
"--inference to run in inference mode."
)
run_seed = options.seed
if options.cpu:

4
ml-agents/mlagents/trainers/meta_curriculum.py


from typing import Dict, Set
from mlagents.trainers.curriculum import Curriculum
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class MetaCurriculum:

1
ml-agents/mlagents/trainers/policy/nn_policy.py


self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:

34
ml-agents/mlagents/trainers/policy/tf_policy.py


import logging
from typing import Any, Dict, List, Optional
import abc
import numpy as np

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import SplitObservations

logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class UnityPolicyException(UnityException):

logger.info("Loading Model for brain {}".format(self.brain.brain_name))
ckpt = tf.train.get_checkpoint_state(self.model_path)
if ckpt is None:
logger.info(
"The model {0} could not be found. Make "
raise UnityPolicyException(
"The model {0} could not be loaded. Make "
"--run-id".format(self.model_path)
"--run-id. and that the previous run you are resuming from had the same "
"behavior names.".format(self.model_path)
)
self.saver.restore(self.sess, ckpt.model_checkpoint_path)

self.assign_ops.append(tf.assign(var, assign_ph))
def load_weights(self, values):
if len(self.assign_ops) == 0:
logger.warning(
"Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
)
with self.graph.as_default():
feed_dict = {}
for assign_ph, value in zip(self.assign_phs, values):

"""
if batched_step_result.n_agents() == 0:
return ActionInfo.empty()
agents_done = [
agent
for agent, done in zip(
batched_step_result.agent_id, batched_step_result.done
)
if done
]
self.remove_memories(agents_done)
self.remove_previous_action(agents_done)
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))

def create_input_placeholders(self):
with self.graph.as_default():
self.global_step, self.increment_step_op, self.steps_to_increment = (
ModelUtils.create_global_steps()
)
(
self.global_step,
self.increment_step_op,
self.steps_to_increment,
) = ModelUtils.create_global_steps()
self.visual_in = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)

11
ml-agents/mlagents/trainers/ppo/trainer.py


# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters

from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class PPOTrainer(RLTrainer):

return policy
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
:param name_behavior_id: Behavior ID that the policy should belong to.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
if self.policy:

5
ml-agents/mlagents/trainers/sac/optimizer.py


import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer

EPSILON = 1e-6 # Small value to avoid divide by zero
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
POLICY_SCOPE = ""
TARGET_SCOPE = "target_network"

"q1_loss": self.q1_loss,
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
"entropy": self.policy.entropy,
"update_batch": self.update_batch_policy,
"update_value": self.update_batch_value,
"update_entropy": self.update_batch_entropy,

11
ml-agents/mlagents/trainers/sac/trainer.py


# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290
# and implemented in https://github.com/hill-a/stable-baselines
import logging
from collections import defaultdict
from typing import Dict
import os

from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy

from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
BUFFER_TRUNCATE_PERCENT = 0.8

"memory_size",
"model_path",
"reward_signals",
"vis_encode_type",
]
self._check_param_keys()

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
:param brain_parameters: specifications for policy construction

6
ml-agents/mlagents/trainers/simple_env_manager.py


self.env.step()
all_step_result = self._generate_all_results()
step_info = EnvironmentStep(all_step_result, 0, self.previous_all_action_info)
step_info = EnvironmentStep(
all_step_result, 0, self.previous_all_action_info, {}
)
self.previous_step = step_info
return [step_info]

self.shared_float_properties.set_property(k, v)
self.env.reset()
all_step_result = self._generate_all_results()
self.previous_step = EnvironmentStep(all_step_result, 0, {})
self.previous_step = EnvironmentStep(all_step_result, 0, {}, {})
return [self.previous_step]
@property

47
ml-agents/mlagents/trainers/stats.py


import csv
import os
import time
import logging
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class StatsSummary(NamedTuple):

class StatsPropertyType(Enum):
HYPERPARAMETERS = "hyperparameters"
SELF_PLAY = "selfplay"
SELF_PLAY_TEAM = "selfplayteam"
class StatsWriter(abc.ABC):

)
if self.self_play and "Self-play/ELO" in values:
elo_stats = values["Self-play/ELO"]
mean_opponent_elo = values["Self-play/Mean Opponent ELO"]
std_opponent_elo = values["Self-play/Std Opponent ELO"]
logger.info(
"{} Team {}: ELO: {:0.3f}. "
"Mean Opponent ELO: {:0.3f}. "
"Std Opponent ELO: {:0.3f}. ".format(
category,
self.self_play_team,
elo_stats.mean,
mean_opponent_elo.mean,
std_opponent_elo.mean,
)
)
logger.info("{} ELO: {:0.3f}. ".format(category, elo_stats.mean))
else:
logger.info(
"{}: Step: {}. No episode was completed since last summary. {}".format(

elif property_type == StatsPropertyType.SELF_PLAY:
assert isinstance(value, bool)
self.self_play = value
elif property_type == StatsPropertyType.SELF_PLAY_TEAM:
assert isinstance(value, int)
self.self_play_team = value
def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""

class TensorboardWriter(StatsWriter):
def __init__(self, base_dir: str):
def __init__(self, base_dir: str, clear_past_data: bool = False):
:param clear_past_data: Whether or not to clean up existing Tensorboard files associated with the base_dir and
category.
self._clear_past_data = clear_past_data
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int

basedir=self.base_dir, category=category
)
os.makedirs(filewriter_dir, exist_ok=True)
if self._clear_past_data:
self._delete_all_events_files(filewriter_dir)
def _delete_all_events_files(self, directory_name: str) -> None:
for file_name in os.listdir(directory_name):
if file_name.startswith("events.out"):
logger.warning(
"{} was left over from a previous run. Deleting.".format(file_name)
)
full_fname = os.path.join(directory_name, file_name)
try:
os.remove(full_fname)
except OSError:
logger.warning(
"{} was left over from a previous run and "
"not deleted.".format(full_fname)
)
def add_property(
self, category: str, property_type: StatsPropertyType, value: Any

124
ml-agents/mlagents/trainers/subprocess_env_manager.py


import logging
from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set
from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set, Tuple
import enum
from mlagents_envs.exception import UnityCommunicationException, UnityTimeOutException
from mlagents_envs.exception import (
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
)
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents_envs.timers import (
TimerNode,

EngineConfigurationChannel,
EngineConfig,
)
from mlagents_envs.side_channel.stats_side_channel import (
StatsSideChannel,
StatsAggregationMethod,
)
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class EnvironmentCommand(enum.Enum):
STEP = 1
EXTERNAL_BRAINS = 2
GET_PROPERTIES = 3
RESET = 4
CLOSE = 5
ENV_EXITED = 6
class EnvironmentCommand(NamedTuple):
name: str
class EnvironmentRequest(NamedTuple):
cmd: EnvironmentCommand
name: str
cmd: EnvironmentCommand
worker_id: int
payload: Any

timer_root: Optional[TimerNode]
environment_stats: Dict[str, Tuple[float, StatsAggregationMethod]]
class UnityEnvWorker:

self.previous_all_action_info: Dict[str, ActionInfo] = {}
self.waiting = False
def send(self, name: str, payload: Any = None) -> None:
def send(self, cmd: EnvironmentCommand, payload: Any = None) -> None:
cmd = EnvironmentCommand(name, payload)
self.conn.send(cmd)
req = EnvironmentRequest(cmd, payload)
self.conn.send(req)
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: send failed.")

if response.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = response.payload
raise env_exception
return response
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: recv failed.")

self.conn.send(EnvironmentCommand("close"))
self.conn.send(EnvironmentRequest(EnvironmentCommand.CLOSE))
except (BrokenPipeError, EOFError):
logger.debug(
f"UnityEnvWorker {self.worker_id} got exception trying to close."

shared_float_properties = FloatPropertiesChannel()
engine_configuration_channel = EngineConfigurationChannel()
engine_configuration_channel.set_configuration(engine_configuration)
env: BaseEnv = env_factory(
worker_id, [shared_float_properties, engine_configuration_channel]
)
stats_channel = StatsSideChannel()
env: BaseEnv = None
def _send_response(cmd_name, payload):
def _send_response(cmd_name: EnvironmentCommand, payload: Any) -> None:
parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload))
def _generate_all_results() -> AllStepResult:

return result
try:
env = env_factory(
worker_id,
[shared_float_properties, engine_configuration_channel, stats_channel],
)
cmd: EnvironmentCommand = parent_conn.recv()
if cmd.name == "step":
all_action_info = cmd.payload
req: EnvironmentRequest = parent_conn.recv()
if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)

# Note that we could randomly return timers a fraction of the time if we wanted to reduce
# the data transferred.
# TODO get gauges from the workers and merge them in the main process too.
step_response = StepResponse(all_step_result, get_timer_root())
step_queue.put(EnvironmentResponse("step", worker_id, step_response))
env_stats = stats_channel.get_and_reset_stats()
step_response = StepResponse(
all_step_result, get_timer_root(), env_stats
)
step_queue.put(
EnvironmentResponse(
EnvironmentCommand.STEP, worker_id, step_response
)
)
elif cmd.name == "external_brains":
_send_response("external_brains", external_brains())
elif cmd.name == "get_properties":
elif req.cmd == EnvironmentCommand.EXTERNAL_BRAINS:
_send_response(EnvironmentCommand.EXTERNAL_BRAINS, external_brains())
elif req.cmd == EnvironmentCommand.GET_PROPERTIES:
_send_response("get_properties", reset_params)
elif cmd.name == "reset":
for k, v in cmd.payload.items():
_send_response(EnvironmentCommand.GET_PROPERTIES, reset_params)
elif req.cmd == EnvironmentCommand.RESET:
for k, v in req.payload.items():
_send_response("reset", all_step_result)
elif cmd.name == "close":
_send_response(EnvironmentCommand.RESET, all_step_result)
elif req.cmd == EnvironmentCommand.CLOSE:
except (KeyboardInterrupt, UnityCommunicationException, UnityTimeOutException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
) as ex:
step_queue.put(EnvironmentResponse("env_close", worker_id, None))
step_queue.put(
EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex)
)
_send_response(EnvironmentCommand.ENV_EXITED, ex)
finally:
# If this worker has put an item in the step queue that hasn't been processed by the EnvManager, the process
# will hang until the item is processed. We avoid this behavior by using Queue.cancel_join_thread()

step_queue.cancel_join_thread()
step_queue.close()
env.close()
if env is not None:
env.close()
logger.debug(f"UnityEnvironment worker {worker_id} done.")

if not env_worker.waiting:
env_action_info = self._take_step(env_worker.previous_step)
env_worker.previous_all_action_info = env_action_info
env_worker.send("step", env_action_info)
env_worker.send(EnvironmentCommand.STEP, env_action_info)
env_worker.waiting = True
def _step(self) -> List[EnvironmentStep]:

while len(worker_steps) < 1:
try:
while True:
step = self.step_queue.get_nowait()
if step.name == "env_close":
raise UnityCommunicationException(
"At least one of the environments has closed."
)
step: EnvironmentResponse = self.step_queue.get_nowait()
if step.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = step.payload
raise env_exception
self.env_workers[step.worker_id].waiting = False
if step.worker_id not in step_workers:
worker_steps.append(step)

self.env_workers[step.worker_id].waiting = False
# First enqueue reset commands for all workers so that they reset in parallel
for ew in self.env_workers:
ew.send("reset", config)
ew.send(EnvironmentCommand.RESET, config)
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {})
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {}, {})
self.env_workers[0].send("external_brains")
self.env_workers[0].send(EnvironmentCommand.EXTERNAL_BRAINS)
self.env_workers[0].send("get_properties")
self.env_workers[0].send(EnvironmentCommand.GET_PROPERTIES)
return self.env_workers[0].recv().payload
def close(self) -> None:

payload.all_step_result,
step.worker_id,
env_worker.previous_all_action_info,
payload.environment_stats,
)
step_infos.append(new_step)
env_worker.previous_step = new_step
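# The hunks above replace string command names with the EnvironmentCommand enum and typed
# EnvironmentRequest/EnvironmentResponse tuples. A self-contained sketch of that pattern
# (names mirror the diff, but this is an illustration, not the actual module):
import enum
from typing import Any, NamedTuple

class Command(enum.Enum):
    STEP = 1
    RESET = 4
    CLOSE = 5

class Request(NamedTuple):
    cmd: Command
    payload: Any

class Response(NamedTuple):
    cmd: Command
    worker_id: int
    payload: Any

req = Request(Command.RESET, {"gravity": -9.8})
resp = Response(req.cmd, worker_id=0, payload="reset done")
assert resp.cmd is Command.RESET  # enum identity replaces fragile string comparisons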

80
ml-agents/mlagents/trainers/tests/simple_test_envs.py


VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY

class Simple1DEnvironment(BaseEnv):
class SimpleEnvironment(BaseEnv):
"""
Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).

num_vector=1,
vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
action_size=1,
):
super().__init__()
self.discrete = use_discrete

self.vec_obs_size = vec_obs_size
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.group_spec = AgentGroupSpec(
self._make_obs_spec(), action_type, (2,) if use_discrete else 1
self._make_obs_spec(),
action_type,
tuple(2 for _ in range(action_size)) if use_discrete else action_size,
self.action_size = action_size
self.position: Dict[str, float] = {}
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
self.random = random.Random(str(self.group_spec))
self.goal: Dict[str, int] = {}
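# Illustrative check of the branch-size change above: a discrete agent with action_size=2
# gets two branches of two actions each, while the continuous case passes the integer through.
print(tuple(2 for _ in range(2)))  # (2, 2)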

return self.step_result[name]
def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta
self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
self.step_count[name] += 1
# All dimensions must reach +/-1.0 to be done
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
return done
def _generate_mask(self):
act = self.action[name][0][0]
delta = 1 if act else -1
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
delta = self.action[name][0][0]
delta = clamp(delta, -self.step_size, self.step_size)
self.position[name] += delta
self.position[name] = clamp(self.position[name], -1, 1)
self.step_count[name] += 1
done = self.position[name] >= 1.0 or self.position[name] <= -1.0
return done
action_mask = None
return action_mask
reward = SUCCESS_REWARD * self.position[name] * self.goal[name]
reward = 0.0
for _pos in self.positions[name]:
reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
self.positions[name]
)
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.step_count[name] = 0
self.final_rewards[name].append(self.rewards[name])
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def _make_batched_step(
self, name: str, done: bool, reward: float

self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
def _generate_mask(self):
if self.discrete:
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
else:
action_mask = None
return action_mask
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.position[name] = 0.0
self.step_count[name] = 0
self.final_rewards[name].append(self.rewards[name])
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def reset(self) -> None: # type: ignore
for name in self.names:
self._reset_agent(name)

pass
class Memory1DEnvironment(Simple1DEnvironment):
class MemoryEnvironment(SimpleEnvironment):
def __init__(self, brain_names, use_discrete, step_size=0.2):
super().__init__(brain_names, use_discrete, step_size=step_size)
# Number of steps to reveal the goal for. Lower is harder. Should be

)
class Record1DEnvironment(Simple1DEnvironment):
class RecordEnvironment(SimpleEnvironment):
def __init__(
self,
brain_names,

43
ml-agents/mlagents/trainers/tests/test_agent_processor.py


)
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.stats import StatsReporter, StatsSummary
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
def create_mock_brain():

assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
# check that steps with immediate dones don't add to dicts
processor.add_experiences(mock_done_step, 0, ActionInfo.empty())
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
def test_end_episode():

queue_traj = queue.get_nowait()
assert isinstance(queue_traj, Trajectory)
assert queue.empty()
def test_agent_manager_stats():
policy = mock.Mock()
stats_reporter = StatsReporter("FakeCategory")
writer = mock.Mock()
stats_reporter.add_writer(writer)
manager = AgentManager(policy, "MyBehavior", stats_reporter)
all_env_stats = [
{
"averaged": (1.0, StatsAggregationMethod.AVERAGE),
"most_recent": (2.0, StatsAggregationMethod.MOST_RECENT),
},
{
"averaged": (3.0, StatsAggregationMethod.AVERAGE),
"most_recent": (4.0, StatsAggregationMethod.MOST_RECENT),
},
]
for env_stats in all_env_stats:
manager.record_environment_stats(env_stats, worker_id=0)
expected_stats = {
"averaged": StatsSummary(mean=2.0, std=mock.ANY, num=2),
"most_recent": StatsSummary(mean=4.0, std=0.0, num=1),
}
stats_reporter.write_stats(123)
writer.write_stats.assert_any_call("FakeCategory", expected_stats, 123)
# clean up our Mock from the global list
StatsReporter.writers.remove(writer)

10
ml-agents/mlagents/trainers/tests/test_distributions.py


def test_gaussian_distribution():
with tf.Graph().as_default():
logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
distribution = GaussianDistribution(
logits,
act_size=VECTOR_ACTION_SPACE,

assert out.shape[1] == VECTOR_ACTION_SPACE[0]
output = sess.run([distribution.total_log_probs])
assert output[0].shape[0] == 1
# Test entropy is correct
log_std_tensor = tf.get_default_graph().get_tensor_by_name(
"log_std/BiasAdd:0"
)
feed_dict = {log_std_tensor: [[1.0, 1.0]]}
entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
# Entropy with log_std of 1.0 should be 2.42
assert pytest.approx(entropy[0], 0.01) == 2.42
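# Sanity check of the 2.42 value asserted above (a sketch, not part of the test suite):
# the entropy of a univariate Gaussian is 0.5 * ln(2*pi*e) + ln(sigma), so with log_std = 1.0 per dimension:
import math
print(round(0.5 * math.log(2 * math.pi * math.e) + 1.0, 2))  # 2.42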
def test_tanh_distribution():

36
ml-agents/mlagents/trainers/tests/test_ghost.py


import yaml
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
trainer.add_policy(brain_params_team0.brain_name, policy)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
trainer.add_policy(parsed_behavior_id1, policy)
trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.subscribe_trajectory_queue(trajectory_queue1)

vector_action_space_type=0,
)
brain_name = BehaviorIdentifiers.from_name_behavior_id(
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
).brain_name
)
brain_name = parsed_behavior_id0.brain_name
brain_params_team1 = BrainParameters(
brain_name="test_brain?team=1",

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
trainer.add_policy(brain_params_team0.brain_name, policy)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
trainer.add_policy(parsed_behavior_id1, policy)
policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.publish_policy_queue(policy_queue1)

48
ml-agents/mlagents/trainers/tests/test_learn.py


return parse_command_line(args)
@patch("mlagents.trainers.learn.handle_existing_directories")
@patch("mlagents.trainers.learn.TrainerFactory")
@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")

subproc_env_mock,
sampler_manager_mock,
trainer_factory_mock,
handle_dir_mock,
):
mock_env = MagicMock()
mock_env.external_brain_names = []

"ppo",
50000,
None,
False,
True,
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py
@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")
@patch("mlagents.trainers.learn.create_environment_factory")
@patch("mlagents.trainers.learn.load_config")
def test_docker_target_path(
load_config, create_environment_factory, subproc_env_mock, sampler_manager_mock
):
mock_env = MagicMock()
mock_env.external_brain_names = []
mock_env.academy_name = "TestAcademyName"
create_environment_factory.return_value = mock_env
trainer_config_mock = MagicMock()
load_config.return_value = trainer_config_mock
options_with_docker_target = basic_options({"--docker-target-name": "dockertarget"})
mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, options_with_docker_target)
mock_init.assert_called_once()
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo"
assert mock_init.call_args[0][2] == "/dockertarget/summaries"
handle_dir_mock.assert_called_once_with(
"./models/ppo", "./summaries", False, False
)
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py

env_path="/foo/bar",
docker_target_name=None,
no_graphics=True,
seed=None,
start_port=8000,

assert opt.sampler_config is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.load_model is False
assert opt.resume is False
assert opt.inference is False
assert opt.train_model is False
assert opt.docker_target_name is None
assert opt.no_graphics is False
assert opt.debug is False
assert opt.env_args is None

"--sampler=./mysample",
"--keep-checkpoints=42",
"--lesson=3",
"--load",
"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",

"--docker-target-name=mydockertarget",
"--no-graphics",
"--debug",
]

assert opt.sampler_config == {}
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.load_model is True
assert opt.train_model is True
assert opt.docker_target_name == "mydockertarget"
assert opt.inference is True
assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data="{}")

4
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


import json
import yaml
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str

@pytest.mark.parametrize("curriculum_brain_name", [BRAIN_NAME, "WrongBrainName"])
def test_simple_metacurriculum(curriculum_brain_name):
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=False)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=False)
curriculum_config = json.loads(dummy_curriculum_json_str)
mc = MetaCurriculum({curriculum_brain_name: curriculum_config})
trainer_config = yaml.safe_load(TRAINER_CONFIG)

129
ml-agents/mlagents/trainers/tests/test_simple_rl.py


from typing import Dict, Any
from mlagents.trainers.tests.simple_test_envs import (
Simple1DEnvironment,
Memory1DEnvironment,
Record1DEnvironment,
SimpleEnvironment,
MemoryEnvironment,
RecordEnvironment,
)
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory

lambd: 0.95
learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 2000
max_steps: 3000
memory_size: 16
normalize: false
num_epoch: 3

# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()

trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.99,
success_threshold=0.9,
env_manager=None,
):
# Create controller and begin training.

if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.5
)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)

def test_visual_ppo(num_visual, use_discrete):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
"max_steps": 3000,
"max_steps": 5000,
"learning_rate": 1e-3,
_check_environment_trains(env, config)
_check_environment_trains(env, config, success_threshold=0.9)
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
override_vals = {"buffer_init_steps": 2000, "max_steps": 4000}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {"batch_size": 32, "use_recurrent": True, "max_steps": 2000}
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {
"batch_size": 64,
"use_recurrent": True,
"max_steps": 3000,
"learning_rate": 1e-3,
"buffer_init_steps": 500,
}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)

env = Simple1DEnvironment(
env = SimpleEnvironment(
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the ghosted policy is never swapped with a competent policy.

"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},

processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that is not learning, once both have reached
# max steps, should be executing the initial, untrained policy.
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 0.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.99
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
env = Record1DEnvironment(
env = RecordEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
override_vals = {
"max_steps": 500,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},

@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,

30
ml-agents/mlagents/trainers/tests/test_stats.py


import tempfile
import unittest
import csv
import time
from mlagents.trainers.stats import (
StatsReporter,

# Test write_stats
category = "category1"
with tempfile.TemporaryDirectory(prefix="unittest-") as base_dir:
tb_writer = TensorboardWriter(base_dir)
tb_writer = TensorboardWriter(base_dir, clear_past_data=False)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)

assert mock_filewriter.return_value.add_summary.call_count > 1
def test_tensorboard_writer_clear(tmp_path):
tb_writer = TensorboardWriter(tmp_path, clear_past_data=False)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
# TB has some sort of timeout before making a new file
time.sleep(1.0)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) > 0
# See if creating a new one doesn't delete it
tb_writer = TensorboardWriter(tmp_path, clear_past_data=False)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) > 1
time.sleep(1.0)
# See if creating a new one deletes old ones
tb_writer = TensorboardWriter(tmp_path, clear_past_data=True)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) == 1
def test_csv_writer():
# Test write_stats
category = "category1"

category = "category1"
console_writer = ConsoleWriter()
console_writer.add_property(category, StatsPropertyType.SELF_PLAY, True)
console_writer.add_property(category, StatsPropertyType.SELF_PLAY_TEAM, 1)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
console_writer.write_stats(
category,

"Self-play/ELO": statssummary1,
"Self-play/Mean Opponent ELO": statssummary1,
"Self-play/Std Opponent ELO": statssummary1,
},
10,
)

)
self.assertIn(
"category1 Team 1: ELO: 1.000. Mean Opponent ELO: 1.000. Std Opponent ELO: 1.000.",
cm.output[1],
)

55
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


SubprocessEnvManager,
EnvironmentResponse,
StepResponse,
EnvironmentCommand,
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,

def create_worker_mock(worker_id, step_queue, env_factor, engine_c):
return MockEnvWorker(worker_id, EnvironmentResponse("reset", worker_id, worker_id))
return MockEnvWorker(
worker_id, EnvironmentResponse(EnvironmentCommand.RESET, worker_id, worker_id)
)
class SubprocessEnvManagerTest(unittest.TestCase):

)
params = {"test": "params"}
manager._reset_env(params)
manager.env_workers[0].send.assert_called_with("reset", (params))
manager.env_workers[0].send.assert_called_with(
EnvironmentCommand.RESET, (params)
)
@mock.patch(
"mlagents.trainers.subprocess_env_manager.SubprocessEnvManager.create_worker"

params = {"test": "params"}
res = manager._reset_env(params)
for i, env in enumerate(manager.env_workers):
env.send.assert_called_with("reset", (params))
env.send.assert_called_with(EnvironmentCommand.RESET, (params))
env.recv.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

)
manager.step_queue = Mock()
manager.step_queue.get_nowait.side_effect = [
EnvironmentResponse("step", 0, StepResponse(0, None)),
EnvironmentResponse("step", 1, StepResponse(1, None)),
EnvironmentResponse(EnvironmentCommand.STEP, 0, StepResponse(0, None, {})),
EnvironmentResponse(EnvironmentCommand.STEP, 1, StepResponse(1, None, {})),
EmptyQueue(),
]
step_mock = Mock()

res = manager._step()
for i, env in enumerate(manager.env_workers):
if i < 2:
env.send.assert_called_with("step", step_mock)
env.send.assert_called_with(EnvironmentCommand.STEP, step_mock)
manager.step_queue.get_nowait.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

env_manager.set_agent_manager(brain_name, agent_manager_mock)
step_info_dict = {brain_name: Mock()}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict)
env_stats = {
"averaged": (1.0, StatsAggregationMethod.AVERAGE),
"most_recent": (2.0, StatsAggregationMethod.MOST_RECENT),
}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict, env_stats)
step_mock.return_value = [step_info]
env_manager.advance()

assert agent_manager_mock.policy == mock_policy
def simple_env_factory(worker_id, config):
env = Simple1DEnvironment(["1D"], use_discrete=True)
return env
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
return env
env_manager = SubprocessEnvManager(
simple_env_factory, EngineConfig.default_config(), num_envs
)

val > 0.7 for val in StatsReporter.writers[0].get_last_rewards().values()
)
env_manager.close()
@pytest.mark.parametrize("num_envs", [1, 4])
def test_subprocess_env_raises_errors(num_envs):
def failing_env_factory(worker_id, config):
import time
# Sleep momentarily to allow time for the EnvManager to be waiting for the
# subprocess response. We won't be able to capture failures from the subprocess
# that cause it to close the pipe before we can send the first message.
time.sleep(0.1)
raise UnityEnvironmentException()
env_manager = SubprocessEnvManager(
failing_env_factory, EngineConfig.default_config(), num_envs
)
with pytest.raises(UnityEnvironmentException):
env_manager.reset()
env_manager.close()

22
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import pytest
import yaml
import io
import os
from unittest.mock import patch
from mlagents.trainers import trainer_util

with pytest.raises(TrainerConfigError):
fp = io.StringIO(file_contents)
_load_config(fp)
def test_existing_directories(tmp_path):
model_path = os.path.join(tmp_path, "runid")
# Unused summary path
summary_path = os.path.join(tmp_path, "runid")
# Test fresh new unused path - should do nothing.
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
# Test resume with fresh path - should throw an exception.
with pytest.raises(UnityTrainerException):
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
# make a directory
os.mkdir(model_path)
# Test try to train w/o force - should complain
with pytest.raises(UnityTrainerException):
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
# Test try to train w/ resume - should work
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
# Test try to train w/ force - should work
trainer_util.handle_existing_directories(model_path, summary_path, False, True)

10
ml-agents/mlagents/trainers/trainer/trainer.py


# # Unity ML-Agents Toolkit
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.model_serialization import export_policy_model, SerializationSettings
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter

from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class Trainer(abc.ABC):

pass
@abc.abstractmethod
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
"""

25
ml-agents/mlagents/trainers/trainer_controller.py


import os
import sys
import logging
import threading
from typing import Dict, Optional, Set, List
from collections import defaultdict

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager
from mlagents_envs.exception import (
UnityEnvironmentException,

self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir
self.logger = logging.getLogger("mlagents.trainers")
self.logger = get_logger(__name__)
self.run_id = run_id
self.save_freq = save_freq
self.train_model = train

self, env_manager: EnvManager, name_behavior_id: str
) -> None:
brain_name = BehaviorIdentifiers.from_name_behavior_id(
name_behavior_id
).brain_name
parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
brain_name = parsed_behavior_id.brain_name
try:
trainer = self.trainers[brain_name]
except KeyError:

policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(name_behavior_id, policy)
trainer.add_policy(parsed_behavior_id, policy)
agent_manager = AgentManager(
policy,

# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model()
except (KeyboardInterrupt, UnityCommunicationException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityEnvironmentException,
) as ex:
pass
if isinstance(ex, KeyboardInterrupt):
pass
else:
# If the environment failed, we want to make sure to raise
# the exception so we exit the process with an return code of 1.
raise ex
if self.train_model:
self._export_graph()

41
ml-agents/mlagents/trainers/trainer_util.py


import os
import yaml
from typing import Any, Dict, TextIO
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.trainer import Trainer

from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class TrainerFactory:

self.seed = seed
self.meta_curriculum = meta_curriculum
self.multi_gpu = multi_gpu
self.ghost_controller = GhostController()
def generate(self, brain_name: str) -> Trainer:
return initialize_trainer(

self.keep_checkpoints,
self.train_model,
self.load_model,
self.ghost_controller,
self.seed,
self.meta_curriculum,
self.multi_gpu,

keep_checkpoints: int,
train_model: bool,
load_model: bool,
ghost_controller: GhostController,
seed: int,
meta_curriculum: MetaCurriculum = None,
multi_gpu: bool = False,

:param keep_checkpoints: How many model checkpoints to keep
:param train_model: Whether to train the model (vs. run inference)
:param load_model: Whether to load the model or randomly initialize
:param ghost_controller: The object that coordinates ghost trainers
:param seed: The random seed to use
:param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
:return:

trainer = GhostTrainer(
trainer,
brain_name,
ghost_controller,
min_lesson_length,
trainer_parameters,
train_model,

"Error parsing yaml file. Please check for formatting errors. "
"A tool such as http://www.yamllint.com/ can be helpful with this."
) from e
def handle_existing_directories(
model_path: str, summary_path: str, resume: bool, force: bool
) -> None:
"""
Validates that if the run-id's model directory exists, we do not overwrite it unless --force is specified.
Throws an exception if the run-id exists but neither --resume nor --force was passed, and
if --resume is specified but the run-id was not found.
:param model_path: The model path specified.
:param summary_path: The summary path to be used.
:param resume: Whether or not the --resume flag was passed.
:param force: Whether or not the --force flag was passed.
"""
model_path_exists = os.path.isdir(model_path)
if model_path_exists:
if not resume and not force:
raise UnityTrainerException(
"Previous data from this run-id was found. "
"Either specify a new run-id, use --resume to resume this run, "
"or use the --force parameter to overwrite existing data."
)
else:
if resume:
raise UnityTrainerException(
"Previous data from this run-id was not found. "
"Train a new run by removing the --resume flag."
)
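# A hedged usage sketch of the validation above (paths are hypothetical):
handle_existing_directories("./models/run1", "./summaries", resume=False, force=False)  # fresh run-id: passes
# Once ./models/run1 exists, re-running without resume or force raises UnityTrainerException,
# and passing resume=True for a run-id that was never created also raises.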

2
ml-agents/setup.py


# Test-only dependencies should go in test_requirements.txt, not here.
"grpcio>=1.11.0",
"h5py>=2.9.0",
"jupyter",
"matplotlib",
"mlagents_envs=={}".format(VERSION),
"numpy>=1.13.3,<2.0",
"Pillow>=4.2.1",

67
ml-agents/tests/yamato/training_int_tests.py


import argparse
import time
from .yamato_utils import (
get_base_path,

checkout_csharp_version,
undo_git_checkout,
def main():
nn_file_expected = "./models/ppo/3DBall.nn"
def run_training(python_version, csharp_version):
latest = "latest"
run_id = int(time.time() * 1000.0)
print(
f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
nn_file_expected = f"./models/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
# Should never happen - make sure nothing is left over from an old test.
print("Artifacts from previous build found!")

print(f"Running in base path {base_path}")
build_returncode = run_standalone_build(base_path)
if build_returncode != 0:
print("Standalone build FAILED!")
sys.exit(build_returncode)
# Only build the standalone player if we're overriding the C# version
# Otherwise we'll use the one built earlier in the pipeline.
if csharp_version is not None:
# We can't rely on the old C# code recognizing the commandline argument to set the output
# So rename testPlayer (containing the most recent build) to something else temporarily
full_player_path = os.path.join("Project", "testPlayer.app")
temp_player_path = os.path.join("Project", "temp_testPlayer.app")
final_player_path = os.path.join("Project", f"testPlayer_{csharp_version}.app")
os.rename(full_player_path, temp_player_path)
checkout_csharp_version(csharp_version)
build_returncode = run_standalone_build(base_path)
if build_returncode != 0:
print("Standalone build FAILED!")
sys.exit(build_returncode)
# Now rename the newly-built executable, and restore the old one
os.rename(full_player_path, final_player_path)
os.rename(temp_player_path, full_player_path)
standalone_player_path = f"testPlayer_{csharp_version}"
else:
standalone_player_path = "testPlayer"
init_venv()
venv_path = init_venv(python_version)
# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.

buffer_size=10,
)
# TODO pass scene name and exe destination to build
# TODO make sure we fail if the exe isn't found - see MLA-559
mla_learn_cmd = "mlagents-learn override.yaml --train --env=Project/testPlayer --no-graphics --env-args -logFile -" # noqa
res = subprocess.run(f"source venv/bin/activate; {mla_learn_cmd}", shell=True)
mla_learn_cmd = (
f"mlagents-learn override.yaml --train --env=Project/{standalone_player_path} "
f"--run-id={run_id} --no-graphics --env-args -logFile -"
) # noqa
res = subprocess.run(
f"source {venv_path}/bin/activate; {mla_learn_cmd}", shell=True
)
if res.returncode != 0 or not os.path.exists(nn_file_expected):
print("mlagents-learn run FAILED!")

sys.exit(0)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--python", default=None)
parser.add_argument("--csharp", default=None)
args = parser.parse_args()
try:
run_training(args.python, args.csharp)
finally:
# Cleanup - this gets executed even if we hit sys.exit()
undo_git_checkout()
if __name__ == "__main__":

69
ml-agents/tests/yamato/yamato_utils.py


import os
import subprocess
import yaml
from typing import List, Optional
def get_unity_executable_path():

return os.getcwd()
def run_standalone_build(base_path: str, verbose: bool = False) -> int:
def run_standalone_build(
base_path: str, verbose: bool = False, output_path: str = None
) -> int:
Run BuildStandalonePlayerOSX test to produce a player at Project/testPlayer
:param base_path:
:return:
Run BuildStandalonePlayerOSX test to produce a player. The location defaults to Project/testPlayer.
"""
unity_exe = get_unity_executable_path()
print(f"Running BuildStandalonePlayerOSX via {unity_exe}")

]
if verbose:
test_args += ["-logfile", "-"]
if output_path is not None:
test_args += ["--mlagents-build-output-path", output_path]
print(f"{' '.join(test_args)} ...")
timeout = 30 * 60 # 30 minutes, just in case

def init_venv():
def init_venv(
mlagents_python_version: str = None, extra_packages: Optional[List[str]] = None
) -> str:
"""
Set up the virtual environment, and return the venv path.
:param mlagents_python_version: The version of the mlagents Python package to install.
If None, a local (editable) install is done; otherwise the package is installed from PyPI.
:return:
"""
# Use a different venv path for different versions
venv_path = "venv"
if mlagents_python_version:
venv_path += "_" + mlagents_python_version
subprocess.check_call("python -m venv venv", shell=True)
subprocess.check_call(f"python -m venv {venv_path}", shell=True)
"-e ./ml-agents-envs",
"-e ./ml-agents",
if mlagents_python_version:
# install from pypi
pip_commands += [
f"mlagents=={mlagents_python_version}",
f"gym-unity=={mlagents_python_version}",
]
else:
# Local install
pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents", "-e ./gym-unity"]
if extra_packages:
pip_commands += extra_packages
f"source venv/bin/activate; python -m pip install -q {cmd}", shell=True
f"source {venv_path}/bin/activate; python -m pip install -q {cmd}",
shell=True,
return venv_path
def checkout_csharp_version(csharp_version):
"""
Checks out the specific git revision (usually a tag) for the C# package and Project.
If csharp_version is None, no changes are made.
:param csharp_version:
:return:
"""
if csharp_version is None:
return
csharp_dirs = ["com.unity.ml-agents", "Project"]
for csharp_dir in csharp_dirs:
subprocess.check_call(
f"git checkout {csharp_version} -- {csharp_dir}", shell=True
)
def undo_git_checkout():
"""
Clean up the git working directory.
"""
subprocess.check_call("git reset HEAD .", shell=True)
subprocess.check_call("git checkout -- .", shell=True)
def override_config_file(src_path, dest_path, **kwargs):
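
init_venv now derives the venv directory from the requested version ("venv" for a local editable install, "venv_<version>" for a PyPI install) and returns that path so callers can activate the matching environment. A small usage sketch, assuming the helpers above are importable as yamato_utils and that a 0.15.0 release exists on PyPI:

import subprocess
from yamato_utils import init_venv  # assumed import path, for illustration only

venv_path = init_venv("0.15.0")  # creates "venv_0.15.0" and installs mlagents==0.15.0
subprocess.check_call(
    f"source {venv_path}/bin/activate; mlagents-learn --help", shell=True
)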

1
setup.cfg


I200,
banned-modules = tensorflow = use mlagents.tf_utils instead (it handles tf2 compat).
logging = use mlagents_envs.logging_util instead
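
The banned-modules entries make flake8 reject direct imports of tensorflow and logging in favour of the project wrappers named in the messages. A sketch of the imports those messages point to; the exact symbols (tf, get_logger) are assumptions based on the wrapper names, not verified here:

from mlagents.tf_utils import tf            # instead of "import tensorflow"
from mlagents_envs import logging_util      # instead of "import logging"

logger = logging_util.get_logger(__name__)  # assumed helper on the wrapper module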

32
.yamato/gym-interface-test.yml


test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_gym_interface_{{ editor.version }}:
name: Test Mac Gym Interface {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.setup_venv
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_gym.py
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/gym-interface-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}

32
.yamato/python-ll-api-test.yml


test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_mac_ll_api_{{ editor.version }}:
name: Test Mac LL-API {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.setup_venv
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/python-ll-api-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}
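
Both jobs follow the same recipe: depend on the macOS standalone build, install pyyaml, create the venv via the setup_venv module, and then run a smoke-test script against the built player. A rough local equivalent, using the script paths from the job definitions and assuming the venv already exists at ./venv:

import subprocess

for script in (
    "ml-agents/tests/yamato/scripts/run_gym.py",
    "ml-agents/tests/yamato/scripts/run_llapi.py",
):
    subprocess.check_call(f"./venv/bin/python {script}", shell=True)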

234
com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs


using System;
using System.Collections.Generic;
using UnityEngine;
using System.IO;
namespace MLAgents.SideChannels
{
public static class SideChannelUtils
{
private static Dictionary<Guid, SideChannel> RegisteredChannels = new Dictionary<Guid, SideChannel>();
private struct CachedSideChannelMessage
{
public Guid ChannelId;
public byte[] Message;
}
private static Queue<CachedSideChannelMessage> m_CachedMessages = new Queue<CachedSideChannelMessage>();
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
public static void RegisterSideChannel(SideChannel sideChannel)
{
var channelId = sideChannel.ChannelId;
if (RegisteredChannels.ContainsKey(channelId))
{
throw new UnityAgentsException(string.Format(
"A side channel with type index {0} is already registered. You cannot register multiple " +
"side channels of the same id.", channelId));
}
// Process any messages that we've already received for this channel ID.
var numMessages = m_CachedMessages.Count;
for (int i = 0; i < numMessages; i++)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (channelId == cachedMessage.ChannelId)
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannel.OnMessageReceived(incomingMsg);
}
}
else
{
m_CachedMessages.Enqueue(cachedMessage);
}
}
RegisteredChannels.Add(channelId, sideChannel);
}
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
public static void UnregisterSideChannel(SideChannel sideChannel)
{
if (RegisteredChannels.ContainsKey(sideChannel.ChannelId))
{
RegisteredChannels.Remove(sideChannel.ChannelId);
}
}
/// <summary>
/// Unregisters all the side channels from the communicator.
/// </summary>
public static void UnregisterAllSideChannels()
{
RegisteredChannels = new Dictionary<Guid, SideChannel>();
}
/// <summary>
/// Returns the SideChannel of Type T if one is registered, or null if none is registered.
/// If there are multiple SideChannels of the same type registered, the returned instance is arbitrary.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <returns></returns>
public static T GetSideChannel<T>() where T: SideChannel
{
foreach (var sc in RegisteredChannels.Values)
{
if (sc.GetType() == typeof(T))
{
return (T) sc;
}
}
return null;
}
/// <summary>
/// Returns all SideChannels of Type T that are registered. Use <see cref="GetSideChannel{T}()"/> if possible,
/// as that does not make any memory allocations.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <returns></returns>
public static List<T> GetSideChannels<T>() where T: SideChannel
{
var output = new List<T>();
foreach (var sc in RegisteredChannels.Values)
{
if (sc.GetType() == typeof(T))
{
output.Add((T) sc);
}
}
return output;
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a single byte array.
/// </summary>
/// <returns></returns>
internal static byte[] GetSideChannelMessage()
{
return GetSideChannelMessage(RegisteredChannels);
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a single byte array.
/// </summary>
/// <param name="sideChannels"> A dictionary of channel type to channel.</param>
/// <returns></returns>
internal static byte[] GetSideChannelMessage(Dictionary<Guid, SideChannel> sideChannels)
{
using (var memStream = new MemoryStream())
{
using (var binaryWriter = new BinaryWriter(memStream))
{
foreach (var sideChannel in sideChannels.Values)
{
var messageList = sideChannel.MessageQueue;
foreach (var message in messageList)
{
binaryWriter.Write(sideChannel.ChannelId.ToByteArray());
binaryWriter.Write(message.Length);
binaryWriter.Write(message);
}
sideChannel.MessageQueue.Clear();
}
return memStream.ToArray();
}
}
}
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="dataReceived">The byte array of data received from Python.</param>
internal static void ProcessSideChannelData(byte[] dataReceived)
{
ProcessSideChannelData(RegisteredChannels, dataReceived);
}
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="sideChannels">A dictionary of channel type to channel.</param>
/// <param name="dataReceived">The byte array of data received from Python.</param>
internal static void ProcessSideChannelData(Dictionary<Guid, SideChannel> sideChannels, byte[] dataReceived)
{
while (m_CachedMessages.Count != 0)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (sideChannels.ContainsKey(cachedMessage.ChannelId))
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannels[cachedMessage.ChannelId].OnMessageReceived(incomingMsg);
}
}
else
{
Debug.Log(string.Format(
"Unknown side channel data received. Channel Id is: {0}",
cachedMessage.ChannelId));
}
}
if (dataReceived.Length == 0)
{
return;
}
using (var memStream = new MemoryStream(dataReceived))
{
using (var binaryReader = new BinaryReader(memStream))
{
while (memStream.Position < memStream.Length)
{
Guid channelId = Guid.Empty;
byte[] message = null;
try
{
channelId = new Guid(binaryReader.ReadBytes(16));
var messageLength = binaryReader.ReadInt32();
message = binaryReader.ReadBytes(messageLength);
}
catch (Exception ex)
{
throw new UnityAgentsException(
"There was a problem reading a message in a SideChannel. Please make sure the " +
"version of MLAgents in Unity is compatible with the Python version. Original error : "
+ ex.Message);
}
if (sideChannels.ContainsKey(channelId))
{
using (var incomingMsg = new IncomingMessage(message))
{
sideChannels[channelId].OnMessageReceived(incomingMsg);
}
}
else
{
// Don't recognize this ID, but cache it in case the SideChannel that can handle
// it is registered before the next call to ProcessSideChannelData.
m_CachedMessages.Enqueue(new CachedSideChannelMessage
{
ChannelId = channelId,
Message = message
});
}
}
}
}
}
}
}
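
GetSideChannelMessage and ProcessSideChannelData define the wire format shared with Python: messages are concatenated, each consisting of the 16-byte channel GUID (in .NET Guid.ToByteArray order), a little-endian Int32 payload length, and the payload bytes. A stdlib-only Python sketch of that framing, for illustration only; it is not the mlagents_envs implementation:

import io
import struct
import uuid

def pack_messages(messages):
    # messages: iterable of (uuid.UUID, bytes) pairs
    buf = io.BytesIO()
    for channel_id, payload in messages:
        buf.write(channel_id.bytes_le)              # matches .NET Guid.ToByteArray()
        buf.write(struct.pack("<i", len(payload)))  # little-endian Int32, like BinaryWriter
        buf.write(payload)
    return buf.getvalue()

def unpack_messages(data):
    reader = io.BytesIO(data)
    while True:
        raw_id = reader.read(16)
        if len(raw_id) < 16:
            break
        (length,) = struct.unpack("<i", reader.read(4))
        yield uuid.UUID(bytes_le=raw_id), reader.read(length)

# round-trip check
cid = uuid.uuid4()
blob = pack_messages([(cid, b"hello")])
assert list(unpack_messages(blob)) == [(cid, b"hello")]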

11
com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs.meta


fileFormatVersion: 2
guid: 2506dff31271f49298fbff21e13fa8b6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

Some files were not shown because too many files were changed in this diff.
