
Merge branch 'master' into merge-release-0.13.0

/release-0.13.0
GitHub, 4 years ago
Current commit
d985dded
66 files changed, with 1302 insertions and 969 deletions. Changed files (changed line count in parentheses):
  1. .circleci/config.yml (2)
  2. .gitignore (1)
  3. .pre-commit-config.yaml (2)
  4. UnitySDK/Assets/ML-Agents/Editor/DemonstrationImporter.cs (4)
  5. UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs (2)
  6. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/StackingSensorTests.cs (2)
  7. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/VectorSensorTests.cs (2)
  8. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/WriterAdapterTests.cs (12)
  9. UnitySDK/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs (7)
  10. UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity (673)
  11. UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (9)
  12. UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs (19)
  13. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs (5)
  14. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs (2)
  15. UnitySDK/Assets/ML-Agents/Scripts/Policy/BarracudaPolicy.cs (4)
  16. UnitySDK/Assets/ML-Agents/Scripts/Policy/RemotePolicy.cs (4)
  17. UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensor.cs (9)
  18. UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensorComponent.cs (3)
  19. UnitySDK/Assets/ML-Agents/Scripts/Sensor/ISensor.cs (4)
  20. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RayPerceptionSensor.cs (2)
  21. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensor.cs (11)
  22. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensorComponent.cs (3)
  23. UnitySDK/Assets/ML-Agents/Scripts/Sensor/SensorBase.cs (6)
  24. UnitySDK/Assets/ML-Agents/Scripts/Sensor/StackingSensor.cs (7)
  25. UnitySDK/Assets/ML-Agents/Scripts/Sensor/VectorSensor.cs (2)
  26. UnitySDK/Assets/ML-Agents/Scripts/Sensor/WriteAdapter.cs (53)
  27. config/sac_trainer_config.yaml (71)
  28. config/trainer_config.yaml (72)
  29. docs/Learning-Environment-Create-New.md (5)
  30. docs/Migrating.md (11)
  31. docs/Training-ML-Agents.md (4)
  32. gym-unity/gym_unity/tests/test_gym.py (2)
  33. ml-agents-envs/mlagents_envs/exception.py (8)
  34. ml-agents-envs/mlagents_envs/rpc_utils.py (43)
  35. ml-agents-envs/mlagents_envs/tests/test_envs.py (2)
  36. ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (43)
  37. ml-agents/mlagents/trainers/agent_processor.py (84)
  38. ml-agents/mlagents/trainers/brain.py (32)
  39. ml-agents/mlagents/trainers/curriculum.py (58)
  40. ml-agents/mlagents/trainers/learn.py (78)
  41. ml-agents/mlagents/trainers/meta_curriculum.py (118)
  42. ml-agents/mlagents/trainers/ppo/trainer.py (9)
  43. ml-agents/mlagents/trainers/rl_trainer.py (14)
  44. ml-agents/mlagents/trainers/sac/trainer.py (9)
  45. ml-agents/mlagents/trainers/stats.py (13)
  46. ml-agents/mlagents/trainers/tests/mock_brain.py (2)
  47. ml-agents/mlagents/trainers/tests/test_agent_processor.py (41)
  48. ml-agents/mlagents/trainers/tests/test_bcmodule.py (2)
  49. ml-agents/mlagents/trainers/tests/test_curriculum.py (32)
  50. ml-agents/mlagents/trainers/tests/test_learn.py (11)
  51. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (56)
  52. ml-agents/mlagents/trainers/tests/test_multigpu.py (2)
  53. ml-agents/mlagents/trainers/tests/test_ppo.py (38)
  54. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2)
  55. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (39)
  56. ml-agents/mlagents/trainers/tests/test_sac.py (12)
  57. ml-agents/mlagents/trainers/tests/test_stats.py (2)
  58. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  59. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (28)
  60. ml-agents/mlagents/trainers/tests/test_trainer_util.py (2)
  61. ml-agents/mlagents/trainers/trainer.py (198)
  62. ml-agents/mlagents/trainers/trainer_controller.py (161)
  63. ml-agents/mlagents/trainers/trainer_util.py (6)
  64. test_requirements.txt (1)
  65. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs (105)
  66. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs.meta (3)

2
.circleci/config.yml


. venv/bin/activate
mkdir test-reports
pip freeze > test-reports/pip_versions.txt
pytest --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
pytest -n 2 --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
- run:
name: Verify there are no hidden/missing metafiles.

1
.gitignore


/UnitySDK/Assets/AssetStoreTools*
/UnitySDK/Assets/Plugins*
/UnitySDK/Assets/Demonstrations*
/UnitySDK/csharp_timers.json
# Tensorflow Model Info
/models

2
.pre-commit-config.yaml


.*_pb2_grpc.py
)$
# flake8-tidy-imports is used for banned-modules, not actually tidying
additional_dependencies: [flake8-comprehensions, flake8-tidy-imports, flake8-bugbear]
additional_dependencies: [flake8-comprehensions==3.1.4, flake8-tidy-imports==4.0.0, flake8-bugbear==20.1.2]
- id: trailing-whitespace
name: trailing-whitespace-markdown
types: [markdown]

4
UnitySDK/Assets/ML-Agents/Editor/DemonstrationImporter.cs


var texture = (Texture2D)
AssetDatabase.LoadAssetAtPath(k_IconPath, typeof(Texture2D));
#if UNITY_2017_3_OR_NEWER
#else
ctx.SetMainAsset(ctx.assetPath, demonstration);
#endif
}
catch
{

2
UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs


sensorName = n;
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return new[] { 0 };
}

2
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/StackingSensorTests.cs


ISensor wrapped = new VectorSensor(4);
ISensor sensor = new StackingSensor(wrapped, 4);
Assert.AreEqual("StackingSensor_size4_VectorSensor_size4", sensor.GetName());
Assert.AreEqual(sensor.GetFloatObservationShape(), new [] {16});
Assert.AreEqual(sensor.GetObservationShape(), new [] {16});
}
[Test]

2
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/VectorSensorTests.cs


Assert.AreEqual(fill, output[0]);
WriteAdapter writer = new WriteAdapter();
writer.SetTarget(output, 0);
writer.SetTarget(output, sensor.GetObservationShape(), 0);
// Make sure WriteAdapter didn't touch anything
Assert.AreEqual(fill, output[0]);

12
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/WriterAdapterTests.cs


{
WriteAdapter writer = new WriteAdapter();
var buffer = new[] { 0f, 0f, 0f };
var shape = new[] { 3 };
writer.SetTarget(buffer, 0);
writer.SetTarget(buffer, shape, 0);
// Elementwise writes
writer[0] = 1f;
writer[2] = 2f;

writer.SetTarget(buffer, 1);
writer.SetTarget(buffer, shape, 1);
writer.SetTarget(buffer, 0);
writer.SetTarget(buffer, shape, 0);
writer.SetTarget(buffer, 1);
writer.SetTarget(buffer, shape, 1);
writer.AddRange(new [] {6f, 7f});
Assert.AreEqual(new[] { 4f, 6f, 7f }, buffer);
}

valueType = TensorProxy.TensorType.FloatingPoint,
data = new Tensor(2, 3)
};
writer.SetTarget(t, 0, 0);
Assert.AreEqual(0f, t.data[0, 0]);
writer[0] = 1f;

valueType = TensorProxy.TensorType.FloatingPoint,
data = new Tensor(2, 2, 2, 3)
};
var shape = new[] { 2, 2, 3 };
writer.SetTarget(t, 0, 0);
writer[1, 0, 1] = 1f;

7
UnitySDK/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs


using System;
using UnityEditor;
using UnityEngine;
#if UNITY_2018_1_OR_NEWER
#endif
namespace MLAgents
{

{
string[] scenes = { "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity" };
var buildResult = BuildPipeline.BuildPlayer(scenes, "testPlayer", BuildTarget.StandaloneOSX, BuildOptions.None);
#if UNITY_2018_1_OR_NEWER
var isOk = buildResult.summary.result == BuildResult.Succeeded;
var error = "";
foreach (var stepInfo in buildResult.steps)

}
}
}
#else
var error = buildResult;
var isOk = string.IsNullOrEmpty(error);
#endif
if (isOk)
{
EditorApplication.Exit(0);

673
UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity
The file diff is too large to display.

9
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs


Debug.Assert(!sensors[i].GetName().Equals(sensors[i + 1].GetName()), "Sensor names must be unique.");
}
#endif
// Create a buffer for writing vector sensor data too
// Create a buffer for writing uncompressed (i.e. float) sensor data to
int numFloatObservations = 0;
for (var i = 0; i < sensors.Count; i++)
{

var sensor = sensors[i];
if (sensor.GetCompressionType() == SensorCompressionType.None)
{
// only handles 1D
m_WriteAdapter.SetTarget(m_VectorSensorBuffer, floatsWritten);
m_WriteAdapter.SetTarget(m_VectorSensorBuffer, sensor.GetObservationShape(), floatsWritten);
Shape = sensor.GetFloatObservationShape(),
Shape = sensor.GetObservationShape(),
CompressionType = sensor.GetCompressionType()
};
m_Info.observations.Add(floatObs);

var compressedObs = new Observation
{
CompressedData = sensor.GetCompressedObservation(),
Shape = sensor.GetFloatObservationShape(),
Shape = sensor.GetObservationShape(),
CompressionType = sensor.GetCompressionType()
};
m_Info.observations.Add(compressedObs);

19
UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs


var result = m_Client.Exchange(WrapMessage(unityOutput, 200));
unityInput = m_Client.Exchange(WrapMessage(null, 200)).UnityInput;
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
#else
EditorApplication.playmodeStateChanged += HandleOnPlayModeChanged;
#endif
#endif
return result.UnityInput;
#else

#endregion
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>

}
}
#else
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>
private void HandleOnPlayModeChanged()
{
// This method is run whenever the playmode state is changed.
if (!EditorApplication.isPlayingOrWillChangePlaymode)
{
Close();
}
}
#endif
#endif
}
}

5
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs


// Write each sensor consecutively to the tensor
foreach (var sensorIndex in m_SensorIndices)
{
m_WriteAdapter.SetTarget(tensorProxy, agentIndex, tensorOffset);
m_WriteAdapter.SetTarget(tensorProxy, agentIndex, tensorOffset);
var numWritten = sensor.Write(m_WriteAdapter);
tensorOffset += numWritten;
}

var agentIndex = 0;
foreach (var agent in agents)
{
var sensor = agent.sensors[m_SensorIndex];
agent.sensors[m_SensorIndex].Write(m_WriteAdapter);
sensor.Write(m_WriteAdapter);
agentIndex++;
}
}

2
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs


for (var sensorIndex = 0; sensorIndex < agent.sensors.Count; sensorIndex++)
{
var sensor = agent.sensors[sensorIndex];
var shape = sensor.GetFloatObservationShape();
var shape = sensor.GetObservationShape();
// TODO generalize - we currently only have vector or visual, but can't handle "2D" observations
var isVectorSensor = (shape.Length == 1);
if (isVectorSensor)

4
UnitySDK/Assets/ML-Agents/Scripts/Policy/BarracudaPolicy.cs


// First agent, save the sensor sizes
foreach (var sensor in agent.sensors)
{
m_SensorShapes.Add(sensor.GetFloatObservationShape());
m_SensorShapes.Add(sensor.GetObservationShape());
}
}
else

for (var i = 0; i < m_SensorShapes.Count; i++)
{
var cachedShape = m_SensorShapes[i];
var sensorShape = agent.sensors[i].GetFloatObservationShape();
var sensorShape = agent.sensors[i].GetObservationShape();
Debug.Assert(cachedShape.Length == sensorShape.Length, "Sensor dimensions must match.");
for (var j = 0; j < cachedShape.Length; j++)
{

4
UnitySDK/Assets/ML-Agents/Scripts/Policy/RemotePolicy.cs


// First agent, save the sensor sizes
foreach (var sensor in agent.sensors)
{
m_SensorShapes.Add(sensor.GetFloatObservationShape());
m_SensorShapes.Add(sensor.GetObservationShape());
}
}
else

for (var i = 0; i < m_SensorShapes.Count; i++)
{
var cachedShape = m_SensorShapes[i];
var sensorShape = agent.sensors[i].GetFloatObservationShape();
var sensorShape = agent.sensors[i].GetObservationShape();
Debug.Assert(cachedShape.Length == sensorShape.Length, "Sensor dimensions must match.");
for (var j = 0; j < cachedShape.Length; j++)
{

9
UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensor.cs


bool m_Grayscale;
string m_Name;
int[] m_Shape;
SensorCompressionType m_CompressionType;
public CameraSensor(Camera camera, int width, int height, bool grayscale, string name)
public CameraSensor(Camera camera, int width, int height, bool grayscale, string name,
SensorCompressionType compression)
{
m_Camera = camera;
m_Width = width;

m_Shape = new[] { height, width, grayscale ? 1 : 3 };
m_CompressionType = compression;
}
public string GetName()

public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.PNG;
return m_CompressionType;
}
/// <summary>

3
UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensorComponent.cs


public int width = 84;
public int height = 84;
public bool grayscale;
public SensorCompressionType compression = SensorCompressionType.PNG;
return new CameraSensor(camera, width, height, grayscale, sensorName);
return new CameraSensor(camera, width, height, grayscale, sensorName, compression);
}
public override int[] GetObservationShape()

4
UnitySDK/Assets/ML-Agents/Scripts/Sensor/ISensor.cs


/// A sensor that returns an RGB image would return new [] {Width, Height, 3}
/// </summary>
/// <returns></returns>
int[] GetFloatObservationShape();
int[] GetObservationShape();
/// <summary>
/// Write the observation data directly to the WriteAdapter.

/// <returns></returns>
public static int ObservationSize(this ISensor sensor)
{
var shape = sensor.GetFloatObservationShape();
var shape = sensor.GetObservationShape();
int count = 1;
for (var i = 0; i < shape.Length; i++)
{

2
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RayPerceptionSensor.cs


{
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

11
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensor.cs


bool m_Grayscale;
string m_Name;
int[] m_Shape;
SensorCompressionType m_CompressionType;
public RenderTextureSensor(RenderTexture renderTexture, bool grayscale, string name)
public RenderTextureSensor(RenderTexture renderTexture, bool grayscale, string name,
SensorCompressionType compressionType)
{
m_RenderTexture = renderTexture;
var width = renderTexture != null ? renderTexture.width : 0;

m_Shape = new[] { height, width, grayscale ? 1 : 3 };
m_CompressionType = compressionType;
}
public string GetName()

public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.PNG;
return m_CompressionType;
/// Converts a RenderTexture and correspinding resolution to a 2D texture.
/// Converts a RenderTexture to a 2D texture.
/// </summary>
/// <returns>The 2D texture.</returns>
/// <param name="obsTexture">RenderTexture.</param>

3
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensorComponent.cs


public RenderTexture renderTexture;
public string sensorName = "RenderTextureSensor";
public bool grayscale;
public SensorCompressionType compression = SensorCompressionType.PNG;
return new RenderTextureSensor(renderTexture, grayscale, sensorName);
return new RenderTextureSensor(renderTexture, grayscale, sensorName, compression);
}
public override int[] GetObservationShape()

6
UnitySDK/Assets/ML-Agents/Scripts/Sensor/SensorBase.cs


{
/// <summary>
/// Write the observations to the output buffer. This size of the buffer will be product of the sizes returned
/// by GetFloatObservationShape().
/// by GetObservationShape().
public abstract int[] GetFloatObservationShape();
public abstract int[] GetObservationShape();
public abstract string GetName();

/// <param name="adapter"></param>
public virtual int Write(WriteAdapter adapter)
{
// TODO reuse buffer for similar agents, don't call GetFloatObservationShape()
// TODO reuse buffer for similar agents, don't call GetObservationShape()
var numFloats = this.ObservationSize();
float[] buffer = new float[numFloats];
WriteObservation(buffer);

7
UnitySDK/Assets/ML-Agents/Scripts/Sensor/StackingSensor.cs


m_Name = $"StackingSensor_size{numStackedObservations}_{wrapped.GetName()}";
var shape = wrapped.GetFloatObservationShape();
var shape = wrapped.GetObservationShape();
m_Shape = new int[shape.Length];
m_UnstackedObservationSize = wrapped.ObservationSize();

public int Write(WriteAdapter adapter)
{
// First, call the wrapped sensor's write method. Make sure to use our own adapter, not the passed one.
m_LocalAdapter.SetTarget(m_StackedObservations[m_CurrentIndex], 0);
var wrappedShape = m_WrappedSensor.GetObservationShape();
m_LocalAdapter.SetTarget(m_StackedObservations[m_CurrentIndex], wrappedShape, 0);
m_WrappedSensor.Write(m_LocalAdapter);
// Now write the saved observations (oldest first)

m_CurrentIndex = (m_CurrentIndex + 1) % m_NumStackedObservations;
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

2
UnitySDK/Assets/ML-Agents/Scripts/Sensor/VectorSensor.cs


Clear();
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

53
UnitySDK/Assets/ML-Agents/Scripts/Sensor/WriteAdapter.cs


using System;
using Barracuda;
using MLAgents.InferenceBrain;
namespace MLAgents.Sensor

TensorProxy m_Proxy;
int m_Batch;
TensorShape m_TensorShape;
/// <param name="data"></param>
/// <param name="offset"></param>
public void SetTarget(IList<float> data, int offset)
/// <param name="data">Float array or list that will be written to.</param>
/// <param name="shape">Shape of the observations to be written.</param>
/// <param name="offset">Offset from the start of the float data to write to.</param>
public void SetTarget(IList<float> data, int[] shape, int offset)
m_Batch = -1;
m_Batch = 0;
if (shape.Length == 1)
{
m_TensorShape = new TensorShape(m_Batch, shape[0]);
}
else
{
m_TensorShape = new TensorShape(m_Batch, shape[0], shape[1], shape[2]);
}
/// <param name="tensorProxy"></param>
/// <param name="batchIndex"></param>
/// <param name="channelOffset"></param>
/// <param name="tensorProxy">Tensor proxy that will be written to.</param>
/// <param name="batchIndex">Batch index in the tensor proxy (i.e. the index of the Agent)</param>
/// <param name="channelOffset">Offset from the start of the channel to write to.</param>
public void SetTarget(TensorProxy tensorProxy, int batchIndex, int channelOffset)
{
m_Proxy = tensorProxy;

m_TensorShape = m_Proxy.data.shape;
}
/// <summary>

{
set
{
// Only TensorProxy supports 3D access
m_Proxy.data[m_Batch, h, w, ch + m_Offset] = value;
if (m_Data != null)
{
if (h < 0 || h >= m_TensorShape.height)
{
throw new IndexOutOfRangeException($"height value {h} must be in range [0, {m_TensorShape.height-1}]");
}
if (w < 0 || w >= m_TensorShape.width)
{
throw new IndexOutOfRangeException($"width value {w} must be in range [0, {m_TensorShape.width-1}]");
}
if (ch < 0 || ch >= m_TensorShape.channels)
{
throw new IndexOutOfRangeException($"channel value {ch} must be in range [0, {m_TensorShape.channels-1}]");
}
var index = m_TensorShape.Index(m_Batch, h, w, ch + m_Offset);
m_Data[index] = value;
}
else
{
m_Proxy.data[m_Batch, h, w, ch + m_Offset] = value;
}
}
}

71
config/sac_trainer_config.yaml


init_entcoef: 1.0
learning_rate: 3.0e-4
learning_rate_schedule: constant
max_steps: 5.0e4
max_steps: 5.0e5
memory_size: 256
normalize: false
num_update: 1

sequence_length: 64
summary_freq: 1000
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple

normalize: false
batch_size: 256
buffer_size: 500000
max_steps: 1.0e5
max_steps: 2.0e6
max_steps: 5.0e5
max_steps: 2.0e7
summary_freq: 1000
summary_freq: 20000
max_steps: 5.0e4
max_steps: 1.5e7
summary_freq: 2000
summary_freq: 60000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
num_layers: 2
init_entcoef: 0.1

max_steps: 5.0e5
max_steps: 5.0e6
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

max_steps: 5.0e5
max_steps: 5.0e6
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

summary_freq: 2000
summary_freq: 30000
time_horizon: 128
batch_size: 128
buffer_init_steps: 10000

init_entcoef: 0.01
max_steps: 5.0e5
max_steps: 1.0e7
sequence_length: 16
tau: 0.01
use_recurrent: false

hidden_units: 256
buffer_init_steps: 1000
num_layers: 1
max_steps: 5.0e5
max_steps: 1.0e7
buffer_size: 500000
init_entcoef: 0.01
tau: 0.01

normalize: true
batch_size: 64
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
hidden_units: 64
init_entcoef: 0.5

batch_size: 256
summary_freq: 1000
summary_freq: 12000
max_steps: 2e5
max_steps: 4e6
CrawlerStatic:
normalize: true

buffer_size: 500000
buffer_init_steps: 2000
max_steps: 5e5
summary_freq: 3000
max_steps: 5e6
summary_freq: 30000
init_entcoef: 1.0
num_layers: 3
hidden_units: 512

time_horizon: 1000
batch_size: 256
buffer_size: 500000
summary_freq: 3000
summary_freq: 30000
max_steps: 1e6
max_steps: 1e7
hidden_units: 512
reward_signals:
extrinsic:

time_horizon: 1000
batch_size: 256
buffer_size: 500000
max_steps: 2e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 30000
num_layers: 4
train_interval: 2
hidden_units: 512

time_horizon: 1000
batch_size: 128
buffer_size: 500000
max_steps: 2e5
summary_freq: 3000
max_steps: 2e7
summary_freq: 60000
Hallway:
sequence_length: 32

init_entcoef: 0.1
max_steps: 5.0e5
max_steps: 1.0e7
summary_freq: 1000
time_horizon: 64
use_recurrent: true

memory_size: 256
gamma: 0.99
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
time_horizon: 64
use_recurrent: true

gamma: 0.99
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 3.0e6
summary_freq: 60000
time_horizon: 64
GridWorld:

init_entcoef: 0.5
buffer_init_steps: 1000
buffer_size: 50000
max_steps: 50000
summary_freq: 2000
max_steps: 500000
summary_freq: 20000
time_horizon: 5
reward_signals:
extrinsic:

72
config/trainer_config.yaml


lambd: 0.95
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e4
max_steps: 5.0e5
memory_size: 256
normalize: false
num_epoch: 3

summary_freq: 1000
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:

beta: 5.0e-3
batch_size: 1024
buffer_size: 10240
max_steps: 1.0e5
max_steps: 2.0e6
max_steps: 1.0e6
max_steps: 2.0e7
max_steps: 5.0e4
max_steps: 1.5e7
summary_freq: 2000
summary_freq: 60000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
max_steps: 5.0e5
max_steps: 5.0e6
learning_rate: 1e-3
batch_size: 128
num_epoch: 3

summary_freq: 2000
summary_freq: 20000
max_steps: 5.0e5
max_steps: 5.0e6
learning_rate: 1e-3
batch_size: 320
num_epoch: 3

summary_freq: 2000
summary_freq: 20000
summary_freq: 2000
summary_freq: 30000
time_horizon: 128
batch_size: 128
buffer_size: 2048

max_steps: 5.0e5
max_steps: 1.0e7
num_epoch: 3
reward_signals:
extrinsic:

hidden_units: 256
num_layers: 1
beta: 1.0e-2
max_steps: 5.0e5
max_steps: 1.0e7
num_epoch: 3
reward_signals:
extrinsic:

normalize: true
batch_size: 64
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
lambd: 0.99
beta: 0.001

batch_size: 1200
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
max_steps: 5.0e5
beta: 0.001

Tennis:
normalize: true
max_steps: 2e5
max_steps: 4e6
CrawlerStatic:
normalize: true

buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 1e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2024
buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 1e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2048
buffer_size: 20480
max_steps: 2e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2024
buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 60000
reward_signals:
extrinsic:
strength: 1.0

num_epoch: 3
buffer_size: 1024
batch_size: 128
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
summary_freq: 10000
time_horizon: 64
VisualHallway:

num_epoch: 3
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
summary_freq: 10000
time_horizon: 64
VisualPushBlock:

num_epoch: 3
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 3.0e6
summary_freq: 60000
time_horizon: 64
GridWorld:

hidden_units: 256
beta: 5.0e-3
buffer_size: 256
max_steps: 50000
summary_freq: 2000
max_steps: 500000
summary_freq: 20000
time_horizon: 5
reward_signals:
extrinsic:

5
docs/Learning-Environment-Create-New.md


3. In a file system window, navigate to the folder containing your cloned
ML-Agents repository.
4. Drag the `ML-Agents` folder from `UnitySDK/Assets` to the Unity
Editor Project window.
Editor Project window. If you see console errors about Barracuda, make sure
you've installed Barracuda from the Unity Package Manager. More information
can be found in the [installation instructions](Installation.md) under
**Package Installation**.
Your Unity **Project** window should contain the following assets:

11
docs/Migrating.md


# Migrating
## Migrating from 0.12 to 0.13
## Migrating from 0.13 to latest
### Important changes
* Trainer steps are now counted per-Agent, not per-environment as in previous versions. For instance, if you have 10 Agents in the scene, 20 environment steps now correspond to 200 steps as printed in the terminal and in Tensorboard.
### Steps to Migrate
* Multiply `max_steps` and `summary_freq` in your `trainer_config.yaml` by the number of Agents in the scene.
## Migrating from ML-Agents toolkit v0.12.0 to v0.13.0
### Important changes
* The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. This should only affect you if you're writing a custom trainer; if you use `mlagents-learn` for training, this should be a transparent change.
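
A rough worked example of the per-Agent step-count migration noted above (a sketch; the Agent count and starting values are illustrative, not taken from this commit):

# Per-Agent step counting: scale the old per-environment values by the number of Agents.
num_agents = 12                                    # example: a scene with 12 Agents
old = {"max_steps": 5.0e4, "summary_freq": 1000}   # example pre-migration values
new = {key: value * num_agents for key, value in old.items()}
# -> {"max_steps": 6.0e5, "summary_freq": 12000}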

4
docs/Training-ML-Agents.md


the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
* `--lesson=<n>`: Specify which lesson to start with when performing curriculum
training. Defaults to 0.
* `--num-runs=<n>`: Sets the number of concurrent training sessions to perform.
Default is set to 1. Set to higher values when benchmarking performance and
multiple training sessions are desired. Training sessions are independent, and
do not improve learning performance.
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
collect experiences from when training. Defaults to 1.
* `--run-id=<path>`: Specifies an identifier for each training run. This

2
gym-unity/gym_unity/tests/test_gym.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

8
ml-agents-envs/mlagents_envs/exception.py


pass
class UnityObservationException(UnityException):
"""
Related to errors with receiving observations.
"""
pass
class UnityActionException(UnityException):
"""
Related to errors with sending actions.

43
ml-agents-envs/mlagents_envs/rpc_utils.py


from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
from mlagents_envs.exception import UnityObservationException
from mlagents_envs.communicator_objects.observation_pb2 import (
ObservationProto,
NONE as COMPRESSION_NONE,
)
from typing import cast, List, Tuple, Union, Collection
from typing import cast, List, Tuple, Union, Collection, Optional, Iterable
from PIL import Image
logger = logging.getLogger("mlagents_envs")

image = Image.open(io.BytesIO(image_bytearray))
# Normally Image loads lazily, this forces it to do loading in the timer scope.
image.load()
s = np.array(image) / 255.0
s = np.array(image, dtype=np.float32) / 255.0
if gray_scale:
s = np.mean(s, axis=2)
s = np.reshape(s, [s.shape[0], s.shape[1], 1])

@timed
def observation_to_np_array(
obs: ObservationProto, expected_shape: Optional[Iterable[int]] = None
) -> np.ndarray:
"""
Converts observation proto into numpy array of the appropriate size.
:param obs: observation proto to be converted
:param expected_shape: optional shape information, used for sanity checks.
:return: processed numpy array of observation from environment
"""
if expected_shape is not None:
if list(obs.shape) != list(expected_shape):
raise UnityObservationException(
f"Observation did not have the expected shape - got {obs.shape} but expected {expected_shape}"
)
gray_scale = obs.shape[2] == 1
if obs.compression_type == COMPRESSION_NONE:
img = np.array(obs.float_data.data, dtype=np.float32)
img = np.reshape(img, obs.shape)
return img
else:
img = process_pixels(obs.compressed_data, gray_scale)
# Compare decompressed image size to observation shape and make sure they match
if list(obs.shape) != list(img.shape):
raise UnityObservationException(
f"Decompressed observation did not have the expected shape - "
f"decompressed had {img.shape} but expected {obs.shape}"
)
return img
@timed
def _process_visual_observation(
obs_index: int,
shape: Tuple[int, int, int],

if len(agent_info_list) == 0:
return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32)
gray_scale = shape[2] == 1
process_pixels(agent_obs.observations[obs_index].compressed_data, gray_scale)
observation_to_np_array(agent_obs.observations[obs_index], shape)
for agent_obs in agent_info_list
]
return np.array(batched_visual, dtype=np.float32)
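
A small usage sketch of the observation_to_np_array helper added above (assumes the imports shown in this file; the 84x84 RGB shape is just an example):

import numpy as np
from mlagents_envs.communicator_objects.observation_pb2 import ObservationProto, NONE
from mlagents_envs.rpc_utils import observation_to_np_array

# Build an uncompressed observation proto and round-trip it through the helper.
arr = np.random.rand(84, 84, 3).astype(np.float32)
obs = ObservationProto()
obs.float_data.data.extend(arr.flatten().tolist())
obs.compression_type = NONE
obs.shape.extend(arr.shape)

decoded = observation_to_np_array(obs, expected_shape=(84, 84, 3))
assert decoded.shape == (84, 84, 3)
# A mismatched expected_shape raises UnityObservationException instead of returning.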

2
ml-agents-envs/mlagents_envs/tests/test_envs.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

43
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


import io
import numpy as np
import pytest
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents_envs.communicator_objects.observation_pb2 import (
ObservationProto,

from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import numpy as np
import io
from mlagents_envs.exception import UnityObservationException
from mlagents_envs.rpc_utils import (
agent_group_spec_from_proto,
process_pixels,

return obs_proto
def generate_uncompressed_proto_obs(in_array: np.ndarray) -> ObservationProto:
obs_proto = ObservationProto()
obs_proto.float_data.data.extend(in_array.flatten().tolist())
obs_proto.compression_type = NONE
obs_proto.shape.extend(in_array.shape)
return obs_proto
in_array = np.random.rand(128, 128, 3)
in_array = np.random.rand(128, 64, 3)
assert out_array.shape == (128, 128, 3)
assert out_array.shape == (128, 64, 3)
in_array = np.random.rand(128, 128, 3)
in_array = np.random.rand(128, 64, 3)
assert out_array.shape == (128, 128, 1)
assert out_array.shape == (128, 64, 1)
assert np.mean(in_array.mean(axis=2, keepdims=True) - out_array) < 0.01
assert (in_array.mean(axis=2, keepdims=True) - out_array < 0.01).all()

def test_process_visual_observation():
in_array_1 = np.random.rand(128, 128, 3)
in_array_1 = np.random.rand(128, 64, 3)
in_array_2 = np.random.rand(128, 128, 3)
proto_obs_2 = generate_compressed_proto_obs(in_array_2)
in_array_2 = np.random.rand(128, 64, 3)
proto_obs_2 = generate_uncompressed_proto_obs(in_array_2)
arr = _process_visual_observation(0, (128, 128, 3), ap_list)
assert list(arr.shape) == [2, 128, 128, 3]
arr = _process_visual_observation(0, (128, 64, 3), ap_list)
assert list(arr.shape) == [2, 128, 64, 3]
def test_process_visual_observation_bad_shape():
in_array_1 = np.random.rand(128, 64, 3)
proto_obs_1 = generate_compressed_proto_obs(in_array_1)
ap1 = AgentInfoProto()
ap1.observations.extend([proto_obs_1])
ap_list = [ap1]
with pytest.raises(UnityObservationException):
_process_visual_observation(0, (128, 42, 3), ap_list)
def test_batched_step_result_from_proto():

84
ml-agents/mlagents/trainers/agent_processor.py


import sys
from typing import List, Dict
from collections import defaultdict, Counter
from typing import List, Dict, Deque, TypeVar, Generic
from collections import defaultdict, Counter, deque
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.policy import Policy
T = TypeVar("T")
class AgentProcessor:
"""

def __init__(
self,
trainer: Trainer,
policy: TFPolicy,
behavior_id: str,
stats_reporter: StatsReporter,

self.episode_steps: Counter = Counter()
self.episode_rewards: Dict[str, float] = defaultdict(float)
self.stats_reporter = stats_reporter
self.trainer = trainer
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.behavior_id = behavior_id
def add_experiences(

next_obs=next_obs,
behavior_id=self.behavior_id,
)
# This will eventually be replaced with a queue
self.trainer.process_trajectory(trajectory)
for traj_queue in self.trajectory_queues:
traj_queue.put(trajectory)
self.experience_buffers[agent_id] = []
if next_info.local_done[next_idx]:
self.stats_reporter.add_stat(

del self.episode_rewards[agent_id]
elif not next_info.local_done[next_idx]:
self.episode_steps[agent_id] += 1
def publish_trajectory_queue(
self, trajectory_queue: "AgentManagerQueue[Trajectory]"
) -> None:
"""
Adds a trajectory queue to the list of queues to publish to when this AgentProcessor
assembles a Trajectory
:param trajectory_queue: Trajectory queue to publish to.
"""
self.trajectory_queues.append(trajectory_queue)
class AgentManagerQueue(Generic[T]):
"""
Queue used by the AgentManager. Note that we make our own class here because in most implementations
deque is sufficient and faster. However, if we want to switch to multiprocessing, we'll need to change
out this implementation.
"""
class Empty(Exception):
"""
Exception for when the queue is empty.
"""
pass
def __init__(self, behavior_id: str):
"""
Initializes an AgentManagerQueue. Note that we can give it a behavior_id so that it can be identified
separately from an AgentManager.
"""
self.queue: Deque[T] = deque()
self.behavior_id = behavior_id
def empty(self) -> bool:
return len(self.queue) == 0
def get_nowait(self) -> T:
try:
return self.queue.popleft()
except IndexError:
raise self.Empty("The AgentManagerQueue is empty.")
def put(self, item: T) -> None:
self.queue.append(item)
class AgentManager(AgentProcessor):
"""
An AgentManager is an AgentProcessor that also holds a single trajectory and policy queue.
Note: this leaves room for adding AgentProcessors that publish multiple trajectory queues.
"""
def __init__(
self,
policy: TFPolicy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
):
super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length)
self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
self.behavior_id
)
self.policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue(
self.behavior_id
)
self.publish_trajectory_queue(self.trajectory_queue)
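
A minimal usage sketch of the AgentManagerQueue added above (names taken from the diff; the behavior id string is arbitrary):

from mlagents.trainers.agent_processor import AgentManagerQueue

queue = AgentManagerQueue(behavior_id="3DBallLearning")
queue.put(42)
assert not queue.empty()
assert queue.get_nowait() == 42
try:
    queue.get_nowait()           # nothing left to read
except AgentManagerQueue.Empty:
    pass                         # raised when the queue is empty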

32
ml-agents/mlagents/trainers/brain.py


import logging
import numpy as np
import io
from mlagents_envs.timers import hierarchical_timer, timed
from mlagents_envs.timers import timed
from mlagents_envs import rpc_utils
from PIL import Image
logger = logging.getLogger("mlagents.trainers")

@staticmethod
@timed
def process_pixels(image_bytes: bytes, gray_scale: bool) -> np.ndarray:
"""
Converts byte array observation image into numpy array, re-sizes it,
and optionally converts it to grey scale
:param gray_scale: Whether to convert the image to grayscale.
:param image_bytes: input byte array corresponding to image
:return: processed numpy array of observation from environment
"""
with hierarchical_timer("image_decompress"):
image_bytearray = bytearray(image_bytes)
image = Image.open(io.BytesIO(image_bytearray))
# Normally Image loads lazily, this forces it to do loading in the timer scope.
image.load()
s = np.array(image) / 255.0
if gray_scale:
s = np.mean(s, axis=2)
s = np.reshape(s, [s.shape[0], s.shape[1], 1])
return s
@staticmethod
@timed
def from_agent_proto(
worker_id: int,
agent_info_list: Collection[

vis_obs: List[np.ndarray] = []
for i in range(brain_params.number_visual_observations):
# TODO check compression type, handle uncompressed visuals
BrainInfo.process_pixels(
agent_obs[i].compressed_data,
brain_params.camera_resolutions[i].gray_scale,
rpc_utils.observation_to_np_array(
agent_obs[i], brain_params.camera_resolutions[i]
)
for agent_obs in visual_observation_protos
]

58
ml-agents/mlagents/trainers/curriculum.py


import os
import json
import math
from typing import Dict, Any, TextIO

logger = logging.getLogger("mlagents.trainers")
class Curriculum(object):
def __init__(self, location):
class Curriculum:
def __init__(self, brain_name: str, config: Dict):
:param location: Path to JSON defining curriculum.
:param brain_name: Name of the brain this Curriculum is associated with
:param config: Dictionary of fields needed to configure the Curriculum
# The name of the brain should be the basename of the file without the
# extension.
self._brain_name = os.path.basename(location).split(".")[0]
self.data = Curriculum.load_curriculum_file(location)
self.brain_name = brain_name
self.config = config
self.smoothing_value = 0
self.smoothing_value = 0.0
for key in [
"parameters",
"measure",

]:
if key not in self.data:
if key not in self.config:
"{0} does not contain a " "{1} field.".format(location, key)
f"{brain_name} curriculum config does not contain a {key} field."
self.measure = self.data["measure"]
self.min_lesson_length = self.data["min_lesson_length"]
self.max_lesson_num = len(self.data["thresholds"])
self.measure = self.config["measure"]
self.min_lesson_length = self.config["min_lesson_length"]
self.max_lesson_num = len(self.config["thresholds"])
parameters = self.data["parameters"]
parameters = self.config["parameters"]
"The parameter {0} in Curriculum {1} must have {2} values "
"but {3} were found".format(
key, location, self.max_lesson_num + 1, len(parameters[key])
)
f"The parameter {key} in {brain_name}'s curriculum must have {self.max_lesson_num + 1} values "
f"but {len(parameters[key])} were found"
)
@property

steps completed).
:return Whether the lesson was incremented.
"""
if not self.data or not measure_val or math.isnan(measure_val):
if not self.config or not measure_val or math.isnan(measure_val):
if self.data["signal_smoothing"]:
if self.config["signal_smoothing"]:
if measure_val > self.data["thresholds"][self.lesson_num]:
if measure_val > self.config["thresholds"][self.lesson_num]:
parameters = self.data["parameters"]
parameters = self.config["parameters"]
self._brain_name,
self.brain_name,
self.lesson_num,
", ".join([str(x) + " -> " + str(config[x]) for x in config]),
)

current lesson is returned.
:return: The configuration of the reset parameters.
"""
if not self.data:
if not self.config:
parameters = self.data["parameters"]
parameters = self.config["parameters"]
def load_curriculum_file(location: str) -> None:
def load_curriculum_file(config_path: str) -> Dict:
with open(location) as data_file:
with open(config_path) as data_file:
"The file {0} could not be found.".format(location)
"The file {0} could not be found.".format(config_path)
"There was an error decoding {}".format(location)
"There was an error decoding {}".format(config_path)
def _load_curriculum(fp: TextIO) -> None:
def _load_curriculum(fp: TextIO) -> Dict:
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:
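
A sketch of the new Curriculum construction path shown above (brain name plus an already-loaded config dict); the config values are illustrative, not from this commit:

from mlagents.trainers.curriculum import Curriculum

config = {
    "measure": "reward",
    "thresholds": [0.5, 0.75],
    "min_lesson_length": 100,
    "signal_smoothing": True,
    "parameters": {"difficulty": [1.0, 2.0, 3.0]},  # len(thresholds) + 1 values per parameter
}
curriculum = Curriculum("WallJumpLearning", config)

# The old path-based constructor is gone; loading from JSON now goes through
# Curriculum.load_curriculum_file(config_path), and the resulting dict is passed in as above.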

78
ml-agents/mlagents/trainers/learn.py


import logging
import argparse
from multiprocessing import Process, Queue
import os
import glob
import shutil

import mlagents_envs
from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.exception import TrainerError
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import load_config, TrainerFactory
from mlagents.trainers.stats import TensorboardWriter, CSVWriter, StatsReporter

class CommandLineOptions(NamedTuple):
debug: bool
num_runs: int
seed: int
env_path: str
run_id: str

help="The directory name for model and summary statistics",
)
parser.add_argument(
"--num-runs", default=1, type=int, help="Number of concurrent training sessions"
)
parser.add_argument(
"--save-freq", default=50000, type=int, help="Frequency at which to save model"
)
parser.add_argument(

return CommandLineOptions.from_argparse(args)
def run_training(
sub_id: int, run_seed: int, options: CommandLineOptions, process_queue: Queue
) -> None:
def run_training(run_seed: int, options: CommandLineOptions) -> None:
:param process_queue: Queue used to send signal back to main.
:param sub_id: Unique id for training session.
:param options: parsed command line arguments
:param run_seed: Random seed used for training.
:param run_options: Command line arguments for training.

curriculum_folder = options.curriculum_folder
# Recognize and use docker volume if one is passed as an argument
if not options.docker_target_name:
model_path = "./models/{run_id}-{sub_id}".format(
run_id=options.run_id, sub_id=sub_id
)
model_path = f"./models/{options.run_id}"
trainer_config_path = "/{docker_target_name}/{trainer_config_path}".format(
docker_target_name=options.docker_target_name,
trainer_config_path=trainer_config_path,
)
trainer_config_path = f"/{options.docker_target_name}/{trainer_config_path}"
curriculum_folder = "/{docker_target_name}/{curriculum_folder}".format(
docker_target_name=options.docker_target_name,
curriculum_folder=curriculum_folder,
)
model_path = "/{docker_target_name}/models/{run_id}-{sub_id}".format(
docker_target_name=options.docker_target_name,
run_id=options.run_id,
sub_id=sub_id,
)
summaries_dir = "/{docker_target_name}/summaries".format(
docker_target_name=options.docker_target_name
)
curriculum_folder = f"/{options.docker_target_name}/{curriculum_folder}"
model_path = f"/{options.docker_target_name}/models/{options.run_id}"
summaries_dir = f"/{options.docker_target_name}/summaries"
port = options.base_port + (sub_id * options.num_envs)
port = options.base_port
# Configure CSV, Tensorboard Writers and StatsReporter
# We assume reward and episode length are needed in the CSV.

trainer_factory,
model_path,
summaries_dir,
options.run_id + "-" + str(sub_id),
options.run_id,
options.save_freq,
maybe_meta_curriculum,
options.train_model,

)
# Signal that environment has been launched.
process_queue.put(True)
# Begin training
try:
tc.start_learning(env_manager)

return None
else:
meta_curriculum = MetaCurriculum(curriculum_folder)
meta_curriculum = MetaCurriculum.from_directory(curriculum_folder)
meta_curriculum.set_all_curriculums_to_lesson_num(lesson)
meta_curriculum.set_all_curricula_to_lesson_num(lesson)
return meta_curriculum

else:
# disable noisy warnings from tensorflow.
tf_utils.set_warnings_enabled(False)
if options.env_path is None and options.num_runs > 1:
raise TrainerError(
"It is not possible to launch more than one concurrent training session "
"when training from the editor."
)
jobs = []
if options.num_runs == 1:
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(0, run_seed, options, Queue())
else:
for i in range(options.num_runs):
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
process_queue = Queue()
p = Process(target=run_training, args=(i, run_seed, options, process_queue))
jobs.append(p)
p.start()
# Wait for signal that environment has successfully launched
while process_queue.get() is not True:
continue
# Wait for jobs to complete. Otherwise we'll have an extra
# unhandled KeyboardInterrupt if we end early.
try:
for job in jobs:
job.join()
except KeyboardInterrupt:
pass
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(run_seed, options)
# For python debugger to directly run this script

118
ml-agents/mlagents/trainers/meta_curriculum.py


logger = logging.getLogger("mlagents.trainers")
class MetaCurriculum(object):
"""A MetaCurriculum holds curriculums. Each curriculum is associated to a
class MetaCurriculum:
"""A MetaCurriculum holds curricula. Each curriculum is associated to a
def __init__(self, curriculum_folder: str):
def __init__(self, curricula: Dict[str, Curriculum]):
Args:
curriculum_folder (str): The relative or absolute path of the
folder which holds the curriculums for this environment.
The folder should contain JSON files whose names are the
brains that the curriculums belong to.
default_reset_parameters (dict): The default reset parameters
of the environment.
:param curriculum_folder: Dictionary of brain_name to the
Curriculum for each brain.
self._brains_to_curriculums: Dict[str, Curriculum] = {}
self._brains_to_curricula: Dict[str, Curriculum] = {}
for brain_name, curriculum in curricula.items():
self._brains_to_curricula[brain_name] = curriculum
config_keys: Set[str] = set(curriculum.get_config().keys())
try:
for curriculum_filename in os.listdir(curriculum_folder):
# This process requires JSON files
brain_name, extension = os.path.splitext(curriculum_filename)
if extension.lower() != ".json":
continue
curriculum_filepath = os.path.join(
curriculum_folder, curriculum_filename
# Check if any two curricula use the same reset params.
if config_keys & used_reset_parameters:
logger.warning(
"Two or more curricula will "
"attempt to change the same reset "
"parameter. The result will be "
"non-deterministic."
curriculum = Curriculum(curriculum_filepath)
config_keys: Set[str] = set(curriculum.get_config().keys())
# Check if any two curriculums use the same reset params.
if config_keys & used_reset_parameters:
logger.warning(
"Two or more curriculums will "
"attempt to change the same reset "
"parameter. The result will be "
"non-deterministic."
)
used_reset_parameters.update(config_keys)
self._brains_to_curriculums[brain_name] = curriculum
except NotADirectoryError:
raise MetaCurriculumError(
curriculum_folder + " is not a "
"directory. Refer to the ML-Agents "
"curriculum learning docs."
)
used_reset_parameters.update(config_keys)
def brains_to_curriculums(self):
def brains_to_curricula(self):
return self._brains_to_curriculums
return self._brains_to_curricula
for brain_name, curriculum in self.brains_to_curriculums.items():
for brain_name, curriculum in self.brains_to_curricula.items():
lesson_nums[brain_name] = curriculum.lesson_num
return lesson_nums

for brain_name, lesson in lesson_nums.items():
self.brains_to_curriculums[brain_name].lesson_num = lesson
self.brains_to_curricula[brain_name].lesson_num = lesson
def _lesson_ready_to_increment(
self, brain_name: str, reward_buff_size: int

Whether the curriculum of the specified brain should attempt to
increment its lesson.
"""
if brain_name not in self.brains_to_curriculums:
if brain_name not in self.brains_to_curricula:
self.brains_to_curriculums[brain_name].min_lesson_length
self.brains_to_curricula[brain_name].min_lesson_length
"""Attempts to increments all the lessons of all the curriculums in this
"""Attempts to increments all the lessons of all the curricula in this
MetaCurriculum. Note that calling this method does not guarantee the
lesson of a curriculum will increment. The lesson of a curriculum will
only increment if the specified measure threshold defined in the

for brain_name, buff_size in reward_buff_sizes.items():
if self._lesson_ready_to_increment(brain_name, buff_size):
measure_val = measure_vals[brain_name]
ret[brain_name] = self.brains_to_curriculums[
ret[brain_name] = self.brains_to_curricula[
ret[brain_name] = self.brains_to_curriculums[
brain_name
].increment_lesson(measure_val)
ret[brain_name] = self.brains_to_curricula[brain_name].increment_lesson(
measure_val
)
def set_all_curriculums_to_lesson_num(self, lesson_num):
"""Sets all the curriculums in this meta curriculum to a specified
def set_all_curricula_to_lesson_num(self, lesson_num):
"""Sets all the curricula in this meta curriculum to a specified
lesson_num (int): The lesson number which all the curriculums will
lesson_num (int): The lesson number which all the curricula will
for _, curriculum in self.brains_to_curriculums.items():
for _, curriculum in self.brains_to_curricula.items():
"""Get the combined configuration of all curriculums in this
"""Get the combined configuration of all curricula in this
Returns:
A dict from parameter to value.
:return: A dict from parameter to value.
for _, curriculum in self.brains_to_curriculums.items():
for _, curriculum in self.brains_to_curricula.items():
@staticmethod
def from_directory(folder_path: str) -> "MetaCurriculum":
"""
Creates a MetaCurriculum given a folder full of curriculum config files.
:param folder_path: The path to the folder which holds the curriculum configs
for this environment. The folder should contain JSON files whose names
are the brains that the curricula belong to.
"""
try:
curricula = {}
for curriculum_filename in os.listdir(folder_path):
# This process requires JSON files
brain_name, extension = os.path.splitext(curriculum_filename)
if extension.lower() != ".json":
continue
curriculum_filepath = os.path.join(folder_path, curriculum_filename)
curriculum_config = Curriculum.load_curriculum_file(curriculum_filepath)
curricula[brain_name] = Curriculum(brain_name, curriculum_config)
return MetaCurriculum(curricula)
except NotADirectoryError:
raise MetaCurriculumError(
f"{folder_path} is not a directory. Refer to the ML-Agents "
"curriculum learning docs."
)
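
A sketch of the two ways to build a MetaCurriculum after this refactor (the folder path and brain names below are placeholders):

from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.meta_curriculum import MetaCurriculum

# 1) From a directory of per-brain JSON curriculum configs:
meta = MetaCurriculum.from_directory("config/curricula/wall_jump/")

# 2) From Curriculum objects built out of already-loaded config dicts:
config = Curriculum.load_curriculum_file("config/curricula/wall_jump/BigWallJump.json")
meta = MetaCurriculum({"BigWallJump": Curriculum("BigWallJump", config)})

print(meta.brains_to_curricula.keys())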

9
ml-agents/mlagents/trainers/ppo/trainer.py


"model_path",
"reward_signals",
]
self.check_param_keys()
self._check_param_keys()
def process_trajectory(self, trajectory: Trajectory) -> None:
def _process_trajectory(self, trajectory: Trajectory) -> None:
super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
# Add to episode_steps

agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self):
def _is_ready_update(self):
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run

def update_policy(self):
def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

14
ml-agents/mlagents/trainers/rl_trainer.py


RewardSignalResults = Dict[str, RewardSignalResult]
class RLTrainer(Trainer):
class RLTrainer(Trainer): # pylint: disable=abstract-method
Contains methods for adding BrainInfos to the Buffer.
"""
def __init__(self, *args, **kwargs):

def clear_update_buffer(self) -> None:
"""
Clear the buffers that have been built up during inference. If
we're not training, this should be called instead of update_policy.
Clear the buffers that have been built up during inference.
def advance(self) -> None:
"""
Steps the trainer, taking in trajectories and updates if ready
"""
super().advance()
if not self.is_training:
self.clear_update_buffer()

9
ml-agents/mlagents/trainers/sac/trainer.py


"vis_encode_type",
]
self.check_param_keys()
self._check_param_keys()
self.load = load
self.seed = seed
self.policy: SACPolicy = None # type: ignore

)
)
def process_trajectory(self, trajectory: Trajectory) -> None:
def _process_trajectory(self, trajectory: Trajectory) -> None:
super()._process_trajectory(trajectory)
last_step = trajectory.steps[-1]
agent_id = trajectory.agent_id # All the agents should have the same ID

agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self) -> bool:
def _is_ready_update(self) -> bool:
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run

)
@timed
def update_policy(self) -> None:
def _update_policy(self) -> None:
"""
If train_interval is met, update the SAC policy given the current reward signals.
If reward_signal_train_interval is met, update the reward signals from the buffer.

13
ml-agents/mlagents/trainers/stats.py


def add_stat(self, key: str, value: float) -> None:
"""
Add a float value stat to the StatsReporter.
:param category: The highest categorization of the statistic, e.g. behavior name.
def set_stat(self, key: str, value: float) -> None:
"""
Sets a stat value to a float. This is for values that we don't want to average, and just
want the latest.
:param key: The type of statistic, e.g. Environment/Reward.
:param value: the value of the statistic.
"""
StatsReporter.stats_dict[self.category][key] = [value]
:param category: The category which to write out the stats.
:param step: Training step which to write these stats as.
"""
values: Dict[str, StatsSummary] = {}

def write_text(self, text: str, step: int) -> None:
"""
Write out some text.
:param category: The highest categorization of the statistic, e.g. behavior name.
:param text: The text to write out.
:param step: Training step which to write these stats as.
"""

def get_stats_summaries(self, key: str) -> StatsSummary:
"""
Get the mean, std, and count of a particular statistic, since last write.
:param category: The highest categorization of the statistic, e.g. behavior name.
:param key: The type of statistic, e.g. Environment/Reward.
:returns: A StatsSummary NamedTuple containing (mean, std, count).
"""

2
ml-agents/mlagents/trainers/tests/mock_brain.py


import unittest.mock as mock
from unittest import mock
import numpy as np
from mlagents.trainers.brain import CameraResolution, BrainParameters

41
ml-agents/mlagents/trainers/tests/test_agent_processor.py


import unittest.mock as mock
from unittest import mock
from mlagents.trainers.agent_processor import AgentProcessor
from mlagents.trainers.agent_processor import (
AgentProcessor,
AgentManager,
AgentManagerQueue,
)
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.stats import StatsReporter

@pytest.mark.parametrize("num_vis_obs", [0, 1, 2], ids=["vec", "1 viz", "2 viz"])
def test_agentprocessor(num_vis_obs):
policy = create_mock_policy()
trainer = mock.Mock()
tqueue = mock.Mock()
trainer,
policy,
name_behavior_id,
max_trajectory_length=5,

num_vector_acts=2,
num_vis_observations=num_vis_obs,
)
processor.publish_trajectory_queue(tqueue)
assert len(trainer.process_trajectory.call_args_list) == 2
assert len(tqueue.put.call_args_list) == 2
trajectory = trainer.process_trajectory.call_args_list[0][0][0]
trajectory = tqueue.put.call_args_list[0][0][0]
def test_agent_manager():
policy = create_mock_policy()
name_behavior_id = "test_brain_name"
manager = AgentManager(
policy,
name_behavior_id,
max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
assert len(manager.trajectory_queues) == 1
assert isinstance(manager.trajectory_queues[0], AgentManagerQueue)
def test_agent_manager_queue():
queue = AgentManagerQueue(behavior_id="testbehavior")
trajectory = mock.Mock(spec=Trajectory)
assert queue.empty()
queue.put(trajectory)
assert not queue.empty()
queue_traj = queue.get_nowait()
assert isinstance(queue_traj, Trajectory)
assert queue.empty()

2
ml-agents/mlagents/trainers/tests/test_bcmodule.py


import unittest.mock as mock
from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb

32
ml-agents/mlagents/trainers/tests/test_curriculum.py


}
"""
dummy_curriculum_config = json.loads(dummy_curriculum_json_str)
bad_curriculum_json_str = """
{

"""
@pytest.fixture
def location():
return "TestBrain.json"
dummy_curriculum_config_path = "TestBrain.json"
@pytest.fixture

@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_init_curriculum_happy_path(mock_file, location, default_reset_parameters):
curriculum = Curriculum(location)
def test_init_curriculum_happy_path():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum._brain_name == "TestBrain"
assert curriculum.brain_name == "TestBrain"
def test_init_curriculum_bad_curriculum_raises_error(
mock_file, location, default_reset_parameters
):
def test_load_bad_curriculum_file_raises_error(mock_file):
Curriculum(location)
Curriculum(
"TestBrain", Curriculum.load_curriculum_file(dummy_curriculum_config_path)
)
@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_increment_lesson(mock_file, location, default_reset_parameters):
curriculum = Curriculum(location)
def test_increment_lesson():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum.lesson_num == 0
curriculum.lesson_num = 1

assert curriculum.lesson_num == 3
@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_get_config(mock_file):
curriculum = Curriculum("TestBrain.json")
def test_get_parameters():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum.get_config() == {"param1": 0.7, "param2": 100, "param3": 0.2}
curriculum.lesson_num = 2

# Test json loading and error handling. These examples don't need to be valid config files.
def test_curriculum_load_good():
expected = {"x": 1}
value = json.dumps(expected)
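For readers following the constructor change being tested here: Curriculum now takes a brain name plus an already-loaded config dict, with Curriculum.load_curriculum_file available for reading the JSON from disk. A minimal sketch, assuming a config shaped like the standard ML-Agents curriculum format (the exact keys below are an assumption; only the lesson-0 parameter values match test_get_parameters above):

# Sketch of the dict-based Curriculum API (config shape is an assumption; values illustrative).
from mlagents.trainers.curriculum import Curriculum

config = {
    "measure": "reward",
    "thresholds": [10, 20, 50],
    "min_lesson_length": 100,
    "signal_smoothing": True,
    "parameters": {
        "param1": [0.7, 0.7, 0.8, 0.9],
        "param2": [100, 50, 20, 15],
        "param3": [0.2, 0.5, 0.5, 0.5],
    },
}
curriculum = Curriculum("TestBrain", config)       # previously: Curriculum("TestBrain.json")
assert curriculum.brain_name == "TestBrain"
print(curriculum.get_config())                     # parameters for the current lesson
curriculum.lesson_num = 1                          # lesson index can be set directly
# Loading from disk instead: Curriculum("TestBrain", Curriculum.load_curriculum_file("TestBrain.json"))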

11
ml-agents/mlagents/trainers/tests/test_learn.py


mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, 0, basic_options(), MagicMock())
learn.run_training(0, basic_options())
"./models/ppo-0",
"./models/ppo",
"ppo-0",
"ppo",
50000,
None,
False,

mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, 0, options_with_docker_target, MagicMock())
learn.run_training(0, options_with_docker_target)
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo-0"
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo"
assert mock_init.call_args[0][2] == "/dockertarget/summaries"

"--lesson=3",
"--load",
"--run-id=myawesomerun",
"--num-runs=3",
"--save-freq=123456",
"--seed=7890",
"--train",

56
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


_check_environment_trains,
BRAIN_NAME,
)
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str
class MetaCurriculumTest(MetaCurriculum):
"""This class allows us to test MetaCurriculum objects without calling
MetaCurriculum's __init__ function.
"""
def __init__(self, brains_to_curriculums):
self._brains_to_curriculums = brains_to_curriculums
from mlagents.trainers.tests.test_curriculum import (
dummy_curriculum_json_str,
dummy_curriculum_config,
)
@pytest.fixture

@patch("mlagents.trainers.curriculum.Curriculum.get_config", return_value={})
@patch("mlagents.trainers.curriculum.Curriculum.__init__", return_value=None)
@patch(
"mlagents.trainers.curriculum.Curriculum.load_curriculum_file",
return_value=dummy_curriculum_config,
)
meta_curriculum = MetaCurriculum("test/")
meta_curriculum = MetaCurriculum.from_directory("test/")
assert len(meta_curriculum.brains_to_curriculums) == 2
assert len(meta_curriculum.brains_to_curricula) == 2
assert "Brain1" in meta_curriculum.brains_to_curriculums
assert "Brain2.test" in meta_curriculum.brains_to_curriculums
assert "Brain1" in meta_curriculum.brains_to_curricula
assert "Brain2.test" in meta_curriculum.brains_to_curricula
calls = [call("test/Brain1.json"), call("test/Brain2.test.json")]

@patch("os.listdir", side_effect=NotADirectoryError())
def test_init_meta_curriculum_bad_curriculum_folder_raises_error(listdir):
with pytest.raises(MetaCurriculumError):
MetaCurriculum("test/")
MetaCurriculum.from_directory("test/")
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.lesson_nums = {"Brain1": 1, "Brain2": 3}

@patch("mlagents.trainers.curriculum.Curriculum")
@patch("mlagents.trainers.curriculum.Curriculum")
def test_increment_lessons(curriculum_a, curriculum_b, measure_vals):
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.increment_lessons(measure_vals)

):
curriculum_a.min_lesson_length = 5
curriculum_b.min_lesson_length = 10
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.increment_lessons(measure_vals, reward_buff_sizes=reward_buff_sizes)

@patch("mlagents.trainers.curriculum.Curriculum")
@patch("mlagents.trainers.curriculum.Curriculum")
def test_set_all_curriculums_to_lesson_num(curriculum_a, curriculum_b):
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.set_all_curriculums_to_lesson_num(2)
meta_curriculum.set_all_curricula_to_lesson_num(2)
assert curriculum_a.lesson_num == 2
assert curriculum_b.lesson_num == 2

):
curriculum_a.get_config.return_value = default_reset_parameters
curriculum_b.get_config.return_value = default_reset_parameters
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
assert meta_curriculum.get_config() == default_reset_parameters

with patch(
"builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str
):
curriculum = Curriculum("TestBrain.json")
mc = MetaCurriculumTest({curriculum_brain_name: curriculum})
curriculum_config = Curriculum.load_curriculum_file("TestBrain.json")
curriculum = Curriculum("TestBrain", curriculum_config)
mc = MetaCurriculum({curriculum_brain_name: curriculum})
_check_environment_trains(env, META_CURRICULUM_CONFIG, mc, -100.0)
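To summarize the renames this test file tracks, a rough sketch of the updated MetaCurriculum entry points; the directory path is illustrative, and the one-JSON-file-per-brain layout is inferred from the mocked load_curriculum_file calls above:

# Sketch of the renamed MetaCurriculum API (paths and measure values are illustrative).
from mlagents.trainers.meta_curriculum import MetaCurriculum

meta_curriculum = MetaCurriculum.from_directory("config/curricula/test/")  # was: MetaCurriculum("test/")
print(meta_curriculum.brains_to_curricula.keys())       # was: brains_to_curriculums
meta_curriculum.set_all_curricula_to_lesson_num(2)      # was: set_all_curriculums_to_lesson_num
meta_curriculum.increment_lessons({"Brain1": 0.2})      # measure values keyed by brain name
# A MetaCurriculum can also be built directly from a dict of Curriculum objects, as the tests above do.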

2
ml-agents/mlagents/trainers/tests/test_multigpu.py


import unittest.mock as mock
from unittest import mock
import pytest
from mlagents.tf_utils import tf

38
ml-agents/mlagents/trainers/tests/test_ppo.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

from mlagents.trainers.models import EncoderType, LearningModel
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.mock_communicator import MockCommunicator
from mlagents.trainers.tests import mock_brain as mb

trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
)
policy_mock = mock.Mock()
policy_mock = mock.Mock(spec=PPOPolicy)
) # 10 hacked becausee this function is no longer called through trainer
) # 10 hacked because this function is no longer called through trainer
trainer.policy = policy_mock
trainer.add_policy("testbehavior", policy_mock)
trainer.increment_step(5)
print(trainer.policy.increment_step(5))
trainer._increment_step(5, "testbehavior")
policy_mock.increment_step.assert_called_with(5)
assert trainer.step == step_count

buffer["curiosity_value_estimates"] = buffer["rewards"]
trainer.update_buffer = buffer
trainer.update_policy()
trainer._update_policy()
trainer.update_policy()
trainer._update_policy()
trainer.update_policy()
trainer._update_policy()
def test_process_trajectory(dummy_config):

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(
brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
)
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,

action_space=2,
)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that trainer put trajectory in update buffer
assert trainer.update_buffer.num_experiences == 15

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that the stats are reset as episode is finished
for reward in trainer.collected_rewards.values():

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.process_trajectory(trajectory)
trainer._process_trajectory(trajectory)
# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trainer._process_trajectory(trajectory)
# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(
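The new pattern these PPO tests adopt: instead of calling process_trajectory on the trainer directly, trajectories are pushed onto an AgentManagerQueue that the trainer has subscribed to, and advance() does the rest. A sketch of that flow, assuming trainer, brain_params and trajectory are set up as in the test above:

# Sketch of the queue-driven trainer flow (trainer/brain_params/trajectory assumed from the test setup).
from mlagents.trainers.agent_processor import AgentManagerQueue

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)

trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

trajectory_queue.put(trajectory)    # replaces trainer.process_trajectory(trajectory)
trainer.advance()                   # drains the queue and updates the policy when ready
assert trainer.update_buffer.num_experiences == len(trajectory.steps)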

2
ml-agents/mlagents/trainers/tests/test_reward_signals.py


import unittest.mock as mock
from unittest import mock
import pytest
import yaml
import os

39
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


import unittest.mock as mock
from unittest import mock
import numpy as np
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer

"""
summary_path: "test/"
summary_freq: 1000
reward_signals:
extrinsic:
strength: 1.0

return mock_brain
def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = RLTrainer(mock_brainparams.brain_name, dummy_config(), True, 0)
return trainer
# Add concrete implementations of abstract methods
class FakeTrainer(RLTrainer):
def get_policy(self, name_behavior_id):
return mock.Mock()
def _is_ready_update(self):
return True
def _update_policy(self):
pass
def add_policy(self):
pass
def create_policy(self):
return mock.Mock()
def create_mock_all_brain_info(brain_info):
return {"MockBrain": brain_info}
def _process_trajectory(self, trajectory):
super()._process_trajectory(trajectory)
def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
return mock_policy
def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = FakeTrainer(mock_brainparams, dummy_config(), True, 0)
return trainer
def test_rl_trainer():

12
ml-agents/mlagents/trainers/tests/test_sac.py


import unittest.mock as mock
from unittest import mock
import pytest
import yaml

from mlagents.trainers.sac.models import SACModel
from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that trainer put trajectory in update buffer
assert trainer.update_buffer.num_experiences == 15

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that the stats are reset as episode is finished
for reward in trainer.collected_rewards.values():

2
ml-agents/mlagents/trainers/tests/test_stats.py


import unittest.mock as mock
from unittest import mock
import os
import pytest
import tempfile

2
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


import unittest.mock as mock
from unittest import mock
from unittest.mock import Mock, MagicMock
import unittest
from queue import Empty as EmptyQueue

28
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


import pytest
from mlagents.tf_utils import tf
from mlagents.trainers.trainer_controller import TrainerController, AgentManager
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.subprocess_env_manager import EnvironmentStep
from mlagents.trainers.sampler_class import SamplerManager

trainer_mock = MagicMock()
trainer_mock.get_step = 0
trainer_mock.get_max_steps = 5
trainer_mock.should_still_train = True
trainer_mock.parameters = {"some": "parameter"}
trainer_mock.write_tensorboard_text = MagicMock()

def take_step_sideeffect(env):
tc.trainers["testbrain"].get_step += 1
if (
not tc.trainers["testbrain"].get_step
<= tc.trainers["testbrain"].get_max_steps
):
tc.trainers["testbrain"].should_still_train = False
if tc.trainers["testbrain"].get_step > 10:
raise KeyboardInterrupt
return 1

trainer_mock.parameters = {"some": "parameter"}
trainer_mock.write_tensorboard_text = MagicMock()
processor_mock = MagicMock()
tc.managers = {"testbrain": AgentManager(processor=processor_mock)}
tc.managers = {"testbrain": MagicMock()}
return tc, trainer_mock

brain_info_dict = {brain_name: Mock()}
old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
trainer_mock.is_ready_update = MagicMock(return_value=True)
trainer_mock._is_ready_update = MagicMock(return_value=True)
env_mock = MagicMock()
env_mock.step.return_value = [new_step_info]

env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
processor_mock = tc.managers[brain_name].processor
processor_mock.add_experiences.assert_called_once_with(
manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
trainer_mock.update_policy.assert_called_once()
trainer_mock.increment_step.assert_called_once()
trainer_mock.advance.assert_called_once()
def test_take_step_if_not_training(trainer_controller_with_take_step_mocks):

old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
trainer_mock.is_ready_update = MagicMock(return_value=False)
trainer_mock._is_ready_update = MagicMock(return_value=False)
env_mock = MagicMock()
env_mock.step.return_value = [new_step_info]

tc.advance(env_mock)
env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
processor_mock = tc.managers[brain_name].processor
processor_mock.add_experiences.assert_called_once_with(
manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
new_step_info.previous_all_brain_info[brain_name],
new_step_info.current_all_brain_info[brain_name],
new_step_info.brain_name_to_action_info[brain_name].outputs,

2
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import io
from unittest.mock import patch
import mlagents.trainers.trainer_util as trainer_util
from mlagents.trainers import trainer_util
from mlagents.trainers.trainer_util import load_config, _load_config
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError

198
ml-agents/mlagents/trainers/trainer.py


# # Unity ML-Agents Toolkit
import logging
from typing import Dict, List, Deque, Any
import time
import abc
from mlagents.tf_utils import tf

from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.policy import Policy
from mlagents_envs.timers import hierarchical_timer
LOGGER = logging.getLogger("mlagents.trainers")

pass
class Trainer(object):
class Trainer(abc.ABC):
"""This class is the base class for the mlagents_envs.trainers"""
def __init__(

self.cumulative_returns_since_policy_update: List[float] = []
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[Policy]] = []
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.training_start_time = time.time()
self.summary_freq = self.trainer_parameters["summary_freq"]
self.next_update_step = self.summary_freq
def check_param_keys(self):
def _check_param_keys(self):
for k in self.param_keys:
if k not in self.trainer_parameters:
raise UnityTrainerException(

LOGGER.info("Could not write text summary for Tensorboard.")
pass
def dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""
Takes a parameter dictionary and converts it to a human-readable string.
Recurses if there are multiple levels of dict. Used to print out hyperparameters.

"\t"
+ " " * num_tabs
+ "{0}:\t{1}".format(
x, self.dict_to_str(param_dict[x], num_tabs + 1)
x, self._dict_to_str(param_dict[x], num_tabs + 1)
)
for x in param_dict
]

return """Hyperparameters for the {0} of brain {1}: \n{2}""".format(
self.__class__.__name__,
self.brain_name,
self.dict_to_str(self.trainer_parameters, 0),
self._dict_to_str(self.trainer_parameters, 0),
)
@property

return self.trainer_parameters
@property
def get_max_steps(self) -> float:
def get_max_steps(self) -> int:
return float(self.trainer_parameters["max_steps"])
return int(float(self.trainer_parameters["max_steps"]))
@property
def get_step(self) -> int:

return self.step
@property
def should_still_train(self) -> bool:
"""
Returns whether or not the trainer should train. A Trainer could
stop training if it wasn't training to begin with, or if max_steps
is reached.
"""
return self.is_training and self.get_step <= self.get_max_steps
@property
def reward_buffer(self) -> Deque[float]:
"""
Returns the reward buffer. The reward buffer contains the cumulative

"""
return self._reward_buffer
def increment_step(self, n_steps: int) -> None:
def _increment_step(self, n_steps: int, name_behavior_id: str) -> None:
self.next_update_step = self.step + (
self.summary_freq - self.step % self.summary_freq
)
p = self.get_policy(name_behavior_id)
if p:
p.increment_step(n_steps)
def save_model(self, name_behavior_id: str) -> None:
"""

"""
self.get_policy(name_behavior_id).export_model()
def write_summary(self, global_step: int, delta_train_start: float) -> None:
def _write_summary(self, step: int) -> None:
:param delta_train_start: Time elapsed since training started.
:param global_step: The number of steps the simulation has been going for
if (
global_step % self.trainer_parameters["summary_freq"] == 0
and global_step != 0
):
is_training = (
"Training."
if self.is_training and self.get_step <= self.get_max_steps
else "Not Training."
is_training = "Training." if self.should_still_train else "Not Training."
stats_summary = self.stats_reporter.get_stats_summaries(
"Environment/Cumulative Reward"
)
if stats_summary.num > 0:
LOGGER.info(
" {}: {}: Step: {}. "
"Time Elapsed: {:0.3f} s "
"Mean "
"Reward: {:0.3f}"
". Std of Reward: {:0.3f}. {}".format(
self.run_id,
self.brain_name,
step,
time.time() - self.training_start_time,
stats_summary.mean,
stats_summary.std,
is_training,
)
step = min(self.get_step, self.get_max_steps)
stats_summary = self.stats_reporter.get_stats_summaries(
"Environment/Cumulative Reward"
)
if stats_summary.num > 0:
LOGGER.info(
" {}: {}: Step: {}. "
"Time Elapsed: {:0.3f} s "
"Mean "
"Reward: {:0.3f}"
". Std of Reward: {:0.3f}. {}".format(
self.run_id,
self.brain_name,
step,
delta_train_start,
stats_summary.mean,
stats_summary.std,
is_training,
)
)
set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
else:
LOGGER.info(
" {}: {}: Step: {}. No episode was completed since last summary. {}".format(
self.run_id, self.brain_name, step, is_training
)
set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
else:
LOGGER.info(
" {}: {}: Step: {}. No episode was completed since last summary. {}".format(
self.run_id, self.brain_name, step, is_training
self.stats_reporter.write_stats(int(step))
)
self.stats_reporter.write_stats(int(step))
def process_trajectory(self, trajectory: Trajectory) -> None:
@abc.abstractmethod
def _process_trajectory(self, trajectory: Trajectory) -> None:
Processing involves calculating value and advantage targets for the model update step.
raise UnityTrainerException(
"The process_experiences method was not implemented."
)
self._maybe_write_summary(self.get_step + len(trajectory.steps))
self._increment_step(len(trajectory.steps), trajectory.behavior_id)
def _maybe_write_summary(self, step_after_process: int) -> None:
"""
If processing the trajectory will make the step exceed the next summary write,
write the summary. This logic ensures summaries are written on the update step and not in between.
:param step_after_process: the step count after processing the next trajectory.
"""
if step_after_process >= self.next_update_step and self.get_step != 0:
self._write_summary(self.next_update_step)
@abc.abstractmethod
raise UnityTrainerException("The end_episode method was not implemented.")
pass
@abc.abstractmethod
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
"""
Creates policy
"""
pass
@abc.abstractmethod
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
"""
Adds policy to trainer
"""
pass
@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer
"""
pass
def is_ready_update(self):
@abc.abstractmethod
def _is_ready_update(self):
raise UnityTrainerException("The is_ready_update method was not implemented.")
return False
def update_policy(self):
@abc.abstractmethod
def _update_policy(self):
raise UnityTrainerException("The update_model method was not implemented.")
pass
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
def advance(self) -> None:
Creates policy
Steps the trainer, taking in trajectories and updates if ready.
raise UnityTrainerException("The create_policy method was not implemented.")
with hierarchical_timer("process_trajectory"):
for traj_queue in self.trajectory_queues:
try:
t = traj_queue.get_nowait()
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
if self.should_still_train:
if self._is_ready_update():
with hierarchical_timer("_update_policy"):
self._update_policy()
for q in self.policy_queues:
# Get policies that correspond to the policy queue in question
q.put(self.get_policy(q.behavior_id))
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
Adds policy to trainer
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
:param queue: Policy queue to publish to.
raise UnityTrainerException("The add_policy method was not implemented")
self.policy_queues.append(policy_queue)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def subscribe_trajectory_queue(
self, trajectory_queue: AgentManagerQueue[Trajectory]
) -> None:
Gets policy from trainer
Adds a trajectory queue to the list of queues from which the trainer ingests Trajectories.
:param trajectory_queue: Trajectory queue to read from.
raise UnityTrainerException("The get_policy method was not implemented.")
def advance(self) -> None:
pass
self.trajectory_queues.append(trajectory_queue)
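One detail worth spelling out in the new _maybe_write_summary/_increment_step pair above: summaries are pinned to the next multiple of summary_freq rather than to whatever step the incoming trajectory happens to land on. A small worked example with illustrative numbers (plain arithmetic, no ML-Agents calls):

# Illustrative values; summary_freq comes from trainer_parameters["summary_freq"].
summary_freq = 1000
step = 950                  # trainer's current get_step
next_update_step = 1000     # next multiple of summary_freq to report at
trajectory_len = 80

step_after_process = step + trajectory_len              # 1030
if step_after_process >= next_update_step and step != 0:
    pass                                                # _write_summary(1000): reported at 1000, not 1030

step += trajectory_len                                   # step is now 1030
next_update_step = step + (summary_freq - step % summary_freq)   # 1030 + 970 = 2000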

161
ml-agents/mlagents/trainers/trainer_controller.py


import sys
import json
import logging
from typing import Dict, List, Optional, Set, NamedTuple
from typing import Dict, List, Optional, Set
from time import time
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.exception import (

from mlagents.trainers.trainer import Trainer
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.agent_processor import AgentProcessor
class AgentManager(NamedTuple):
processor: AgentProcessor
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
class TrainerController(object):

self.save_freq = save_freq
self.train_model = train
self.meta_curriculum = meta_curriculum
self.training_start_time = time()
self.sampler_manager = sampler_manager
self.resampling_interval = resampling_interval
np.random.seed(training_seed)

for (
brain_name,
curriculum,
) in self.meta_curriculum.brains_to_curriculums.items():
) in self.meta_curriculum.brains_to_curricula.items():
measure_val = (
self.trainers[brain_name].get_step
/ self.trainers[brain_name].get_max_steps
measure_val = self.trainers[brain_name].get_step / float(
self.trainers[brain_name].get_max_steps
)
brain_names_to_measure_vals[brain_name] = measure_val
elif curriculum.measure == "reward":

def _not_done_training(self) -> bool:
return (
any(t.get_step <= t.get_max_steps for k, t in self.trainers.items())
any(t.should_still_train for t in self.trainers.values())
def write_to_tensorboard(self, global_step: int) -> None:
for brain_name, trainer in self.trainers.items():
# Write training statistics to Tensorboard.
delta_train_start = time() - self.training_start_time
if (
self.meta_curriculum
and brain_name in self.meta_curriculum.brains_to_curriculums
):
lesson_num = self.meta_curriculum.brains_to_curriculums[
brain_name
].lesson_num
trainer.stats_reporter.add_stat("Environment/Lesson", lesson_num)
trainer.write_summary(global_step, delta_train_start)
def _create_trainer_and_manager(
self, env_manager: EnvManager, name_behavior_id: str
) -> None:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(name_behavior_id, policy)
env_manager.set_policy(name_behavior_id, policy)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
agent_manager = AgentManager(
policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
)
trainer.publish_policy_queue(agent_manager.policy_queue)
trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
self.managers[name_behavior_id] = agent_manager
def start_learning(self, env_manager: EnvManager) -> None:
self._create_model_path(self.model_path)

external_brain_behavior_ids = set(env_manager.external_brains.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
for name_behavior_id in new_behavior_ids:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text(
"Hyperparameters", trainer.parameters
)
policy = trainer.create_policy(
env_manager.external_brains[name_behavior_id]
)
trainer.add_policy(name_behavior_id, policy)
env_manager.set_policy(name_behavior_id, policy)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
agent_manager = AgentManager(
processor=AgentProcessor(
trainer,
policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
)
)
self.managers[name_behavior_id] = agent_manager
self._create_trainer_and_manager(env_manager, name_behavior_id)
n_steps = self.advance(env_manager)
for _ in range(n_steps):
global_step += 1

self._save_model()
self.write_to_tensorboard(global_step)
# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model()

if meta_curriculum_reset or generalization_reset:
self.end_trainer_episodes(env, lessons_incremented)
@timed
def advance(self, env: EnvManager) -> int:
def _get_and_process_experiences(self, env: EnvManager) -> int:
# Get new policies if found
for brain_name in self.trainers.keys():
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
try:
_policy = self.managers[
name_behavior_id
].policy_queue.get_nowait()
env.set_policy(name_behavior_id, _policy)
except AgentManagerQueue.Empty:
pass
# Step the environment
# Add to AgentProcessor
for step_info in new_step_infos:
for name_behavior_id in step_info.name_behavior_ids:
if name_behavior_id not in self.managers:

)
)
continue
_processor = self.managers[name_behavior_id].processor
_processor.add_experiences(
_processor = self.managers[name_behavior_id].add_experiences(
return len(new_step_infos)
for brain_name, trainer in self.trainers.items():
if self.train_model and trainer.get_step <= trainer.get_max_steps:
n_steps = len(new_step_infos)
trainer.increment_step(n_steps)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
trainer.get_policy(name_behavior_id).increment_step(n_steps)
if trainer.is_ready_update():
# Perform gradient descent with experience buffer
with hierarchical_timer("update_policy"):
trainer.update_policy()
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
env.set_policy(
name_behavior_id, trainer.get_policy(name_behavior_id)
)
else:
# Avoid memory leak during inference
# Eventually this whole block will take place in advance()
# But currently this only calls clear_update_buffer() in RLTrainer
# and nothing in the base class
@timed
def advance(self, env: EnvManager) -> int:
# Get steps
num_steps = self._get_and_process_experiences(env)
# Report current lesson
if self.meta_curriculum:
for brain_name, curr in self.meta_curriculum.brains_to_curricula.items():
if brain_name in self.trainers:
self.trainers[brain_name].stats_reporter.set_stat(
"Environment/Lesson", curr.lesson_num
)
# Advance trainers. This can be done in a separate loop in the future.
with hierarchical_timer("trainer_advance"):
for trainer in self.trainers.values():
return len(new_step_infos)
return num_steps
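To make the producer/consumer wiring above concrete: _create_trainer_and_manager builds one AgentManager per behavior and cross-connects its queues with the trainer, roughly as in this sketch (argument order follows the code above; trainer, policy and name_behavior_id are assumed to already exist):

# Sketch of the queue wiring between TrainerController, AgentManager and Trainer (illustrative).
import sys
from mlagents.trainers.agent_processor import AgentManager

agent_manager = AgentManager(
    policy,                                              # from trainer.create_policy(...)
    name_behavior_id,
    trainer.stats_reporter,
    trainer.parameters.get("time_horizon", sys.maxsize),
)
trainer.publish_policy_queue(agent_manager.policy_queue)            # trainer -> manager: updated policies
trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)  # manager -> trainer: finished trajectories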

6
ml-agents/mlagents/trainers/trainer_util.py


min_lesson_length = 1
if meta_curriculum:
if brain_name in meta_curriculum.brains_to_curriculums:
min_lesson_length = meta_curriculum.brains_to_curriculums[
if brain_name in meta_curriculum.brains_to_curricula:
min_lesson_length = meta_curriculum.brains_to_curricula[
f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
f"Brains with curricula: {meta_curriculum.brains_to_curricula.keys()}. "
)
trainer: Trainer = None # type: ignore # will be set to one of these, or raise

1
test_requirements.txt


# Test-only dependencies should go here, not in setup.py
pytest>4.0.0,<6.0.0
pytest-cov==2.6.1
pytest-xdist

105
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs


using NUnit.Framework;
using UnityEngine;
using MLAgents.Sensor;
namespace MLAgents.Tests
{
public class Float2DSensor : ISensor
{
public int Width { get; }
public int Height { get; }
string m_Name;
int[] m_Shape;
public float[,] floatData;
public Float2DSensor(int width, int height, string name)
{
Width = width;
Height = height;
m_Name = name;
m_Shape = new[] { height, width, 1 };
floatData = new float[Height, Width];
}
public Float2DSensor(float[,] floatData, string name)
{
this.floatData = floatData;
Height = floatData.GetLength(0);
Width = floatData.GetLength(1);
m_Name = name;
m_Shape = new[] { Height, Width, 1 };
}
public string GetName()
{
return m_Name;
}
public int[] GetObservationShape()
{
return m_Shape;
}
public byte[] GetCompressedObservation()
{
return null;
}
public int Write(WriteAdapter adapter)
{
using (TimerStack.Instance.Scoped("Float2DSensor.Write"))
{
for (var h = 0; h < Height; h++)
{
for (var w = 0; w < Width; w++)
{
adapter[h, w, 0] = floatData[h, w];
}
}
var numWritten = Height * Width;
return numWritten;
}
}
public void Update() { }
public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.None;
}
}
public class FloatVisualSensorTests
{
[Test]
public void TestFloat2DSensorWrite()
{
var sensor = new Float2DSensor(3, 4, "floatsensor");
for (var h = 0; h < 4; h++)
{
for (var w = 0; w < 3; w++)
{
sensor.floatData[h, w] = 3 * h + w;
}
}
var output = new float[12];
var writer = new WriteAdapter();
writer.SetTarget(output, sensor.GetObservationShape(), 0);
sensor.Write(writer);
for (var i = 0; i < 9; i++)
{
Assert.AreEqual(i, output[i]);
}
}
[Test]
public void TestFloat2DSensorExternalData()
{
var data = new float[4, 3];
var sensor = new Float2DSensor(data, "floatsensor");
Assert.AreEqual(sensor.Height, 4);
Assert.AreEqual(sensor.Width, 3);
}
}
}

3
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs.meta


fileFormatVersion: 2
guid: 49b7da14949a486b803e28ed32d91a09
timeCreated: 1578093005