Merge branch 'master' into soccer-fives

5 年前 · 5b0aca29
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
 - Agent.CollectObservations now takes a VectorSensor argument. It was also overloaded to optionally take an ActionMasker argument. (#3352, #3389)
 - Beta support for ONNX export was added. If the `tf2onnx` python package is installed, models will be saved to `.onnx` as well as `.nn` format.
 Note that Barracuda 0.6.0 or later is required to import the `.onnx` files properly
+ - Multi-GPU training and the `--multi-gpu` option has been removed temporarily. (#3345)

 ### Minor Changes
 - Monitor.cs was moved to Examples. (#3372)
 - The interface for `RayPerceptionSensor.PerceiveStatic()` was changed to take an input class and write to an output class.
 - The checkpoint file suffix was changed from `.cptk` to `.ckpt` (#3470)
 - The command-line argument used to determine the port that an environment will listen on was changed from `--port` to `--mlagents-port`.
+ - `DemonstrationRecorder` can now record observations outside of the editor.
+ - `DemonstrationRecorder` now has an optional path for the demonstrations. This will default to `Application.dataPath` if not set.
+ - `DemonstrationStore` was changed to accept a `Stream` for its constructor, and was renamed to `DemonstrationWriter`
 - The method `GetStepCount()` on the Agent class has been replaced with the property getter `StepCount`

 ### Bugfixes
--- a/com.unity.ml-agents/Editor/DemonstrationImporter.cs
+++ b/com.unity.ml-agents/Editor/DemonstrationImporter.cs
    [ScriptedImporter(1, new[] {"demo"})]
    internal class DemonstrationImporter : ScriptedImporter
    {
-        const string k_IconPath = "Assets/ML-Agents/Resources/DemoIcon.png";
+        const string k_IconPath = "Packages/com.unity.ml-agents/Editor/Icons/DemoIcon.png";

        public override void OnImportAsset(AssetImportContext ctx)
        {
                var metaDataProto = DemonstrationMetaProto.Parser.ParseDelimitedFrom(reader);
                var metaData = metaDataProto.ToDemonstrationMetaData();

-                reader.Seek(DemonstrationStore.MetaDataBytes + 1, 0);
+                reader.Seek(DemonstrationWriter.MetaDataBytes + 1, 0);
                var brainParamsProto = BrainParametersProto.Parser.ParseDelimitedFrom(reader);
                var brainParameters = brainParamsProto.ToBrainParameters();

--- a/com.unity.ml-agents/Runtime/Academy.cs
+++ b/com.unity.ml-agents/Runtime/Academy.cs
        // Lazy initializer pattern, see https://csharpindepth.com/articles/singleton#lazy
        static Lazy<Academy> s_Lazy = new Lazy<Academy>(() => new Academy());

+        /// <summary>
+        /// True if the Academy is initialized, false otherwise.
+        /// </summary>
+        /// <summary>
+        /// The singleton Academy object.
+        /// </summary>
+        /// <summary>
+        /// Collection of float properties (indexed by a string).
+        /// </summary>
        public IFloatProperties FloatProperties;


        // Signals to all the agents each time the Academy force resets.
        internal event Action AgentForceReset;

-        // Signals that the Academy has been reset by the training process
+        /// <summary>
+        /// Signals that the Academy has been reset by the training process.
+        /// </summary>
        public event Action OnEnvironmentReset;

        AcademyFixedUpdateStepper m_FixedUpdateStepper;

        /// <summary>
        /// Initialize the Academy if it hasn't already been initialized.
-        /// This method is always safe to call; it will have no effect if the Academy is already initialized.
+        /// This method is always safe to call; it will have no effect if the Academy is already
+        /// initialized.
        /// </summary>
        internal void LazyInitialize()
        {
        }

        /// <summary>
-        /// Enable stepping of the Academy during the FixedUpdate phase.  This is done by creating a temporary
-        /// GameObject with a MonoBehavior that calls Academy.EnvironmentStep().
+        /// Enable stepping of the Academy during the FixedUpdate phase. This is done by creating
+        /// a temporary GameObject with a MonoBehaviour that calls Academy.EnvironmentStep().
        /// </summary>
        void EnableAutomaticStepping()
        {
        /// Registers SideChannel to the Academy to send and receive data with Python.
        /// If IsCommunicatorOn is false, the SideChannel will not be registered.
        /// </summary>
-        /// <param name="sideChannel"> The side channel to be registered.</param>
+        /// <param name="channel"> The side channel to be registered.</param>
        public void RegisterSideChannel(SideChannel channel)
        {
            LazyInitialize();
        /// Unregisters SideChannel to the Academy. If the side channel was not registered,
        /// nothing will happen.
        /// </summary>
-        /// <param name="sideChannel"> The side channel to be unregistered.</param>
+        /// <param name="channel"> The side channel to be unregistered.</param>
        public void UnregisterSideChannel(SideChannel channel)
        {
            Communicator?.UnregisterSideChannel(channel);
--- a/com.unity.ml-agents/Runtime/Agent.cs
+++ b/com.unity.ml-agents/Runtime/Agent.cs
 using System.Collections.Generic;
 using UnityEngine;
 using Barracuda;
-using UnityEngine.Serialization;
-    /// observations, actions and current status, that is sent to the Brain.
+    /// observations, actions and current status.
    /// </summary>
    internal struct AgentInfo
    {
        public float[] vectorActions;
    }

-
-    /// Agent Monobehavior class that is attached to a Unity GameObject, making it
+    /// Agent MonoBehaviour class that is attached to a Unity GameObject, making it
-    /// user in <see cref="CollectObservations"/>. On the other hand, actions
-    /// are determined by decisions produced by a Policy. Currently, this
-    /// class is expected to be extended to implement the desired agent behavior.
+    /// user in <see cref="Agent.CollectObservations(VectorSensor)"/> or
+    /// <see cref="Agent.CollectObservations(VectorSensor, ActionMasker)"/>.
+    /// On the other hand, actions are determined by decisions produced by a Policy.
+    /// Currently, this class is expected to be extended to implement the desired agent behavior.
    /// </summary>
    /// <remarks>
    /// Simply speaking, an agent roams through an environment and at each step
    /// little may have changed between successive steps.
    ///
-    /// At any step, an agent may be considered <see cref="m_Done"/>.
-    /// This could occur due to a variety of reasons:
+    /// At any step, an agent may be considered done due to a variety of reasons:
    ///     - The agent reached an end state within its environment.
    ///     - The agent reached the maximum # of steps (i.e. timed out).
    ///     - The academy reached the maximum # of steps (forced agent to be done).
        BehaviorParameters m_PolicyFactory;

        /// This code is here to make the upgrade path for users using maxStep
-        /// easier.  We will hook into the Serialization code and make sure that
+        /// easier. We will hook into the Serialization code and make sure that
        /// agentParameters.maxStep and this.maxStep are in sync.
        [Serializable]
        internal struct AgentParameters
        ActionMasker m_ActionMasker;

        /// <summary>
-        /// Set of DemonstrationStores that the Agent will write its step information to.
-        /// If you use a DemonstrationRecorder component, this will automatically register its DemonstrationStore.
-        /// You can also add your own DemonstrationStore by calling DemonstrationRecorder.AddDemonstrationStoreToAgent()
+        /// Set of DemonstrationWriters that the Agent will write its step information to.
+        /// If you use a DemonstrationRecorder component, this will automatically register its DemonstrationWriter.
+        /// You can also add your own DemonstrationWriter by calling
+        /// DemonstrationRecorder.AddDemonstrationWriterToAgent()
-        internal ISet<DemonstrationStore> DemonstrationStores = new HashSet<DemonstrationStore>();
+        internal ISet<DemonstrationWriter> DemonstrationWriters = new HashSet<DemonstrationWriter>();

        /// <summary>
        /// List of sensors used to generate observations.
        /// </summary>
        internal VectorSensor collectObservationsSensor;

-        /// MonoBehaviour function that is called when the attached GameObject
-        /// becomes enabled or active.
+        /// <summary>
+        /// <inheritdoc cref="OnBeforeSerialize"/>
+        /// </summary>
+            // Manages a serialization upgrade issue from v0.13 to v0.14 where maxStep moved
+            // from AgentParameters (since removed) to Agent
            if (maxStep == 0 && maxStep != agentParameters.maxStep && !hasUpgradedFromAgentParameters)
            {
                maxStep = agentParameters.maxStep;

+        /// <summary>
+        /// <inheritdoc cref="OnAfterDeserialize"/>
+        /// </summary>
+            // Manages a serialization upgrade issue from v0.13 to v0.14 where maxStep moved
+            // from AgentParameters (since removed) to Agent
            if (maxStep == 0 && maxStep != agentParameters.maxStep && !hasUpgradedFromAgentParameters)
            {
                maxStep = agentParameters.maxStep;

-        /// Helper method for the <see cref="OnEnable"/> event, created to
-        /// facilitate testing.
+        /// <summary>
+        /// Initializes the agent. Can be safely called multiple times.
+        /// </summary>
        public void LazyInitialize()
        {
            if (m_Initialized)
            InitializeSensors();
        }

-        /// Monobehavior function that is called when the attached GameObject
-        /// becomes disabled or inactive.
-            DemonstrationStores.Clear();
+            DemonstrationWriters.Clear();

            // If Academy.Dispose has already been called, we don't need to unregister with it.
            // We don't want to even try, because this will lazily create a new Academy!
            m_Brain?.RequestDecision(m_Info, sensors);

            // We also have to write any to any DemonstationStores so that they get the "done" flag.
-            foreach(var demoWriter in DemonstrationStores)
+            foreach(var demoWriter in DemonstrationWriters)
            {
                demoWriter.Record(m_Info, sensors);
            }
            m_Brain = m_PolicyFactory.GeneratePolicy(Heuristic);
        }

+        /// <summary>
        /// Returns the current step counter (within the current episode).
        /// </summary>
        /// <returns>
        /// </returns>
        public virtual float[] Heuristic()
        {
-            throw new UnityAgentsException(string.Format(
+            throw new UnityAgentsException(
-                "{0} GameObject.",
-                gameObject.name));
+                $"{gameObject.name} GameObject.");
        }

        /// <summary>
                collectObservationsSensor = new VectorSensor(param.vectorObservationSize);
                if (param.numStackedVectorObservations > 1)
                {
-                    var stackingSensor = new StackingSensor(collectObservationsSensor, param.numStackedVectorObservations);
+                    var stackingSensor = new StackingSensor(
+                        collectObservationsSensor, param.numStackedVectorObservations);
                    sensors.Add(stackingSensor);
                }
                else
            // Make sure the names are actually unique
            for (var i = 0; i < sensors.Count - 1; i++)
            {
-                Debug.Assert(!sensors[i].GetName().Equals(sensors[i + 1].GetName()), "Sensor names must be unique.");
+                Debug.Assert(
+                    !sensors[i].GetName().Equals(sensors[i + 1].GetName()),
+                    "Sensor names must be unique.");
            }
 #endif
        }

            m_Brain.RequestDecision(m_Info, sensors);

-            // If we have any DemonstrationStores, write the AgentInfo and sensors to them.
-            foreach(var demoWriter in DemonstrationStores)
+            // If we have any DemonstrationWriters, write the AgentInfo and sensors to them.
+            foreach(var demoWriter in DemonstrationWriters)
            {
                demoWriter.Record(m_Info, sensors);
            }
        {
-            for (var i = 0; i < sensors.Count; i++)
+            foreach (var sensor in sensors)
-                sensors[i].Update();
+                sensor.Update();
            }
        }

        /// The agent observation describes the current environment from the
        /// perspective of the agent.
        /// </summary>
+        /// <param name="sensor">
+        /// The vector observations for the agent.
+        /// </param>
        /// <remarks>
        /// An agents observation is any environment information that helps
        /// the Agent achieve its goal. For example, for a fighting Agent, its
        /// Vector observations are added by calling the provided helper methods
        /// on the VectorSensor input:
-        ///     - <see cref="AddObservation(int)"/>
-        ///     - <see cref="AddObservation(float)"/>
-        ///     - <see cref="AddObservation(Vector3)"/>
-        ///     - <see cref="AddObservation(Vector2)"/>
-        ///     - <see cref="AddObservation(Quaternion)"/>
-        ///     - <see cref="AddObservation(bool)"/>
-        ///     - <see cref="AddOneHotObservation(int, int)"/>
+        ///     - <see cref="VectorSensor.AddObservation(int)"/>
+        ///     - <see cref="VectorSensor.AddObservation(float)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Vector3)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Vector2)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Quaternion)"/>
+        ///     - <see cref="VectorSensor.AddObservation(bool)"/>
+        ///     - <see cref="VectorSensor.AddObservation(IEnumerable{float})"/>
+        ///     - <see cref="VectorSensor.AddOneHotObservation(int, int)"/>
        /// Depending on your environment, any combination of these helpers can
        /// be used. They just need to be used in the exact same order each time
        /// this method is called and the resulting size of the vector observation
        }

        /// <summary>
-        /// Collects the vector observations of the agent.
+        /// Collects the vector observations of the agent alongside the masked actions.
+        /// <param name="sensor">
+        /// The vector observations for the agent.
+        /// </param>
+        /// <param name="actionMasker">
+        /// The masked actions for the agent.
+        /// </param>
        /// <remarks>
        /// An agents observation is any environment information that helps
        /// the Agent achieve its goal. For example, for a fighting Agent, its
        /// Vector observations are added by calling the provided helper methods
        /// on the VectorSensor input:
-        ///     - <see cref="AddObservation(int)"/>
-        ///     - <see cref="AddObservation(float)"/>
-        ///     - <see cref="AddObservation(Vector3)"/>
-        ///     - <see cref="AddObservation(Vector2)"/>
-        ///     - <see cref="AddObservation(Quaternion)"/>
-        ///     - <see cref="AddObservation(bool)"/>
-        ///     - <see cref="AddOneHotObservation(int, int)"/>
+        ///     - <see cref="VectorSensor.AddObservation(int)"/>
+        ///     - <see cref="VectorSensor.AddObservation(float)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Vector3)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Vector2)"/>
+        ///     - <see cref="VectorSensor.AddObservation(Quaternion)"/>
+        ///     - <see cref="VectorSensor.AddObservation(bool)"/>
+        ///     - <see cref="VectorSensor.AddObservation(IEnumerable{float})"/>
+        ///     - <see cref="VectorSensor.AddOneHotObservation(int, int)"/>
        /// Depending on your environment, any combination of these helpers can
        /// be used. They just need to be used in the exact same order each time
        /// this method is called and the resulting size of the vector observation
        /// When using Discrete Control, you can prevent the Agent from using a certain
        /// action by masking it. You can call the following method on the ActionMasker
        /// input :
-        ///     - <see cref="SetActionMask(int branch, IEnumerable<int> actionIndices)"/>
-        ///     - <see cref="SetActionMask(int branch, int actionIndex)"/>
-        ///     - <see cref="SetActionMask(IEnumerable<int> actionIndices)"/>
-        ///     - <see cref="SetActionMask(int branch, int actionIndex)"/>
+        ///     - <see cref="ActionMasker.SetActionMask(int)"/>
+        ///     - <see cref="ActionMasker.SetActionMask(int, int)"/>
+        ///     - <see cref="ActionMasker.SetActionMask(int, IEnumerable{int})"/>
+        ///     - <see cref="ActionMasker.SetActionMask(IEnumerable{int})"/>
        /// The branch input is the index of the action, actionIndices are the indices of the
        /// invalid options for that action.
        /// </remarks>
        }

        /// <summary>
-        /// Returns the last action that was decided on by the Agent (returns null if no decision has been made)
+        /// Returns the last action that was decided on by the Agent
+        /// <returns>
+        /// The last action that was decided by the Agent (or null if no decision has been made)
+        /// </returns>
        public float[] GetAction()
        {
            return m_Action.vectorActions;
        /// <param name="min"></param>
        /// <param name="max"></param>
        /// <returns></returns>
-        protected float ScaleAction(float rawAction, float min, float max)
+        protected static float ScaleAction(float rawAction, float min, float max)
        {
            var middle = (min + max) / 2;
            var range = (max - min) / 2;
--- a/com.unity.ml-agents/Tests/Editor/DemonstrationTests.cs
+++ b/com.unity.ml-agents/Tests/Editor/DemonstrationTests.cs
            demoRec.record = true;
            demoRec.demonstrationName = k_DemoName;
            demoRec.demonstrationDirectory = k_DemoDirectory;
-            var demoStore = demoRec.LazyInitialize(fileSystem);
+            var demoWriter = demoRec.LazyInitialize(fileSystem);

            Assert.IsTrue(fileSystem.Directory.Exists(k_DemoDirectory));
            Assert.IsTrue(fileSystem.FileExists(k_DemoDirectory + k_DemoName + k_ExtensionType));
            };


-            demoStore.Record(agentInfo, new System.Collections.Generic.List<ISensor>());
+            demoWriter.Record(agentInfo, new System.Collections.Generic.List<ISensor>());
-            demoStore.Close();
+            demoWriter.Close();
-            demoStore.Record(agentInfo, new System.Collections.Generic.List<ISensor>());
+            demoWriter.Record(agentInfo, new System.Collections.Generic.List<ISensor>());
        }

        public class ObservationAgent : TestAgent

            // Read back the demo file and make sure observations were written
            var reader = fileSystem.File.OpenRead("Assets/Demonstrations/TestBrain.demo");
-            reader.Seek(DemonstrationStore.MetaDataBytes + 1, 0);
+            reader.Seek(DemonstrationWriter.MetaDataBytes + 1, 0);
            BrainParametersProto.Parser.ParseDelimitedFrom(reader);

            var agentInfoProto = AgentInfoActionPairProto.Parser.ParseDelimitedFrom(reader).AgentInfo;
--- a/config/gail_config.yaml
+++ b/config/gail_config.yaml
            strength: 1.0
            gamma: 0.99
            encoding_size: 128
-            demo_path: Project/Assets/Demonstrations/PushblockDemo.demo
+            demo_path: Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo

 Hallway:
    use_recurrent: true
--- a/config/sac_trainer_config.yaml
+++ b/config/sac_trainer_config.yaml
    learning_rate: 3.0e-4
    learning_rate_schedule: constant
    max_steps: 5.0e5
-    memory_size: 256
+    memory_size: 128
    normalize: false
    num_update: 1
    train_interval: 1
    sequence_length: 32
    num_layers: 2
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
    init_entcoef: 0.1
    max_steps: 1.0e7
    summary_freq: 10000
    sequence_length: 32
    num_layers: 1
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
+    summary_freq: 10000
    time_horizon: 64
    use_recurrent: true

    num_layers: 1
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
    gamma: 0.99
    buffer_size: 1024
    batch_size: 64
--- a/config/trainer_config.yaml
+++ b/config/trainer_config.yaml
    learning_rate: 3.0e-4
    learning_rate_schedule: linear
    max_steps: 5.0e5
-    memory_size: 256
+    memory_size: 128
    normalize: false
    num_epoch: 3
    num_layers: 2
    sequence_length: 64
    num_layers: 2
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
    beta: 1.0e-2
    num_epoch: 3
    buffer_size: 1024
    sequence_length: 64
    num_layers: 1
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
    beta: 1.0e-2
    num_epoch: 3
    buffer_size: 1024
    sequence_length: 32
    num_layers: 1
    hidden_units: 128
-    memory_size: 256
+    memory_size: 128
    beta: 1.0e-2
    num_epoch: 3
    buffer_size: 1024
--- a/docs/API-Reference.md
+++ b/docs/API-Reference.md
 # API Reference

-Our developer-facing C# classes (Academy, Agent, Decision and Monitor) have been
-documented to be compatible with Doxygen for auto-generating HTML
-documentation.
+Our developer-facing C# classes have been documented to be compatible with
+Doxygen for auto-generating HTML documentation.

 To generate the API reference, download Doxygen
 and run the following command within the `docs/` directory:
 subdirectory to navigate to the API reference home. Note that `html/` is already
 included in the repository's `.gitignore` file.

-In the near future, we aim to expand our documentation to include all the Unity
-C# classes and Python API.
+In the near future, we aim to expand our documentation to include the Python
+classes.
--- a/docs/Migrating.md
+++ b/docs/Migrating.md
 * The interface for `RayPerceptionSensor.PerceiveStatic()` was changed to take an input class and write to an output class.
 * The `SetActionMask` method must now be called on the optional `ActionMasker` argument of the `CollectObservations` method. (We now consider an action mask as a type of observation)
 * The method `GetStepCount()` on the Agent class has been replaced with the property getter `StepCount`
+* The `--multi-gpu` option has been removed temporarily.

 ### Steps to Migrate
 * Replace your Agent's implementation of `CollectObservations()` with `CollectObservations(VectorSensor sensor)`. In addition, replace all calls to `AddVectorObs()` with `sensor.AddObservation()` or `sensor.AddOneHotObservation()` on the `VectorSensor` passed as argument.
--- a/docs/Training-ML-Agents.md
+++ b/docs/Training-ML-Agents.md
  [here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
  details.
 * `--debug`: Specify this option to enable debug-level logging for some parts of the code.
-* `--multi-gpu`: Setting this flag enables the use of multiple GPU's (if available) during training.
 * `--cpu`: Forces training using CPU only.
 * Engine Configuration :
  * `--width' : The width of the executable window of the environment(s) in pixels
--- a/docs/Training-PPO.md
+++ b/docs/Training-PPO.md
 ### Memory Size

 `memory_size` corresponds to the size of the array of floating point numbers
-used to store the hidden state of the recurrent neural network. This value must
-be a multiple of 4, and should scale with the amount of information you expect
+used to store the hidden state of the recurrent neural network of the policy. This value must
+be a multiple of 2, and should scale with the amount of information you expect
-Typical Range: `64` - `512`
+Typical Range: `32` - `256`

 ## (Optional) Behavioral Cloning Using Demonstrations

--- a/docs/Training-SAC.md
+++ b/docs/Training-SAC.md
 ### Memory Size

 `memory_size` corresponds to the size of the array of floating point numbers
-used to store the hidden state of the recurrent neural network. This value must
-be a multiple of 4, and should scale with the amount of information you expect
+used to store the hidden state of the recurrent neural network in the policy.
+This value must be a multiple of 2, and should scale with the amount of information you expect
-Typical Range: `64` - `512`
+Typical Range: `32` - `256`

 ### (Optional) Save Replay Buffer

--- a/docs/dox-ml-agents.conf
+++ b/docs/dox-ml-agents.conf
 # Doxyfile 1.8.13

 # To generate the C# API documentation, run:
-# 
+#
 # doxygen dox-ml-agents.conf
 #
 # from the ml-agents-docs directory
 # title of most generated pages and in a few other places.
 # The default value is: My Project.

-PROJECT_NAME           = "ML-Agents Toolkit"
+PROJECT_NAME           = "Unity ML-Agents Toolkit"
-PROJECT_NUMBER         = v0.4
+PROJECT_NUMBER         =
-PROJECT_BRIEF          = 
+PROJECT_BRIEF          =

 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.

-OUTPUT_DIRECTORY       = 
+OUTPUT_DIRECTORY       =

 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
 # will be relative from the directory where doxygen is started.
 # This tag requires that the tag FULL_PATH_NAMES is set to YES.

-STRIP_FROM_PATH        = 
+STRIP_FROM_PATH        =

 # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
 # path mentioned in the documentation of a class, which tells the reader which
 # using the -I flag.

-STRIP_FROM_INC_PATH    = 
+STRIP_FROM_INC_PATH    =

 # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
 # less readable) file names. This can be useful is your file systems doesn't
 # "Side Effects:". You can put \n's in the value part of an alias to insert
 # newlines.

-ALIASES                = 
+ALIASES                =
-TCL_SUBST              = 
+TCL_SUBST              =

 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
 # the files are not read by doxygen.

-EXTENSION_MAPPING      = 
+EXTENSION_MAPPING      =

 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
 # sections, marked by \if <section_label> ... \endif and \cond <section_label>
 # ... \endcond blocks.

-ENABLED_SECTIONS       = 
+ENABLED_SECTIONS       =

 # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
 # initial value of a variable or macro / define can have for it to appear in the
 # by doxygen. Whatever the program writes to standard output is used as the file
 # version. For an example see the documentation.

-FILE_VERSION_FILTER    = 
+FILE_VERSION_FILTER    =

 # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
 # by doxygen. The layout file controls the global structure of the generated
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.

-CITE_BIB_FILES         = 
+CITE_BIB_FILES         =

 #---------------------------------------------------------------------------
 # Configuration options related to warning and progress messages
 # messages should be written. If left blank the output is written to standard
 # error (stderr).

-WARN_LOGFILE           = 
+WARN_LOGFILE           =

 #---------------------------------------------------------------------------
 # Configuration options related to the input files
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.

-INPUT                  = ../Project/Assets/ML-Agents/Scripts/Academy.cs \
-                         ../Project/Assets/ML-Agents/Scripts/Agent.cs \
-                         ../Project/Assets/ML-Agents/Scripts/Monitor.cs \
-                         ../Project/Assets/ML-Agents/Scripts/Decision.cs
+INPUT                  = ../com.unity.ml-agents/Runtime/

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # Note that relative paths are relative to the directory from which doxygen is
 # run.

-EXCLUDE                = 
+EXCLUDE                =

 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*

-EXCLUDE_PATTERNS       = 
+EXCLUDE_PATTERNS       =

 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories use the pattern */test/*

-EXCLUDE_SYMBOLS        = 
+EXCLUDE_SYMBOLS        =
-EXAMPLE_PATH           = 
+EXAMPLE_PATH           =

 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-EXAMPLE_PATTERNS       = 
+EXAMPLE_PATTERNS       =

 # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
 # searched for input files to be used with the \include or \dontinclude commands
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # properly processed by doxygen.

-INPUT_FILTER           = 
+INPUT_FILTER           =

 # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
 # basis. Doxygen will compare the file name with each pattern and apply the
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # properly processed by doxygen.

-FILTER_PATTERNS        = 
+FILTER_PATTERNS        =

 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
 # INPUT_FILTER) will also be used to filter the input files that are used for
 # *.ext= (so without naming a filter).
 # This tag requires that the tag FILTER_SOURCE_FILES is set to YES.

-FILTER_SOURCE_PATTERNS = 
+FILTER_SOURCE_PATTERNS =

 # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
 # is part of the input, its contents will be placed on the main page
 # generated with the -Duse-libclang=ON option for CMake.
 # The default value is: NO.

-CLANG_ASSISTED_PARSING = NO
+#CLANG_ASSISTED_PARSING = NO

 # If clang assisted parsing is enabled you can provide the compiler with command
 # line options that you would normally use when invoking the compiler. Note that

-CLANG_OPTIONS          = 
+#CLANG_OPTIONS          =

 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 # while generating the index headers.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

-IGNORE_PREFIX          = 
+IGNORE_PREFIX          =

 #---------------------------------------------------------------------------
 # Configuration options related to the HTML output
 # files will be copied as-is; there are no commands or markers available.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_EXTRA_FILES       = 
+HTML_EXTRA_FILES       =

 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # written to the html output directory.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.

-CHM_FILE               = 
+CHM_FILE               =

 # The HHC_LOCATION tag can be used to specify the location (absolute path
 # including file name) of the HTML help compiler (hhc.exe). If non-empty,

-HHC_LOCATION           = 
+HHC_LOCATION           =

 # The GENERATE_CHI flag controls if a separate .chi index file is generated
 # (YES) or that it should be included in the master .chm file (NO).
 # and project file content.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.

-CHM_INDEX_ENCODING     = 
+CHM_INDEX_ENCODING     =

 # The BINARY_TOC flag controls whether a binary table of contents is generated
 # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
 # the HTML output folder.
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QCH_FILE               = 
+QCH_FILE               =

 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QHP_CUST_FILTER_NAME   = 
+QHP_CUST_FILTER_NAME   =

 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom

-QHP_CUST_FILTER_ATTRS  = 
+QHP_CUST_FILTER_ATTRS  =

 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-QHP_SECT_FILTER_ATTRS  = 
+QHP_SECT_FILTER_ATTRS  =

 # The QHG_LOCATION tag can be used to specify the location of Qt's
 # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-QHG_LOCATION           = 
+QHG_LOCATION           =

 # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
 # generated, together with the HTML files, they form an Eclipse help plugin. To
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
 # This tag requires that the tag USE_MATHJAX is set to YES.

-MATHJAX_EXTENSIONS     = 
+MATHJAX_EXTENSIONS     =

 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site

-MATHJAX_CODEFILE       = 
+MATHJAX_CODEFILE       =

 # When the SEARCHENGINE tag is enabled doxygen will generate a search box for
 # the HTML output. The underlying search engine uses javascript and DHTML and
 # Searching" for details.
 # This tag requires that the tag SEARCHENGINE is set to YES.

-SEARCHENGINE_URL       = 
+SEARCHENGINE_URL       =

 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
 # search data is written to a file for indexing by an external tool. With the
 # projects and redirect the results back to the right project.
 # This tag requires that the tag SEARCHENGINE is set to YES.

-EXTERNAL_SEARCH_ID     = 
+EXTERNAL_SEARCH_ID     =

 # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
 # projects other than the one defined by this configuration file, but that are
 # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
 # This tag requires that the tag SEARCHENGINE is set to YES.

-EXTRA_SEARCH_MAPPINGS  = 
+EXTRA_SEARCH_MAPPINGS  =

 #---------------------------------------------------------------------------
 # Configuration options related to the LaTeX output
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-EXTRA_PACKAGES         = 
+EXTRA_PACKAGES         =

 # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
 # generated LaTeX document. The header should contain everything until the first
 # to HTML_HEADER.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_HEADER           = 
+LATEX_HEADER           =

 # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
 # generated LaTeX document. The footer should contain everything after the last
 # Note: Only use a user-defined footer if you know what you are doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_FOOTER           = 
+LATEX_FOOTER           =

 # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # LaTeX style sheets that are included after the standard style sheets created
 # list).
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_EXTRA_STYLESHEET = 
+LATEX_EXTRA_STYLESHEET =

 # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the LATEX_OUTPUT output

-LATEX_EXTRA_FILES      = 
+LATEX_EXTRA_FILES      =

 # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
 # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
 # default style sheet that doxygen normally uses.
 # This tag requires that the tag GENERATE_RTF is set to YES.

-RTF_STYLESHEET_FILE    = 
+RTF_STYLESHEET_FILE    =

 # Set optional variables used in the generation of an RTF document. Syntax is
 # similar to doxygen's config file. A template extensions file can be generated
-RTF_EXTENSIONS_FILE    = 
+RTF_EXTENSIONS_FILE    =

 # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
 # with syntax highlighting in the RTF output.
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.

-MAN_SUBDIR             = 
+MAN_SUBDIR             =

 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
 # will generate one additional man file for each entity documented in the real
 # overwrite each other's variables.
 # This tag requires that the tag GENERATE_PERLMOD is set to YES.

-PERLMOD_MAKEVAR_PREFIX = 
+PERLMOD_MAKEVAR_PREFIX =

 #---------------------------------------------------------------------------
 # Configuration options related to the preprocessor
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.

-INCLUDE_PATH           = 
+INCLUDE_PATH           =

 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the

-INCLUDE_FILE_PATTERNS  = 
+INCLUDE_FILE_PATTERNS  =

 # The PREDEFINED tag can be used to specify one or more macro names that are
 # defined before the preprocessor is started (similar to the -D option of e.g.
 # definition found in the source code.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

-EXPAND_AS_DEFINED      = 
+EXPAND_AS_DEFINED      =

 # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
 # remove all references to function-like macros that are alone on a line, have
 # the path). If a tag file is not located in the directory in which doxygen is
 # run, you must also specify the path to the tagfile here.

-TAGFILES               = 
+TAGFILES               =
-GENERATE_TAGFILE       = 
+GENERATE_TAGFILE       =

 # If the ALLEXTERNALS tag is set to YES, all external class will be listed in
 # the class index. If set to NO, only the inherited external classes will be
 # interpreter (i.e. the result of 'which perl').
 # The default file (with absolute path) is: /usr/bin/perl.

-PERL_PATH              = /usr/bin/perl
+#PERL_PATH              = /usr/bin/perl

 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 # the mscgen tool resides. If left empty the tool is assumed to be found in the
 # default search path.

-MSCGEN_PATH            = 
+#MSCGEN_PATH            =

 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
-DIA_PATH               = 
+DIA_PATH               =

 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # the path where dot can find it using this tag.
 # This tag requires that the tag HAVE_DOT is set to YES.

-DOT_FONTPATH           = 
+DOT_FONTPATH           =

 # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
 # each documented class showing the direct and indirect inheritance relations.
 # found. If left blank, it is assumed the dot tool can be found in the path.
 # This tag requires that the tag HAVE_DOT is set to YES.

-DOT_PATH               = 
+DOT_PATH               =

 # The DOTFILE_DIRS tag can be used to specify one or more directories that
 # contain dot files that are included in the documentation (see the \dotfile
-DOTFILE_DIRS           = 
+DOTFILE_DIRS           =
-MSCFILE_DIRS           = 
+MSCFILE_DIRS           =
-DIAFILE_DIRS           = 
+DIAFILE_DIRS           =

 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
 # path where java can find the plantuml.jar file. If left blank, it is assumed

-PLANTUML_JAR_PATH      = 
+PLANTUML_JAR_PATH      =
-PLANTUML_CFG_FILE      = 
+PLANTUML_CFG_FILE      =
-PLANTUML_INCLUDE_PATH  = 
+PLANTUML_INCLUDE_PATH  =

 # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
 # that will be shown in the graph. If the number of nodes in a graph becomes
--- a/ml-agents-envs/mlagents_envs/environment.py
+++ b/ml-agents-envs/mlagents_envs/environment.py
    def get_communicator(worker_id, base_port, timeout_wait):
        return RpcCommunicator(worker_id, base_port, timeout_wait)

-    def executable_launcher(self, file_name, docker_training, no_graphics, args):
-        cwd = os.getcwd()
-        file_name = (
-            file_name.strip()
+    @staticmethod
+    def validate_environment_path(env_path: str) -> Optional[str]:
+        # Strip out executable extensions if passed
+        env_path = (
+            env_path.strip()
-        true_filename = os.path.basename(os.path.normpath(file_name))
+        true_filename = os.path.basename(os.path.normpath(env_path))
+
+        if not (glob.glob(env_path) or glob.glob(env_path + ".*")):
+            return None
+
+        cwd = os.getcwd()
+        true_filename = os.path.basename(os.path.normpath(env_path))
-            candidates = glob.glob(os.path.join(cwd, file_name) + ".x86_64")
+            candidates = glob.glob(os.path.join(cwd, env_path) + ".x86_64")
-                candidates = glob.glob(os.path.join(cwd, file_name) + ".x86")
+                candidates = glob.glob(os.path.join(cwd, env_path) + ".x86")
-                candidates = glob.glob(file_name + ".x86_64")
+                candidates = glob.glob(env_path + ".x86_64")
-                candidates = glob.glob(file_name + ".x86")
+                candidates = glob.glob(env_path + ".x86")
-                os.path.join(
-                    cwd, file_name + ".app", "Contents", "MacOS", true_filename
-                )
+                os.path.join(cwd, env_path + ".app", "Contents", "MacOS", true_filename)
-                    os.path.join(file_name + ".app", "Contents", "MacOS", true_filename)
+                    os.path.join(env_path + ".app", "Contents", "MacOS", true_filename)
-                    os.path.join(cwd, file_name + ".app", "Contents", "MacOS", "*")
+                    os.path.join(cwd, env_path + ".app", "Contents", "MacOS", "*")
-                    os.path.join(file_name + ".app", "Contents", "MacOS", "*")
+                    os.path.join(env_path + ".app", "Contents", "MacOS", "*")
-            candidates = glob.glob(os.path.join(cwd, file_name + ".exe"))
+            candidates = glob.glob(os.path.join(cwd, env_path + ".exe"))
-                candidates = glob.glob(file_name + ".exe")
+                candidates = glob.glob(env_path + ".exe")
+        return launch_string
+
+    def executable_launcher(self, file_name, docker_training, no_graphics, args):
+        launch_string = self.validate_environment_path(file_name)
-                "Couldn't launch the {0} environment. "
-                "Provided filename does not match any environments.".format(
-                    true_filename
-                )
+                f"Couldn't launch the {file_name} environment. Provided filename does not match any environments."
            )
        else:
            logger.debug("This is the launch string {}".format(launch_string))
--- a/ml-agents/mlagents/trainers/agent_processor.py
+++ b/ml-agents/mlagents/trainers/agent_processor.py
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)
-            self.stats_reporter.add_stat(
-                "Policy/Learning Rate", take_action_outputs["learning_rate"]
-            )

        terminated_agents: Set[str] = set()
        # Make unique agent_ids that are global across workers
--- a/ml-agents/mlagents/trainers/components/bc/model.py
+++ b/ml-agents/mlagents/trainers/components/bc/model.py
 from mlagents.tf_utils import tf

-from mlagents.trainers.models import LearningModel
+from mlagents.trainers.tf_policy import TFPolicy
-        self,
-        policy_model: LearningModel,
-        learning_rate: float = 3e-4,
-        anneal_steps: int = 0,
+        self, policy: TFPolicy, learning_rate: float = 3e-4, anneal_steps: int = 0
-        :param policy_model: The policy of the learning algorithm
+        :param policy: The policy of the learning algorithm
-        self.policy_model = policy_model
-        self.expert_visual_in = self.policy_model.visual_in
-        self.obs_in_expert = self.policy_model.vector_in
+        self.policy = policy
+        self.expert_visual_in = self.policy.visual_in
+        self.obs_in_expert = self.policy.vector_in
        self.make_inputs()
        self.create_loss(learning_rate, anneal_steps)

        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

-        if self.policy_model.brain.vector_action_space_type == "continuous":
-            action_length = self.policy_model.act_size[0]
+        if self.policy.brain.vector_action_space_type == "continuous":
+            action_length = self.policy.act_size[0]
-            action_length = len(self.policy_model.act_size)
+            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
-                    for i, act_size in enumerate(self.policy_model.act_size)
+                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )
        :param learning_rate: The learning rate for the optimizer
        :param anneal_steps: Number of steps over which to anneal the learning_rate
        """
-        selected_action = self.policy_model.output
-        if self.policy_model.brain.vector_action_space_type == "continuous":
+        selected_action = self.policy.output
+        if self.policy.use_continuous_act:
-            log_probs = self.policy_model.all_log_probs
+            log_probs = self.policy.all_log_probs
            self.loss = tf.reduce_mean(
                -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
            )
-                learning_rate,
-                self.policy_model.global_step,
-                anneal_steps,
-                0.0,
-                power=1.0,
+                learning_rate, self.policy.global_step, anneal_steps, 0.0, power=1.0
-        optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate)
+        optimizer = tf.train.AdamOptimizer(
+            learning_rate=self.annealed_learning_rate, name="bc_adam"
+        )
        self.update_batch = optimizer.minimize(self.loss)
--- a/ml-agents/mlagents/trainers/components/bc/module.py
+++ b/ml-agents/mlagents/trainers/components/bc/module.py
 from mlagents.trainers.tf_policy import TFPolicy
 from .model import BCModel
 from mlagents.trainers.demo_loader import demo_to_buffer
-from mlagents.trainers.trainer import UnityTrainerException
+from mlagents.trainers.exception import UnityTrainerException


 class BCModule:
        """
        self.policy = policy
        self.current_lr = policy_learning_rate * strength
-        self.model = BCModel(policy.model, self.current_lr, steps)
+        self.model = BCModel(policy, self.current_lr, steps)
        _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)

        self.batch_size = batch_size if batch_size else default_batch_size
        Helper function for update_batch.
        """
        feed_dict = {
-            self.policy.model.batch_size: n_sequences,
-            self.policy.model.sequence_length: self.policy.sequence_length,
+            self.policy.batch_size_ph: n_sequences,
+            self.policy.sequence_length_ph: self.policy.sequence_length,
-        if self.policy.model.brain.vector_action_space_type == "continuous":
-            feed_dict[self.policy.model.epsilon] = np.random.normal(
-                size=(1, self.policy.model.act_size[0])
-            )
-        else:
-            feed_dict[self.policy.model.action_masks] = np.ones(
+        if not self.policy.use_continuous_act:
+            feed_dict[self.policy.action_masks] = np.ones(
-                    sum(self.policy.model.brain.vector_action_space_size),
+                    sum(self.policy.brain.vector_action_space_size),
-        if self.policy.model.brain.vector_observation_space_size > 0:
-            feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]
-        for i, _ in enumerate(self.policy.model.visual_in):
-            feed_dict[self.policy.model.visual_in[i]] = mini_batch_demo[
-                "visual_obs%d" % i
-            ]
+        if self.policy.brain.vector_observation_space_size > 0:
+            feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
+        for i, _ in enumerate(self.policy.visual_in):
+            feed_dict[self.policy.visual_in[i]] = mini_batch_demo["visual_obs%d" % i]
-            feed_dict[self.policy.model.memory_in] = np.zeros(
+            feed_dict[self.policy.memory_in] = np.zeros(
-            if not self.policy.model.brain.vector_action_space_type == "continuous":
-                feed_dict[self.policy.model.prev_action] = mini_batch_demo[
-                    "prev_action"
-                ]
+            if not self.policy.use_continuous_act:
+                feed_dict[self.policy.prev_action] = mini_batch_demo["prev_action"]
        network_out = self.policy.sess.run(
            list(self.out_dict.values()), feed_dict=feed_dict
        )
--- a/ml-agents/mlagents/trainers/components/reward_signals/init.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/init.py

 from mlagents.tf_utils import tf

-from mlagents.trainers.trainer import UnityTrainerException
+from mlagents.trainers.exception import UnityTrainerException
-from mlagents.trainers.models import LearningModel

 logger = logging.getLogger("mlagents.trainers")



 class RewardSignal(abc.ABC):
-    def __init__(
-        self,
-        policy: TFPolicy,
-        policy_model: LearningModel,
-        strength: float,
-        gamma: float,
-    ):
+    def __init__(self, policy: TFPolicy, strength: float, gamma: float):
-        :param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.
+        :param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to.
        :param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
        :param gamma: The time discounting factor used for this reward.
        :return: A RewardSignal object.
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.gamma = gamma
        self.policy = policy
-        self.policy_model = policy_model
        self.strength = strength
        self.stats_name_to_update_name: Dict[str, str] = {}

        )

    def prepare_update(
-        self,
-        policy_model: LearningModel,
-        mini_batch: Dict[str, np.ndarray],
-        num_sequences: int,
+        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict
--- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
 from typing import List, Tuple
 from mlagents.tf_utils import tf

-from mlagents.trainers.models import LearningModel
+from mlagents.trainers.models import ModelUtils
+from mlagents.trainers.tf_policy import TFPolicy
-        self,
-        policy_model: LearningModel,
-        encoding_size: int = 128,
-        learning_rate: float = 3e-4,
+        self, policy: TFPolicy, encoding_size: int = 128, learning_rate: float = 3e-4
-        :param policy_model: The model being used by the learning policy
+        :param policy: The policy being trained
-        self.policy_model = policy_model
+        self.policy = policy
        self.next_visual_in: List[tf.Tensor] = []
        encoded_state, encoded_next_state = self.create_curiosity_encoders()
        self.create_inverse_model(encoded_state, encoded_next_state)
        encoded_state_list = []
        encoded_next_state_list = []

-        if self.policy_model.vis_obs_size > 0:
+        if self.policy.vis_obs_size > 0:
-            for i in range(self.policy_model.vis_obs_size):
+            for i in range(self.policy.vis_obs_size):
-                next_visual_input = LearningModel.create_visual_input(
-                    self.policy_model.brain.camera_resolutions[i],
+                next_visual_input = ModelUtils.create_visual_input(
+                    self.policy.brain.camera_resolutions[i],
                    name="curiosity_next_visual_observation_" + str(i),
                )
                self.next_visual_in.append(next_visual_input)
-                encoded_visual = self.policy_model.create_visual_observation_encoder(
-                    self.policy_model.visual_in[i],
+                encoded_visual = ModelUtils.create_visual_observation_encoder(
+                    self.policy.visual_in[i],
-                    LearningModel.swish,
+                    ModelUtils.swish,
-                encoded_next_visual = self.policy_model.create_visual_observation_encoder(
+                encoded_next_visual = ModelUtils.create_visual_observation_encoder(
-                    LearningModel.swish,
+                    ModelUtils.swish,
                    1,
                    "curiosity_stream_{}_visual_obs_encoder".format(i),
                    True,
            encoded_state_list.append(hidden_visual)
            encoded_next_state_list.append(hidden_next_visual)

-        if self.policy_model.vec_obs_size > 0:
+        if self.policy.vec_obs_size > 0:
-                shape=[None, self.policy_model.vec_obs_size],
+                shape=[None, self.policy.vec_obs_size],
-            encoded_vector_obs = self.policy_model.create_vector_observation_encoder(
-                self.policy_model.vector_in,
+            encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
+                self.policy.vector_in,
-                LearningModel.swish,
+                ModelUtils.swish,
-            encoded_next_vector_obs = self.policy_model.create_vector_observation_encoder(
+            encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
-                LearningModel.swish,
+                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                True,
        :param encoded_next_state: Tensor corresponding to encoded next state.
        """
        combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
-        hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish)
-        if self.policy_model.brain.vector_action_space_type == "continuous":
+        hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
+        if self.policy.brain.vector_action_space_type == "continuous":
-                hidden, self.policy_model.act_size[0], activation=None
+                hidden, self.policy.act_size[0], activation=None
-                tf.squared_difference(pred_action, self.policy_model.selected_actions),
-                axis=1,
+                tf.squared_difference(pred_action, self.policy.selected_actions), axis=1
-                tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
+                tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
-                        hidden, self.policy_model.act_size[i], activation=tf.nn.softmax
+                        hidden, self.policy.act_size[i], activation=tf.nn.softmax
-                    for i in range(len(self.policy_model.act_size))
+                    for i in range(len(self.policy.act_size))
-                -tf.log(pred_action + 1e-10) * self.policy_model.selected_actions,
-                axis=1,
+                -tf.log(pred_action + 1e-10) * self.policy.selected_actions, axis=1
-                tf.dynamic_partition(cross_entropy, self.policy_model.mask, 2)[1]
+                tf.dynamic_partition(cross_entropy, self.policy.mask, 2)[1]
            )

    def create_forward_model(
        :param encoded_next_state: Tensor corresponding to encoded next state.
        """
        combined_input = tf.concat(
-            [encoded_state, self.policy_model.selected_actions], axis=1
+            [encoded_state, self.policy.selected_actions], axis=1
-        hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish)
+        hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
-            * (
-                self.policy_model.vis_obs_size + int(self.policy_model.vec_obs_size > 0)
-            ),
+            * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
            activation=None,
        )
        squared_difference = 0.5 * tf.reduce_sum(
        self.forward_loss = tf.reduce_mean(
-            tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
+            tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
        )

    def create_loss(self, learning_rate: float) -> None:
--- a/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.models import LearningModel


 class CuriosityRewardSignal(RewardSignal):
-        policy_model: LearningModel,
        strength: float,
        gamma: float,
        encoding_size: int = 128,
        :param encoding_size: The size of the hidden encoding layer for the ICM
        :param learning_rate: The learning rate for the ICM.
        """
-        super().__init__(policy, policy_model, strength, gamma)
+        super().__init__(policy, strength, gamma)
-            policy_model, encoding_size=encoding_size, learning_rate=learning_rate
+            policy, encoding_size=encoding_size, learning_rate=learning_rate
        )
        self.use_terminal_states = False
        self.update_dict = {

    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
-            self.policy.model.batch_size: len(mini_batch["actions"]),
-            self.policy.model.sequence_length: self.policy.sequence_length,
+            self.policy.batch_size_ph: len(mini_batch["actions"]),
+            self.policy.sequence_length_ph: self.policy.sequence_length,
-            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
-        if self.policy.model.vis_obs_size > 0:
-            for i in range(len(self.policy.model.visual_in)):
+        if self.policy.vis_obs_size > 0:
+            for i in range(len(self.policy.visual_in)):
-                feed_dict[self.policy.model.visual_in[i]] = _obs
+                feed_dict[self.policy.visual_in[i]] = _obs
-            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
+            feed_dict[self.policy.selected_actions] = mini_batch["actions"]
-            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+            feed_dict[self.policy.action_holder] = mini_batch["actions"]
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        super().check_config(config_dict, param_keys)

    def prepare_update(
-        self,
-        policy_model: LearningModel,
-        mini_batch: Dict[str, np.ndarray],
-        num_sequences: int,
+        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        Prepare for update and get feed_dict.
        """
        feed_dict = {
-            policy_model.batch_size: num_sequences,
-            policy_model.sequence_length: self.policy.sequence_length,
-            policy_model.mask_input: mini_batch["masks"],
+            policy.batch_size_ph: num_sequences,
+            policy.sequence_length_ph: self.policy.sequence_length,
+            policy.mask_input: mini_batch["masks"],
-            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
+            feed_dict[policy.selected_actions] = mini_batch["actions"]
-            feed_dict[policy_model.action_holder] = mini_batch["actions"]
+            feed_dict[policy.action_holder] = mini_batch["actions"]
-            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy.vector_in] = mini_batch["vector_obs"]
-        if policy_model.vis_obs_size > 0:
-            for i, vis_in in enumerate(policy_model.visual_in):
+        if policy.vis_obs_size > 0:
+            for i, vis_in in enumerate(policy.visual_in):
                feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
            for i, next_vis_in in enumerate(self.model.next_visual_in):
                feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py

 from mlagents.tf_utils import tf

-from mlagents.trainers.models import LearningModel
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.models import ModelUtils

 EPSILON = 1e-7

        self,
-        policy_model: LearningModel,
+        policy: TFPolicy,
        h_size: int = 128,
        learning_rate: float = 3e-4,
        encoding_size: int = 64,
        self.z_size = 128
        self.alpha = 0.0005
        self.mutual_information = 0.5
-        self.policy_model = policy_model
+        self.policy = policy
        self.encoding_size = encoding_size
        self.gradient_penalty_weight = gradient_penalty_weight
        self.use_vail = use_vail
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

-        if self.policy_model.brain.vector_action_space_type == "continuous":
-            action_length = self.policy_model.act_size[0]
+        if self.policy.brain.vector_action_space_type == "continuous":
+            action_length = self.policy.act_size[0]
-            action_length = len(self.policy_model.act_size)
+            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
-                    for i, act_size in enumerate(self.policy_model.act_size)
+                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

-        if self.policy_model.vec_obs_size > 0:
+        if self.policy.vec_obs_size > 0:
-                shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32
+                shape=[None, self.policy.vec_obs_size], dtype=tf.float32
-            if self.policy_model.normalize:
+            if self.policy.normalize:
-                    self.policy_model.normalize_vector_obs(self.obs_in_expert)
-                )
-                encoded_policy_list.append(
-                    self.policy_model.normalize_vector_obs(self.policy_model.vector_in)
+                    ModelUtils.normalize_vector_obs(
+                        self.obs_in_expert,
+                        self.policy.running_mean,
+                        self.policy.running_variance,
+                        self.policy.normalization_steps,
+                    )
+                encoded_policy_list.append(self.policy.processed_vector_in)
-                encoded_policy_list.append(self.policy_model.vector_in)
+                encoded_policy_list.append(self.policy.vector_in)
-        if self.policy_model.vis_obs_size > 0:
+        if self.policy.vis_obs_size > 0:
-            for i in range(self.policy_model.vis_obs_size):
+            for i in range(self.policy.vis_obs_size):
-                visual_input = self.policy_model.create_visual_input(
-                    self.policy_model.brain.camera_resolutions[i],
+                visual_input = ModelUtils.create_visual_input(
+                    self.policy.brain.camera_resolutions[i],
-                encoded_policy_visual = self.policy_model.create_visual_observation_encoder(
-                    self.policy_model.visual_in[i],
+                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
+                    self.policy.visual_in[i],
-                    LearningModel.swish,
+                    ModelUtils.swish,
-                encoded_expert_visual = self.policy_model.create_visual_observation_encoder(
+                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
-                    LearningModel.swish,
+                    ModelUtils.swish,
                    1,
                    "gail_stream_{}_visual_obs_encoder".format(i),
                    True,
            hidden_1 = tf.layers.dense(
                concat_input,
                self.h_size,
-                activation=LearningModel.swish,
+                activation=ModelUtils.swish,
                name="gail_d_hidden_1",
                reuse=reuse,
            )
                self.h_size,
-                activation=LearningModel.swish,
+                activation=ModelUtils.swish,
                name="gail_d_hidden_2",
                reuse=reuse,
            )
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
-                    kernel_initializer=LearningModel.scaled_init(0.01),
+                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )

                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)
        )
        self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
            self.encoded_policy,
-            self.policy_model.selected_actions,
+            self.policy.selected_actions,
            self.done_policy,
            reuse=True,
        )
        for off-policy. Compute gradients w.r.t randomly interpolated input.
        """
        expert = [self.encoded_expert, self.expert_action, self.done_expert]
-        policy = [
-            self.encoded_policy,
-            self.policy_model.selected_actions,
-            self.done_policy,
-        ]
+        policy = [self.encoded_policy, self.policy.selected_actions, self.done_policy]
        interp = []
        for _expert_in, _policy_in in zip(expert, policy):
            alpha = tf.random_uniform(tf.shape(_expert_in))
--- a/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py

 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.models import LearningModel
 from .model import GAILModel
 from mlagents.trainers.demo_loader import demo_to_buffer

    def __init__(
        self,
        policy: TFPolicy,
-        policy_model: LearningModel,
        strength: float,
        gamma: float,
        demo_path: str,
        :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
        See https://arxiv.org/abs/1810.00821.
        """
-        super().__init__(policy, policy_model, strength, gamma)
+        super().__init__(policy, strength, gamma)
-            policy.model, 128, learning_rate, encoding_size, use_actions, use_vail
+            policy, 128, learning_rate, encoding_size, use_actions, use_vail
        )
        _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)
        self.has_updated = False

    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
-            self.policy.model.batch_size: len(mini_batch["actions"]),
-            self.policy.model.sequence_length: self.policy.sequence_length,
+            self.policy.batch_size_ph: len(mini_batch["actions"]),
+            self.policy.sequence_length_ph: self.policy.sequence_length,
-            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
-        if self.policy.model.vis_obs_size > 0:
-            for i in range(len(self.policy.model.visual_in)):
+            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
+        if self.policy.vis_obs_size > 0:
+            for i in range(len(self.policy.visual_in)):
-                feed_dict[self.policy.model.visual_in[i]] = _obs
+                feed_dict[self.policy.visual_in[i]] = _obs
-            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
+            feed_dict[self.policy.selected_actions] = mini_batch["actions"]
-            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+            feed_dict[self.policy.action_holder] = mini_batch["actions"]
        feed_dict[self.model.done_policy_holder] = np.array(
            mini_batch["done"]
        ).flatten()
        super().check_config(config_dict, param_keys)

    def prepare_update(
-        self,
-        policy_model: LearningModel,
-        mini_batch: Dict[str, np.ndarray],
-        num_sequences: int,
+        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        """
        Prepare inputs for update. .

        feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
        if self.policy.use_continuous_act:
-            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
+            feed_dict[policy.selected_actions] = mini_batch["actions"]
-            feed_dict[policy_model.action_holder] = mini_batch["actions"]
+            feed_dict[policy.action_holder] = mini_batch["actions"]
-            for i in range(len(policy_model.visual_in)):
-                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
+            for i in range(len(policy.visual_in)):
+                feed_dict[policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
-            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy.vector_in] = mini_batch["vector_obs"]
            feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
        self.has_updated = True
        return feed_dict
--- a/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py
+++ b/ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py
 import logging
 from typing import Any, Dict, Type

-from mlagents.trainers.trainer import UnityTrainerException
+from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.components.reward_signals import RewardSignal
 from mlagents.trainers.components.reward_signals.extrinsic.signal import (
    ExtrinsicRewardSignal,
    CuriosityRewardSignal,
 )
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.models import LearningModel

 logger = logging.getLogger("mlagents.trainers")



 def create_reward_signal(
-    policy: TFPolicy,
-    policy_model: LearningModel,
-    name: str,
-    config_entry: Dict[str, Any],
+    policy: TFPolicy, name: str, config_entry: Dict[str, Any]
 ) -> RewardSignal:
    """
    Creates a reward signal class based on the name and config entry provided as a dict.
        raise UnityTrainerException("Unknown reward signal type {0}".format(name))
    rcls.check_config(config_entry)
    try:
-        class_inst = rcls(policy, policy_model, **config_entry)
+        class_inst = rcls(policy, **config_entry)
    except TypeError:
        raise UnityTrainerException(
            "Unknown parameters given for reward signal {0}".format(name)
--- a/ml-agents/mlagents/trainers/exception.py
+++ b/ml-agents/mlagents/trainers/exception.py
    """

    pass
+
+
+class UnityTrainerException(TrainerError):
+    """
+    Related to errors with the Trainer.
+    """
+
+    pass
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
        return self.trainer.create_policy(brain_parameters)

    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
-        # for saving/swapping snapshots
-        policy.init_load_weights()
+        """
+        Adds policy to trainer. For the first policy added, add a trainer
+        to the policy and set the learning behavior name to name_behavior_id.
+        :param name_behavior_id: Behavior ID that the policy should belong to.
+        :param policy: Policy to associate with name_behavior_id.
+        """
+        policy.create_tf_graph()
-            self._save_snapshot(policy)
+            self._save_snapshot(policy)  # Need to save after trainer initializes policy
+        else:
+            # for saving/swapping snapshots
+            policy.init_load_weights()

    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        return self.policies[name_behavior_id]
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
 from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
 from mlagents_envs.side_channel.side_channel import SideChannel
 from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig
+from mlagents_envs.exception import UnityEnvironmentException


 def _create_parser():
        default=False,
        action="store_true",
        help="Whether to run ML-Agents in debug mode with detailed logging",
-    )
-    argparser.add_argument(
-        "--multi-gpu",
-        default=False,
-        action="store_true",
-        help="Setting this flag enables the use of multiple GPU's (if available) during training",
    )
    argparser.add_argument(
        "--env-args",
    env_args: Optional[List[str]],
 ) -> Callable[[int, List[SideChannel]], BaseEnv]:
    if env_path is not None:
-        # Strip out executable extensions if passed
-        env_path = (
-            env_path.strip()
-            .replace(".app", "")
-            .replace(".exe", "")
-            .replace(".x86_64", "")
-            .replace(".x86", "")
-        )
+        launch_string = UnityEnvironment.validate_environment_path(env_path)
+        if launch_string is None:
+            raise UnityEnvironmentException(
+                f"Couldn't launch the {env_path} environment. Provided filename does not match any environments."
+            )
    docker_training = docker_target_name is not None
    if docker_training and env_path is not None:
        #     Comments for future maintenance:
--- a/ml-agents/mlagents/trainers/models.py
+++ b/ml-agents/mlagents/trainers/models.py
 import logging
 from enum import Enum
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Tuple, NamedTuple
-from mlagents.trainers.trainer import UnityTrainerException
+from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.brain import CameraResolution

 logger = logging.getLogger("mlagents.trainers")
    LINEAR = "linear"


-class LearningModel:
-    _version_number_ = 2
+class NormalizerTensors(NamedTuple):
+    update_op: tf.Operation
+    steps: tf.Tensor
+    running_mean: tf.Tensor
+    running_variance: tf.Tensor
+
+class ModelUtils:
    # Minimum supported side for each encoder type. If refactoring an encoder, please
    # adjust these also.
    MIN_RESOLUTION_FOR_ENCODER = {
    }

-    def __init__(
-        self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
-    ):
-        tf.set_random_seed(seed)
-        self.brain = brain
-        self.vector_in = None
-        self.global_step, self.increment_step, self.steps_to_increment = (
-            self.create_global_steps()
-        )
-        self.visual_in = []
-        self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size")
-        self.sequence_length = tf.placeholder(
-            shape=None, dtype=tf.int32, name="sequence_length"
-        )
-        self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks")
-        self.mask = tf.cast(self.mask_input, tf.int32)
-        self.stream_names = stream_names or []
-        self.use_recurrent = use_recurrent
-        if self.use_recurrent:
-            self.m_size = m_size
-        else:
-            self.m_size = 0
-        self.normalize = normalize
-        self.act_size = brain.vector_action_space_size
-        self.vec_obs_size = brain.vector_observation_space_size
-        self.vis_obs_size = brain.number_visual_observations
-        tf.Variable(
-            int(brain.vector_action_space_type == "continuous"),
-            name="is_continuous_control",
-            trainable=False,
-            dtype=tf.int32,
-        )
-        tf.Variable(
-            self._version_number_,
-            name="version_number",
-            trainable=False,
-            dtype=tf.int32,
-        )
-        tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
-        if brain.vector_action_space_type == "continuous":
-            tf.Variable(
-                self.act_size[0],
-                name="action_output_shape",
-                trainable=False,
-                dtype=tf.int32,
-            )
-        else:
-            tf.Variable(
-                sum(self.act_size),
-                name="action_output_shape",
-                trainable=False,
-                dtype=tf.int32,
-            )
-        self.value_heads: Dict[str, tf.Tensor] = {}
-        self.normalization_steps: Optional[tf.Variable] = None
-        self.running_mean: Optional[tf.Variable] = None
-        self.running_variance: Optional[tf.Variable] = None
-        self.update_normalization: Optional[tf.Operation] = None
-        self.value: Optional[tf.Tensor] = None
-        self.all_log_probs: Optional[tf.Tensor] = None
-        self.output: Optional[tf.Tensor] = None
-        self.selected_actions: Optional[tf.Tensor] = None
-        self.action_holder: Optional[tf.Tensor] = None
-
    @staticmethod
    def create_global_steps():
        """Creates TF ops to track and increment global training step."""
        global_step: tf.Tensor,
        max_step: int,
    ) -> tf.Tensor:
+        """
+        Create a learning rate tensor.
+        :param lr_schedule: Type of learning rate schedule.
+        :param lr: Base learning rate.
+        :param global_step: A TF Tensor representing the total global step.
+        :param max_step: The maximum number of steps in the training run.
+        :return: A Tensor containing the learning rate.
+        """
        if lr_schedule == LearningRateSchedule.CONSTANT:
            learning_rate = tf.Variable(lr)
        elif lr_schedule == LearningRateSchedule.LINEAR:
        )
        return visual_in

-    def create_vector_input(self, name="vector_observation"):
+    @staticmethod
+    def create_visual_input_placeholders(
+        camera_resolutions: List[CameraResolution]
+    ) -> List[tf.Tensor]:
+        """
+        Creates input placeholders for visual inputs.
+        :param camera_resolutions: A List of CameraResolutions that specify the resolutions
+        of the input visual observations.
+        :returns: A List of Tensorflow placeholders where the input iamges should be fed.
+        """
+        visual_in: List[tf.Tensor] = []
+        for i, camera_resolution in enumerate(camera_resolutions):
+            visual_input = ModelUtils.create_visual_input(
+                camera_resolution, name="visual_observation_" + str(i)
+            )
+            visual_in.append(visual_input)
+        return visual_in
+
+    @staticmethod
+    def create_vector_input(
+        vec_obs_size: int, name: str = "vector_observation"
+    ) -> tf.Tensor:
-        :param name: Name of the placeholder op.
-        :return:
+        :param name: Name of the placeholder op.
+        :return: Placeholder for vector observations.
-        self.vector_in = tf.placeholder(
-            shape=[None, self.vec_obs_size], dtype=tf.float32, name=name
+        vector_in = tf.placeholder(
+            shape=[None, vec_obs_size], dtype=tf.float32, name=name
-        if self.normalize:
-            self.create_normalizer(self.vector_in)
-            return self.normalize_vector_obs(self.vector_in)
-        else:
-            return self.vector_in
+        return vector_in
-    def normalize_vector_obs(self, vector_obs):
+    @staticmethod
+    def normalize_vector_obs(
+        vector_obs: tf.Tensor,
+        running_mean: tf.Tensor,
+        running_variance: tf.Tensor,
+        normalization_steps: tf.Tensor,
+    ) -> tf.Tensor:
+        """
+        Create a normalized version of an input tensor.
+        :param vector_obs: Input vector observation tensor.
+        :param running_mean: Tensorflow tensor representing the current running mean.
+        :param running_variance: Tensorflow tensor representing the current running variance.
+        :param normalization_steps: Tensorflow tensor representing the current number of normalization_steps.
+        :return: A normalized version of vector_obs.
+        """
-            (vector_obs - self.running_mean)
+            (vector_obs - running_mean)
-                self.running_variance
-                / (tf.cast(self.normalization_steps, tf.float32) + 1)
+                running_variance / (tf.cast(normalization_steps, tf.float32) + 1)
            ),
            -5,
            5,

-    def create_normalizer(self, vector_obs):
-        self.normalization_steps = tf.get_variable(
+    @staticmethod
+    def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
+        """
+        Creates the normalizer and the variables required to store its state.
+        :param vector_obs: A Tensor representing the next value to normalize. When the
+            update operation is called, it will use vector_obs to update the running mean
+            and variance.
+        :return: A NormalizerTensors tuple that holds running mean, running variance, number of steps,
+            and the update operation.
+        """
+
+        vec_obs_size = vector_obs.shape[1]
+        steps = tf.get_variable(
            "normalization_steps",
            [],
            trainable=False,
-        self.running_mean = tf.get_variable(
+        running_mean = tf.get_variable(
-            [self.vec_obs_size],
+            [vec_obs_size],
-        self.running_variance = tf.get_variable(
+        running_variance = tf.get_variable(
-            [self.vec_obs_size],
+            [vec_obs_size],
-        self.update_normalization = self.create_normalizer_update(vector_obs)
+        update_normalization = ModelUtils.create_normalizer_update(
+            vector_obs, steps, running_mean, running_variance
+        )
+        return NormalizerTensors(
+            update_normalization, steps, running_mean, running_variance
+        )
-    def create_normalizer_update(self, vector_input):
+    @staticmethod
+    def create_normalizer_update(
+        vector_input: tf.Tensor,
+        steps: tf.Tensor,
+        running_mean: tf.Tensor,
+        running_variance: tf.Tensor,
+    ) -> tf.Operation:
+        """
+        Creates the update operation for the normalizer.
+        :param vector_input: Vector observation to use for updating the running mean and variance.
+        :param running_mean: Tensorflow tensor representing the current running mean.
+        :param running_variance: Tensorflow tensor representing the current running variance.
+        :param steps: Tensorflow tensor representing the current number of steps that have been normalized.
+        :return: A TF operation that updates the normalization based on vector_input.
+        """
-        total_new_steps = tf.add(self.normalization_steps, steps_increment)
+        total_new_steps = tf.add(steps, steps_increment)
-        input_to_old_mean = tf.subtract(vector_input, self.running_mean)
-        new_mean = self.running_mean + tf.reduce_sum(
+        input_to_old_mean = tf.subtract(vector_input, running_mean)
+        new_mean = running_mean + tf.reduce_sum(
-        new_variance = self.running_variance + tf.reduce_sum(
+        new_variance = running_variance + tf.reduce_sum(
-        update_mean = tf.assign(self.running_mean, new_mean)
-        update_variance = tf.assign(self.running_variance, new_variance)
-        update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
+        update_mean = tf.assign(running_mean, new_mean)
+        update_variance = tf.assign(running_variance, new_variance)
+        update_norm_step = tf.assign(steps, total_new_steps)
        return tf.group([update_mean, update_variance, update_norm_step])

    @staticmethod
            hidden = tf.layers.flatten(conv2)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
-            hidden_flat = LearningModel.create_vector_observation_encoder(
+            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse
            )
        return hidden_flat
            hidden = tf.layers.flatten(conv3)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
-            hidden_flat = LearningModel.create_vector_observation_encoder(
+            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse
            )
        return hidden_flat
            hidden = tf.layers.flatten(hidden)

        with tf.variable_scope(scope + "/" + "flat_encoding"):
-            hidden_flat = LearningModel.create_vector_observation_encoder(
+            hidden_flat = ModelUtils.create_vector_observation_encoder(
                hidden, h_size, activation, num_layers, scope, reuse
            )
        return hidden_flat
        ENCODER_FUNCTION_BY_TYPE = {
-            EncoderType.SIMPLE: LearningModel.create_visual_observation_encoder,
-            EncoderType.NATURE_CNN: LearningModel.create_nature_cnn_visual_observation_encoder,
-            EncoderType.RESNET: LearningModel.create_resnet_visual_observation_encoder,
+            EncoderType.SIMPLE: ModelUtils.create_visual_observation_encoder,
+            EncoderType.NATURE_CNN: ModelUtils.create_nature_cnn_visual_observation_encoder,
+            EncoderType.RESNET: ModelUtils.create_resnet_visual_observation_encoder,
-            encoder_type, LearningModel.create_visual_observation_encoder
+            encoder_type, ModelUtils.create_visual_observation_encoder
        )

    @staticmethod

    @staticmethod
    def _check_resolution_for_encoder(
-        camera_res: CameraResolution, vis_encoder_type: EncoderType
+        vis_in: tf.Tensor, vis_encoder_type: EncoderType
-        min_res = LearningModel.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
-        if camera_res.height < min_res or camera_res.width < min_res:
+        min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
+        height = vis_in.shape[1]
+        width = vis_in.shape[2]
+        if height < min_res or width < min_res:
-                f"Visual observation resolution ({camera_res.width}x{camera_res.height}) is too small for"
+                f"Visual observation resolution ({width}x{height}) is too small for"
+    @staticmethod
-        self,
+        visual_in: List[tf.Tensor],
+        vector_in: tf.Tensor,
        num_streams: int,
        h_size: int,
        num_layers: int,
            the scopes for each of the streams. None if all under the same TF scope.
        :return: List of encoded streams.
        """
-        brain = self.brain
-        activation_fn = self.swish
-
-        self.visual_in = []
-        for i in range(brain.number_visual_observations):
-            LearningModel._check_resolution_for_encoder(
-                brain.camera_resolutions[i], vis_encode_type
-            )
-            visual_input = self.create_visual_input(
-                brain.camera_resolutions[i], name="visual_observation_" + str(i)
-            )
-            self.visual_in.append(visual_input)
-        vector_observation_input = self.create_vector_input()
+        activation_fn = ModelUtils.swish
+        vector_observation_input = vector_in
-            create_encoder_func = LearningModel.get_encoder_for_type(vis_encode_type)
+            create_encoder_func = ModelUtils.get_encoder_for_type(vis_encode_type)
-            if self.vis_obs_size > 0:
-                for j in range(brain.number_visual_observations):
+            if len(visual_in) > 0:
+                for j, vis_in in enumerate(visual_in):
+                    ModelUtils._check_resolution_for_encoder(vis_in, vis_encode_type)
-                        self.visual_in[j],
+                        vis_in,
                        h_size,
                        activation_fn,
                        num_layers,
                    visual_encoders.append(encoded_visual)
                hidden_visual = tf.concat(visual_encoders, axis=1)
-            if brain.vector_observation_space_size > 0:
-                hidden_state = self.create_vector_observation_encoder(
+            if vector_in.get_shape()[-1] > 0:  # Don't encode 0-shape inputs
+                hidden_state = ModelUtils.create_vector_observation_encoder(
                    vector_observation_input,
                    h_size,
                    activation_fn,
        recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
        return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1)

-    def create_value_heads(self, stream_names, hidden_input):
+    @staticmethod
+    def create_value_heads(
+        stream_names: List[str], hidden_input: tf.Tensor
+    ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
        """
        Creates one value estimator head for each reward signal in stream_names.
        Also creates the node corresponding to the mean of all the value heads in self.value.
        of the hidden input.
        """
+        value_heads = {}
-            self.value_heads[name] = value
-        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+            value_heads[name] = value
+        value = tf.reduce_mean(list(value_heads.values()), 0)
+        return value_heads, value
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py

 import numpy as np

-from mlagents.trainers.ppo.policy import PPOPolicy
-from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.ppo.optimizer import PPOOptimizer
 from mlagents.trainers.trajectory import Trajectory

 logger = logging.getLogger("mlagents.trainers")
        load: bool,
        seed: int,
        run_id: str,
-        multi_gpu: bool,
    ):
        """
        Responsible for collecting experiences and training PPO model.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
-        :param multi_gpu: Boolean for multi-gpu policy model
        """
        super(PPOTrainer, self).__init__(
            brain_name, trainer_parameters, training, run_id, reward_buff_cap
        ]
        self._check_param_keys()
        self.load = load
-        self.multi_gpu = multi_gpu
-        self.policy: PPOPolicy = None  # type: ignore
+        self.policy: NNPolicy = None  # type: ignore

    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """
            self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

        # Get all value estimates
-        value_estimates = self.policy.get_batched_value_estimates(
-            agent_buffer_trajectory
+        value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
+            agent_buffer_trajectory,
+            trajectory.next_obs,
+            trajectory.done_reached and not trajectory.max_step_reached,
-                self.policy.reward_signals[name].value_name, np.mean(v)
+                self.optimizer.reward_signals[name].value_name, np.mean(v)
-        value_next = self.policy.get_value_estimates(
-            trajectory.next_obs,
-            agent_id,
-            trajectory.done_reached and not trajectory.max_step_reached,
-        )
-
-        for name, reward_signal in self.policy.reward_signals.items():
+        for name, reward_signal in self.optimizer.reward_signals.items():
            evaluate_result = reward_signal.evaluate_batch(
                agent_buffer_trajectory
            ).scaled_reward
        # Compute GAE and returns
        tmp_advantages = []
        tmp_returns = []
-        for name in self.policy.reward_signals:
+        for name in self.optimizer.reward_signals:
            bootstrap_value = value_next[name]

            local_rewards = agent_buffer_trajectory[
                rewards=local_rewards,
                value_estimates=local_value_estimates,
                value_next=bootstrap_value,
-                gamma=self.policy.reward_signals[name].gamma,
+                gamma=self.optimizer.reward_signals[name].gamma,
                lambd=self.trainer_parameters["lambd"],
            )
            local_return = local_advantage + local_value_estimates

        # If this was a terminal trajectory, append stats and reset reward collection
        if trajectory.done_reached:
-            self._update_end_episode_stats(
-                agent_id, self.get_policy(trajectory.behavior_id)
-            )
+            self._update_end_episode_stats(agent_id, self.optimizer)

    def _is_ready_update(self):
        """
            buffer = self.update_buffer
            max_num_batch = buffer_length // batch_size
            for l in range(0, max_num_batch * batch_size, batch_size):
-                update_stats = self.policy.update(
+                update_stats = self.optimizer.update(
                    buffer.make_mini_batch(l, l + batch_size), n_sequences
                )
                for stat_name, value in update_stats.items():
            self.stats_reporter.add_stat(stat, np.mean(stat_list))

-        if self.policy.bc_module:
-            update_stats = self.policy.bc_module.update()
+        if self.optimizer.bc_module:
+            update_stats = self.optimizer.bc_module.update()
            for stat, val in update_stats.items():
                self.stats_reporter.add_stat(stat, val)
        self.clear_update_buffer()
        :param brain_parameters: specifications for policy construction
        :return policy
        """
-
-        if self.multi_gpu and len(get_devices()) > 1:
-            policy: PPOPolicy = MultiGpuPPOPolicy(
-                self.seed,
-                brain_parameters,
-                self.trainer_parameters,
-                self.is_training,
-                self.load,
-            )
-        else:
-            policy = PPOPolicy(
-                self.seed,
-                brain_parameters,
-                self.trainer_parameters,
-                self.is_training,
-                self.load,
-            )
-
-        for _reward_signal in policy.reward_signals.keys():
-            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+        policy = NNPolicy(
+            self.seed,
+            brain_parameters,
+            self.trainer_parameters,
+            self.is_training,
+            self.load,
+            condition_sigma_on_obs=False,  # Faster training for PPO
+            create_tf_graph=False,  # We will create the TF graph in the Optimizer
+        )

        return policy

-        :param brain_parameters: specifications for policy construction
+        :param name_behavior_id: Behavior ID that the policy should belong to.
+        :param policy: Policy to associate with name_behavior_id.
        """
        if self.policy:
            logger.warning(
            )
-        if not isinstance(policy, PPOPolicy):
-            raise RuntimeError("Non-PPOPolicy passed to PPOTrainer.add_policy()")
+        if not isinstance(policy, NNPolicy):
+            raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
+        self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
+        for _reward_signal in self.optimizer.reward_signals.keys():
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
        # Needed to resume loads properly
        self.step = policy.get_current_step()
        self.next_summary_step = self._get_next_summary_step()
--- a/ml-agents/mlagents/trainers/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/rl_trainer.py
 from typing import Dict
 from collections import defaultdict

-from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.common.tf_optimizer import TFOptimizer
-from mlagents.trainers.trainer import Trainer, UnityTrainerException
+from mlagents.trainers.trainer import Trainer
+from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.components.reward_signals import RewardSignalResult

 LOGGER = logging.getLogger("mlagents.trainers")
            for agent_id in rewards:
                rewards[agent_id] = 0

-    def _update_end_episode_stats(self, agent_id: str, policy: TFPolicy) -> None:
+    def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
        self.episode_steps[agent_id] = 0
        for name, rewards in self.collected_rewards.items():
            if name == "environment":
                rewards[agent_id] = 0
            else:
                self.stats_reporter.add_stat(
-                    policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
+                    optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
                )
                rewards[agent_id] = 0

--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py

 from mlagents_envs.timers import timed
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.sac.optimizer import SACOptimizer
 from mlagents.trainers.rl_trainer import RLTrainer
 from mlagents.trainers.trajectory import Trajectory, SplitObservations
 from mlagents.trainers.brain import BrainParameters
        self._check_param_keys()
        self.load = load
        self.seed = seed
-        self.policy: SACPolicy = None  # type: ignore
+        self.policy: NNPolicy = None  # type: ignore
+        self.optimizer: SACOptimizer = None  # type: ignore

        self.step = 0
        self.train_interval = (
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Get all value estimates for reporting purposes
-        value_estimates = self.policy.get_batched_value_estimates(
-            agent_buffer_trajectory
+        value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
+            agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
-                self.policy.reward_signals[name].value_name, np.mean(v)
+                self.optimizer.reward_signals[name].value_name, np.mean(v)
            )

        # Bootstrap using the last step rather than the bootstrap step if max step is reached.
        )

        if trajectory.done_reached:
-            self._update_end_episode_stats(
-                agent_id, self.get_policy(trajectory.behavior_id)
-            )
+            self._update_end_episode_stats(agent_id, self.optimizer)

    def _is_ready_update(self) -> bool:
        """
            self.update_reward_signals()

    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
-        policy = SACPolicy(
+        policy = NNPolicy(
+            tanh_squash=True,
+            reparameterize=True,
+            create_tf_graph=False,
        )
        for _reward_signal in policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
                    sequence_length=self.policy.sequence_length,
                )
                # Get rewards for each reward
-                for name, signal in self.policy.reward_signals.items():
+                for name, signal in self.optimizer.reward_signals.items():
-                update_stats = self.policy.update(sampled_minibatch, n_sequences)
+                update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
                for stat_name, value in update_stats.items():
                    batch_update_stats[stat_name].append(value)

        for stat, stat_list in batch_update_stats.items():
            self.stats_reporter.add_stat(stat, np.mean(stat_list))

-        bc_module = self.policy.bc_module
-        if bc_module:
-            update_stats = bc_module.update()
+        if self.optimizer.bc_module:
+            update_stats = self.optimizer.bc_module.update()
            for stat, val in update_stats.items():
                self.stats_reporter.add_stat(stat, val)

        for _ in range(num_updates):
            # Get minibatches for reward signal update if needed
            reward_signal_minibatches = {}
-            for name, signal in self.policy.reward_signals.items():
+            for name, signal in self.optimizer.reward_signals.items():
                logger.debug("Updating {} at step {}".format(name, self.step))
                # Some signals don't need a minibatch to be sampled - so we don't!
                if signal.update_dict:
                    )
-            update_stats = self.policy.update_reward_signals(
+            update_stats = self.optimizer.update_reward_signals(
                reward_signal_minibatches, n_sequences
            )
            for stat_name, value in update_stats.items():
                    self.__class__.__name__
                )
            )
-        if not isinstance(policy, SACPolicy):
+        if not isinstance(policy, NNPolicy):
+        self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
+        for _reward_signal in self.optimizer.reward_signals.keys():
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
        # Needed to resume loads properly
        self.step = policy.get_current_step()
        self.next_summary_step = self._get_next_summary_step()
--- a/ml-agents/mlagents/trainers/tests/mock_brain.py
+++ b/ml-agents/mlagents/trainers/tests/mock_brain.py
        done = False
        if is_discrete:
            action_size = len(action_space)
+            action_probs = np.ones(np.sum(action_space), dtype=np.float32)
+            action_probs = np.ones((1), dtype=np.float32)
-        action_probs = np.ones(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = (
            [[False for _ in range(branch)] for branch in action_space]
--- a/ml-agents/mlagents/trainers/tests/test_bcmodule.py
+++ b/ml-agents/mlagents/trainers/tests/test_bcmodule.py
 import yaml
 import os

-from mlagents.trainers.ppo.policy import PPOPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.components.bc.module import BCModule


 def ppo_dummy_config():
    )


-def sac_dummy_config():
-    return yaml.safe_load(
-        """
-        trainer: sac
-        batch_size: 128
-        buffer_size: 50000
-        buffer_init_steps: 0
-        hidden_units: 128
-        init_entcoef: 1.0
-        learning_rate: 3.0e-4
-        max_steps: 5.0e4
-        memory_size: 256
-        normalize: false
-        num_update: 1
-        train_interval: 1
-        num_layers: 2
-        time_horizon: 64
-        sequence_length: 64
-        summary_freq: 1000
-        tau: 0.005
-        use_recurrent: false
-        vis_encode_type: simple
-        behavioral_cloning:
-            demo_path: ./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
-            strength: 1.0
-            steps: 10000000
-        reward_signals:
-            extrinsic:
-                strength: 1.0
-                gamma: 0.99
-        """
-    )
-
-
-def create_policy_with_bc_mock(mock_brain, trainer_config, use_rnn, demo_file):
+def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config["model_path"] = "testpath"
    trainer_config["keep_checkpoints"] = 3
    )

-    policy = (
-        PPOPolicy(0, mock_brain, trainer_config, False, False)
-        if trainer_config["trainer"] == "ppo"
-        else SACPolicy(0, mock_brain, trainer_config, False, False)
+    policy = NNPolicy(
+        0, mock_brain, trainer_config, False, False, tanhresample, tanhresample
-    return policy
+    with policy.graph.as_default():
+        bc_module = BCModule(
+            policy,
+            policy_learning_rate=trainer_config["learning_rate"],
+            default_batch_size=trainer_config["batch_size"],
+            default_num_epoch=3,
+            **trainer_config["behavioral_cloning"],
+        )
+    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
+    return bc_module


 # Test default values
    trainer_config = ppo_dummy_config()
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    assert policy.bc_module.num_epoch == 3
-    assert policy.bc_module.batch_size == trainer_config["batch_size"]
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
+    assert bc_module.num_epoch == 3
+    assert bc_module.batch_size == trainer_config["batch_size"]
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    assert policy.bc_module.num_epoch == 100
-    assert policy.bc_module.batch_size == 10000
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
+    assert bc_module.num_epoch == 100
+    assert bc_module.batch_size == 10000
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_update(is_sac):
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), False, "test.demo", is_sac
+    )
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_constant_lr_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_constant_lr_update(is_sac):
+    trainer_config = ppo_dummy_config()
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", is_sac)
+    stats = bc_module.update()
-    old_learning_rate = policy.bc_module.current_lr
+    old_learning_rate = bc_module.current_lr
-    stats = policy.bc_module.update()
-    assert old_learning_rate == policy.bc_module.current_lr
+    stats = bc_module.update()
+    assert old_learning_rate == bc_module.current_lr
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_rnn_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_rnn_update(is_sac):
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, True, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), True, "test.demo", is_sac
+    )
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_dc_visual_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_dc_visual_update(is_sac):
-    policy = create_policy_with_bc_mock(
-        mock_brain, trainer_config, False, "testdcvis.demo"
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), False, "testdcvis.demo", is_sac
-    stats = policy.bc_module.update()
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_rnn_dc_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_rnn_dc_update(is_sac):
-    policy = create_policy_with_bc_mock(
-        mock_brain, trainer_config, True, "testdcvis.demo"
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), True, "testdcvis.demo", is_sac
-    stats = policy.bc_module.update()
+    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)

--- a/ml-agents/mlagents/trainers/tests/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/test_ghost.py
    )

    trainer_params = dummy_config
-    trainer = PPOTrainer(
-        mock_brain.brain_name, 0, trainer_params, True, False, 0, "0", False
-    )
+    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
+    policy.create_tf_graph()
+    to_load_policy.create_tf_graph()
    to_load_policy.init_load_weights()

    weights = policy.get_weights()
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0", False)
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0", False)
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # First policy encountered becomes policy trained by wrapped PPO
--- a/ml-agents/mlagents/trainers/tests/test_learn.py
+++ b/ml-agents/mlagents/trainers/tests/test_learn.py
 from mlagents.trainers import learn
 from mlagents.trainers.trainer_controller import TrainerController
 from mlagents.trainers.learn import parse_command_line
+from mlagents_envs.exception import UnityEnvironmentException


 def basic_options(extra_args=None):
            assert mock_init.call_args[0][2] == "/dockertarget/summaries"


+def test_bad_env_path():
+    with pytest.raises(UnityEnvironmentException):
+        learn.create_environment_factory(
+            env_path="/foo/bar",
+            docker_target_name=None,
+            no_graphics=True,
+            seed=None,
+            start_port=8000,
+            env_args=None,
+        )
+
+
@patch("builtins.open", new_callable=mock_open, read_data="{}")
 def test_commandline_args(mock_file):

    assert opt.docker_target_name is None
    assert opt.no_graphics is False
    assert opt.debug is False
-    assert opt.multi_gpu is False
    assert opt.env_args is None

    full_args = [
        "--docker-target-name=mydockertarget",
        "--no-graphics",
        "--debug",
-        "--multi-gpu",
    ]

    opt = parse_command_line(full_args)
    assert opt.docker_target_name == "mydockertarget"
    assert opt.no_graphics is True
    assert opt.debug is True
-    assert opt.multi_gpu is True


@patch("builtins.open", new_callable=mock_open, read_data="{}")
--- a/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py
+++ b/ml-agents/mlagents/trainers/tests/test_meta_curriculum.py
        hidden_units: 128
        lambd: 0.95
        learning_rate: 5.0e-3
-        max_steps: 200
+        max_steps: 300
        memory_size: 256
        normalize: false
        num_epoch: 3
--- a/ml-agents/mlagents/trainers/tests/test_policy.py
+++ b/ml-agents/mlagents/trainers/tests/test_policy.py
 def basic_mock_brain():
    mock_brain = MagicMock()
    mock_brain.vector_action_space_type = "continuous"
+    mock_brain.vector_observation_space_size = 1
+    mock_brain.vector_action_space_size = [1]
    return mock_brain



+class FakePolicy(TFPolicy):
+    def create_tf_graph(self):
+        pass
+
+    def get_trainable_variables(self):
+        return []
+
+
-    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
+    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)

 def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
-    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
+    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = BatchedStepResult(

 def test_take_action_returns_action_info_when_available():
    test_seed = 3
-    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
+    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
--- a/ml-agents/mlagents/trainers/tests/test_ppo.py
+++ b/ml-agents/mlagents/trainers/tests/test_ppo.py

 import yaml

-from mlagents.trainers.ppo.models import PPOModel
-from mlagents.trainers.ppo.policy import PPOPolicy
-from mlagents.trainers.models import EncoderType, LearningModel
-from mlagents.trainers.trainer import UnityTrainerException
-from mlagents.trainers.brain import BrainParameters, CameraResolution
+from mlagents.trainers.ppo.optimizer import PPOOptimizer
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.brain import BrainParameters
-from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.mock_communicator import MockCommunicator
-from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters


@pytest.fixture
        summary_freq: 1000
        use_recurrent: false
        normalize: true
-        memory_size: 8
+        memory_size: 10
        curiosity_strength: 0.0
        curiosity_enc_size: 1
        summary_path: test
 VECTOR_ACTION_SPACE = [2]
 VECTOR_OBS_SPACE = 8
 DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 32
+BUFFER_INIT_SAMPLES = 64
-@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
-@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
-def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
-    tf.reset_default_graph()
-    mock_communicator.return_value = MockCommunicator(
-        discrete_action=False, visual_inputs=0
-    )
-    env = UnityEnvironment(" ")
-    env.reset()
-    brain_name = env.get_agent_groups()[0]
-    batched_step = env.get_step_result(brain_name)
-    brain_params = group_spec_to_brain_parameters(
-        brain_name, env.get_agent_group_spec(brain_name)
+def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
+    mock_brain = mb.setup_mock_brain(
+        use_discrete,
+        use_visual,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+        discrete_action_space=DISCRETE_ACTION_SPACE,
-    model_path = brain_name
+    model_path = "testmodel"
-    policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
-    run_out = policy.evaluate(batched_step, list(batched_step.agent_id))
-    assert run_out["action"].shape == (3, 2)
-    env.close()
+    trainer_parameters["use_recurrent"] = use_rnn
+    policy = NNPolicy(
+        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
+    )
+    optimizer = PPOOptimizer(policy, trainer_parameters)
+    return optimizer
+
+
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
+    # Test evaluate
+    tf.reset_default_graph()
+    optimizer = _create_ppo_optimizer_ops_mock(
+        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
+    )
+    # Test update
+    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
+    # Mock out reward signal eval
+    update_buffer["advantages"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
+    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
+    optimizer.update(
+        update_buffer,
+        num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
+    )


@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-    policy = PPOPolicy(0, brain_params, dummy_config, False, False)
+    policy = NNPolicy(
+        0, brain_params, dummy_config, False, False, create_tf_graph=False
+    )
+    optimizer = PPOOptimizer(policy, dummy_config)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        action_space=[2],
    )
-    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
+    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
+        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
+    )
-        assert type(val) is float
+        assert len(val) == 15
-    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
-    for key, val in run_out.items():
+    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
+        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
+    )
+    for key, val in final_value_out.items():
-    policy.reward_signals["extrinsic"].use_terminal_states = False
-    run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
-    for key, val in run_out.items():
+    optimizer.reward_signals["extrinsic"].use_terminal_states = False
+    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
+        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
+    )
+    for key, val in final_value_out.items():
-    agentbuffer = trajectory.to_agentbuffer()
-    batched_values = policy.get_batched_value_estimates(agentbuffer)
-    for values in batched_values.values():
-        assert len(values) == 15
-
-
-def test_ppo_model_cc_vector():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = PPOModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=0)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-            ]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.epsilon: np.array([[0, 1], [2, 3]]),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_ppo_model_cc_visual():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = PPOModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=2)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-            ]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.epsilon: np.array([[0, 1], [2, 3]], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_ppo_model_dc_visual():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = PPOModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=2)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-            ]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.action_masks: np.ones([2, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_ppo_model_dc_vector():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = PPOModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=0)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-            ]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.action_masks: np.ones([2, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_ppo_model_dc_vector_rnn():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            memory_size = 128
-            model = PPOModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=0),
-                use_recurrent=True,
-                m_size=memory_size,
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-                model.memory_out,
-            ]
-            feed_dict = {
-                model.batch_size: 1,
-                model.sequence_length: 2,
-                model.prev_action: [[0], [0]],
-                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.action_masks: np.ones([1, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_ppo_model_cc_vector_rnn():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            memory_size = 128
-            model = PPOModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=0),
-                use_recurrent=True,
-                m_size=memory_size,
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-                model.memory_out,
-            ]
-            feed_dict = {
-                model.batch_size: 1,
-                model.sequence_length: 2,
-                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.epsilon: np.array([[0, 1]]),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-

 def test_rl_functions():
    rewards = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)
    )


-def test_trainer_increment_step(dummy_config):
+@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
+def test_trainer_increment_step(ppo_optimizer, dummy_config):
+    mock_optimizer = mock.Mock()
+    mock_optimizer.reward_signals = {}
+    ppo_optimizer.return_value = mock_optimizer
+
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
    )

    trainer = PPOTrainer(
-        brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
+        brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
-    policy_mock = mock.Mock(spec=PPOPolicy)
+    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = (
        5
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

-    trainer = PPOTrainer(
-        mock_brain.brain_name, 0, trainer_params, True, False, 0, "0", False
-    )
+    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0


-def test_add_get_policy(dummy_config):
+@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
+def test_add_get_policy(ppo_optimizer, dummy_config):
+    mock_optimizer = mock.Mock()
+    mock_optimizer.reward_signals = {}
+    ppo_optimizer.return_value = mock_optimizer
+
-    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
-    policy = mock.Mock(spec=PPOPolicy)
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
+    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(brain_params, policy)
-
-
-def test_normalization(dummy_config):
-    brain_params = BrainParameters(
-        brain_name="test_brain",
-        vector_observation_space_size=1,
-        camera_resolutions=[],
-        vector_action_space_size=[2],
-        vector_action_descriptions=[],
-        vector_action_space_type=0,
-    )
-    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
-    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-    trainer = PPOTrainer(
-        brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
-    )
-    time_horizon = 6
-    trajectory = make_fake_trajectory(
-        length=time_horizon,
-        max_step_complete=True,
-        vec_obs_size=1,
-        num_vis_obs=0,
-        action_space=[2],
-    )
-    # Change half of the obs to 0
-    for i in range(3):
-        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
-    policy = trainer.create_policy(brain_params)
-    trainer.add_policy(brain_params.brain_name, policy)
-
-    trainer._process_trajectory(trajectory)
-
-    # Check that the running mean and variance is correct
-    steps, mean, variance = trainer.policy.sess.run(
-        [
-            trainer.policy.model.normalization_steps,
-            trainer.policy.model.running_mean,
-            trainer.policy.model.running_variance,
-        ]
-    )
-
-    assert steps == 6
-    assert mean[0] == 0.5
-    # Note: variance is divided by number of steps, and initialized to 1 to avoid
-    # divide by 0. The right answer is 0.25
-    assert (variance[0] - 1) / steps == 0.25
-
-    # Make another update, this time with all 1's
-    time_horizon = 10
-    trajectory = make_fake_trajectory(
-        length=time_horizon,
-        max_step_complete=True,
-        vec_obs_size=1,
-        num_vis_obs=0,
-        action_space=[2],
-    )
-    trainer._process_trajectory(trajectory)
-
-    # Check that the running mean and variance is correct
-    steps, mean, variance = trainer.policy.sess.run(
-        [
-            trainer.policy.model.normalization_steps,
-            trainer.policy.model.running_mean,
-            trainer.policy.model.running_variance,
-        ]
-    )
-
-    assert steps == 16
-    assert mean[0] == 0.8125
-    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
-
-
-def test_min_visual_size():
-    # Make sure each EncoderType has an entry in MIS_RESOLUTION_FOR_ENCODER
-    assert set(LearningModel.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)
-
-    for encoder_type in EncoderType:
-        with tf.Graph().as_default():
-            good_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
-            good_res = CameraResolution(
-                width=good_size, height=good_size, num_channels=3
-            )
-            LearningModel._check_resolution_for_encoder(good_res, encoder_type)
-            vis_input = LearningModel.create_visual_input(
-                good_res, "test_min_visual_size"
-            )
-            enc_func = LearningModel.get_encoder_for_type(encoder_type)
-            enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)
-
-        # Anything under the min size should raise an exception. If not, decrease the min size!
-        with pytest.raises(Exception):
-            with tf.Graph().as_default():
-                bad_size = LearningModel.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
-                bad_res = CameraResolution(
-                    width=bad_size, height=bad_size, num_channels=3
-                )
-
-                with pytest.raises(UnityTrainerException):
-                    # Make sure we'd hit a friendly error during model setup time.
-                    LearningModel._check_resolution_for_encoder(bad_res, encoder_type)
-
-                vis_input = LearningModel.create_visual_input(
-                    bad_res, "test_min_visual_size"
-                )
-                enc_func = LearningModel.get_encoder_for_type(encoder_type)
-                enc_func(vis_input, 32, LearningModel.swish, 1, "test", False)


 if __name__ == "__main__":
--- a/ml-agents/mlagents/trainers/tests/test_reward_signals.py
+++ b/ml-agents/mlagents/trainers/tests/test_reward_signals.py
 import yaml
 import os
 import mlagents.trainers.tests.mock_brain as mb
-from mlagents.trainers.ppo.policy import PPOPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.sac.optimizer import SACOptimizer
+from mlagents.trainers.ppo.optimizer import PPOOptimizer


 def ppo_dummy_config():
 NUM_AGENTS = 12


-def create_policy_mock(
+def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
 ):
    mock_brain = mb.setup_mock_brain(
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["reward_signals"].update(reward_signal_config)
    trainer_parameters["use_recurrent"] = use_rnn
-    if trainer_config["trainer"] == "ppo":
-        policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False)
+    policy = NNPolicy(
+        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
+    )
+    if trainer_parameters["trainer"] == "sac":
+        optimizer = SACOptimizer(policy, trainer_parameters)
-        policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
-    return policy
+        optimizer = PPOOptimizer(policy, trainer_parameters)
+    return optimizer
-def reward_signal_eval(policy, reward_signal_name):
-    buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain)
+def reward_signal_eval(optimizer, reward_signal_name):
+    buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.brain)
-    rsig_result = policy.reward_signals[reward_signal_name].evaluate_batch(buffer)
+    rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
-def reward_signal_update(policy, reward_signal_name):
-    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
-    feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
-        policy.model, buffer.make_mini_batch(0, 10), 2
+def reward_signal_update(optimizer, reward_signal_name):
+    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
+    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
+        optimizer.policy, buffer.make_mini_batch(0, 10), 2
-    out = policy._execute_model(
-        feed_dict, policy.reward_signals[reward_signal_name].update_dict
+    out = optimizer.policy._execute_model(
+        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict

 )
 def test_gail_cc(trainer_config, gail_dummy_config):
-    policy = create_policy_mock(trainer_config, gail_dummy_config, False, False, False)
-    reward_signal_eval(policy, "gail")
-    reward_signal_update(policy, "gail")
+    optimizer = create_optimizer_mock(
+        trainer_config, gail_dummy_config, False, False, False
+    )
+    reward_signal_eval(optimizer, "gail")
+    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    gail_dummy_config["gail"]["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
    )
-    policy = create_policy_mock(trainer_config, gail_dummy_config, False, True, True)
-    reward_signal_eval(policy, "gail")
-    reward_signal_update(policy, "gail")
+    optimizer = create_optimizer_mock(
+        trainer_config, gail_dummy_config, False, True, True
+    )
+    reward_signal_eval(optimizer, "gail")
+    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
-    policy = create_policy_mock(trainer_config, gail_dummy_config, True, False, False)
+    policy = create_optimizer_mock(
+        trainer_config, gail_dummy_config, True, False, False
+    )
    reward_signal_eval(policy, "gail")
    reward_signal_update(policy, "gail")

 )
 def test_curiosity_cc(trainer_config, curiosity_dummy_config):
-    policy = create_policy_mock(
+    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "curiosity")
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_dc(trainer_config, curiosity_dummy_config):
-    policy = create_policy_mock(
+    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, True, False
    )
    reward_signal_eval(policy, "curiosity")
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_visual(trainer_config, curiosity_dummy_config):
-    policy = create_policy_mock(
+    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, True
    )
    reward_signal_eval(policy, "curiosity")
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_rnn(trainer_config, curiosity_dummy_config):
-    policy = create_policy_mock(
+    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, True, False, False
    )
    reward_signal_eval(policy, "curiosity")
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_extrinsic(trainer_config, curiosity_dummy_config):
-    policy = create_policy_mock(
+    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "extrinsic")
--- a/ml-agents/mlagents/trainers/tests/test_sac.py
+++ b/ml-agents/mlagents/trainers/tests/test_sac.py
 from unittest import mock
 import yaml

-import numpy as np
-from mlagents.trainers.sac.models import SACModel
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.sac.optimizer import SACOptimizer
+from mlagents.trainers.common.nn_policy import NNPolicy
-from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.tests import mock_brain as mb
 from mlagents.trainers.tests.mock_brain import make_brain_parameters
 from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
        init_entcoef: 0.1
        learning_rate: 3.0e-4
        max_steps: 1024
-        memory_size: 8
+        memory_size: 10
        normalize: false
        num_update: 1
        train_interval: 1
 VECTOR_ACTION_SPACE = [2]
 VECTOR_OBS_SPACE = 8
 DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 32
+BUFFER_INIT_SAMPLES = 64
-def create_sac_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
+def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["use_recurrent"] = use_rnn
-    policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
-    return policy
+    policy = NNPolicy(
+        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
+    )
+    optimizer = SACOptimizer(policy, trainer_parameters)
+    return optimizer
-def test_sac_cc_policy(dummy_config):
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
-    policy = create_sac_policy_mock(
-        dummy_config, use_rnn=False, use_discrete=False, use_visual=False
+    optimizer = create_sac_optimizer_mock(
+        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
-    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
-
-    run_out = policy.evaluate(step, list(step.agent_id))
-    assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
-
-    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
+    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
-    policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
+    optimizer.update(
+        update_buffer,
+        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
+    )


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
    dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
    dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
    dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
-    policy = create_sac_policy_mock(
+    optimizer = create_sac_optimizer_mock(
-    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
+    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
-    policy.update_reward_signals(
+    optimizer.update_reward_signals(
-def test_sac_dc_policy(dummy_config):
-    # Test evaluate
-    tf.reset_default_graph()
-    policy = create_sac_policy_mock(
-        dummy_config, use_rnn=False, use_discrete=True, use_visual=False
-    )
-    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
-
-    run_out = policy.evaluate(step, list(step.agent_id))
-    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
-
-    # Test update
-    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
-    # Mock out reward signal eval
-    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
-    policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
-
-
-def test_sac_visual_policy(dummy_config):
-    # Test evaluate
-    tf.reset_default_graph()
-    policy = create_sac_policy_mock(
-        dummy_config, use_rnn=False, use_discrete=True, use_visual=True
-    )
-    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
-    run_out = policy.evaluate(step, list(step.agent_id))
-    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
-
-    # Test update
-    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
-    # Mock out reward signal eval
-    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
-    run_out = policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
-    assert type(run_out) is dict
-
-
-def test_sac_rnn_policy(dummy_config):
-    # Test evaluate
-    tf.reset_default_graph()
-    policy = create_sac_policy_mock(
-        dummy_config, use_rnn=True, use_discrete=True, use_visual=False
-    )
-    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
-    run_out = policy.evaluate(step, list(step.agent_id))
-    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
-
-    # Test update
-    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
-    # Mock out reward signal eval
-    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
-    update_buffer = AgentBuffer()
-    buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
-    run_out = policy.update(
-        update_buffer,
-        num_sequences=update_buffer.num_experiences // policy.sequence_length,
-    )
-
-
-def test_sac_model_cc_vector():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = SACModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=0)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [model.output, model.value, model.entropy, model.learning_rate]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_sac_model_cc_visual():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = SACModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=2)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [model.output, model.value, model.entropy, model.learning_rate]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_sac_model_dc_visual():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = SACModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=2)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [model.output, model.value, model.entropy, model.learning_rate]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
-                model.action_masks: np.ones([2, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_sac_model_dc_vector():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            model = SACModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=0)
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [model.output, model.value, model.entropy, model.learning_rate]
-            feed_dict = {
-                model.batch_size: 2,
-                model.sequence_length: 1,
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.action_masks: np.ones([2, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_sac_model_dc_vector_rnn():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            memory_size = 128
-            model = SACModel(
-                make_brain_parameters(discrete_action=True, visual_inputs=0),
-                use_recurrent=True,
-                m_size=memory_size,
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-                model.memory_out,
-            ]
-            feed_dict = {
-                model.batch_size: 1,
-                model.sequence_length: 2,
-                model.prev_action: [[0], [0]],
-                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-                model.action_masks: np.ones([1, 2], dtype=np.float32),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
-def test_sac_model_cc_vector_rnn():
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        with tf.variable_scope("FakeGraphScope"):
-            memory_size = 128
-            model = SACModel(
-                make_brain_parameters(discrete_action=False, visual_inputs=0),
-                use_recurrent=True,
-                m_size=memory_size,
-            )
-            init = tf.global_variables_initializer()
-            sess.run(init)
-
-            run_list = [
-                model.output,
-                model.all_log_probs,
-                model.value,
-                model.entropy,
-                model.learning_rate,
-                model.memory_out,
-            ]
-            feed_dict = {
-                model.batch_size: 1,
-                model.sequence_length: 2,
-                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
-                model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
-            }
-            sess.run(run_list, feed_dict=feed_dict)
-
-
 def test_sac_save_load_buffer(tmpdir, dummy_config):
    mock_brain = mb.setup_mock_brain(
        False,
    assert trainer2.update_buffer.num_experiences == buffer_len


-def test_add_get_policy(dummy_config):
+@mock.patch("mlagents.trainers.sac.trainer.SACOptimizer")
+def test_add_get_policy(sac_optimizer, dummy_config):
+    mock_optimizer = mock.Mock()
+    mock_optimizer.reward_signals = {}
+    sac_optimizer.return_value = mock_optimizer
+
-    policy = mock.Mock(spec=SACPolicy)
+    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
--- a/ml-agents/mlagents/trainers/tests/test_trainer_util.py
+++ b/ml-agents/mlagents/trainers/tests/test_trainer_util.py
    external_brains = {"testbrain": brain_params_mock}

    def mock_constructor(
-        self,
-        brain,
-        reward_buff_cap,
-        trainer_parameters,
-        training,
-        load,
-        seed,
-        run_id,
-        multi_gpu,
+        self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
    ):
        assert brain == brain_params_mock.brain_name
        assert trainer_parameters == expected_config
        assert seed == seed
        assert run_id == run_id
-        assert multi_gpu == multi_gpu

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainer_factory = trainer_util.TrainerFactory(
    expected_config["keep_checkpoints"] = keep_checkpoints

    def mock_constructor(
-        self,
-        brain,
-        reward_buff_cap,
-        trainer_parameters,
-        training,
-        load,
-        seed,
-        run_id,
-        multi_gpu,
+        self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
    ):
        assert brain == brain_params_mock.brain_name
        assert trainer_parameters == expected_config
        assert seed == seed
        assert run_id == run_id
-        assert multi_gpu == multi_gpu

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainer_factory = trainer_util.TrainerFactory(
--- a/ml-agents/mlagents/trainers/tf_policy.py
+++ b/ml-agents/mlagents/trainers/tf_policy.py
 import logging
 from typing import Any, Dict, List, Optional

+import abc
 import numpy as np

 from mlagents.tf_utils import tf
 from mlagents.trainers.policy import Policy
 from mlagents.trainers.action_info import ActionInfo
 from mlagents.trainers.trajectory import SplitObservations
-from mlagents.trainers.buffer import AgentBuffer
+from mlagents.trainers.models import ModelUtils


 logger = logging.getLogger("mlagents.trainers")
 class TFPolicy(Policy):
    """
    Contains a learning model, and the necessary
-    functions to interact with it to perform evaluate and updating.
+    functions to save/load models and create the input placeholders.
-    def __init__(self, seed, brain, trainer_parameters):
+    def __init__(self, seed, brain, trainer_parameters, load=False):
        """
        Initialized the policy.
        :param seed: Random seed to use for TensorFlow.
+        self._version_number_ = 2
+        self.m_size = 0
-        self.m_size = None
-        self.model = None
+
+        self.act_size = brain.vector_action_space_size
+        self.vec_obs_size = brain.vector_observation_space_size
+        self.vis_obs_size = brain.number_visual_observations
+
        self.use_recurrent = trainer_parameters["use_recurrent"]
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.reward_signals: Dict[str, "RewardSignal"] = {}
            config=tf_utils.generate_session_config(), graph=self.graph
        )
        self.saver = None
+        self.seed = seed
        if self.use_recurrent:
            self.m_size = trainer_parameters["memory_size"]
            self.sequence_length = trainer_parameters["sequence_length"]
                    "though the trainer uses recurrent.".format(brain.brain_name)
                )
-            elif self.m_size % 4 != 0:
+            elif self.m_size % 2 != 0:
-                    "but it must be divisible by 4.".format(
+                    "but it must be divisible by 2.".format(
+        self._initialize_tensorflow_references()
+        self.load = load
+
+    @abc.abstractmethod
+    def get_trainable_variables(self) -> List[tf.Variable]:
+        """
+        Returns a List of the trainable variables in this policy. if create_tf_graph hasn't been called,
+        returns empty list.
+        """
+        pass
+
+    @abc.abstractmethod
+    def create_tf_graph(self):
+        """
+        Builds the tensorflow graph needed for this policy.
+        """
+        pass

    def _initialize_graph(self):
        with self.graph.as_default():
                )
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)

+    def initialize_or_load(self):
+        if self.load:
+            self._load_graph()
+        else:
+            self._initialize_graph()
+
    def get_weights(self):
        with self.graph.as_default():
            _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    def fill_eval_dict(self, feed_dict, batched_step_result):
        vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
        for i, _ in enumerate(vec_vis_obs.visual_observations):
-            feed_dict[self.model.visual_in[i]] = vec_vis_obs.visual_observations[i]
+            feed_dict[self.visual_in[i]] = vec_vis_obs.visual_observations[i]
-            feed_dict[self.model.vector_in] = vec_vis_obs.vector_observations
+            feed_dict[self.vector_in] = vec_vis_obs.vector_observations
        if not self.use_continuous_act:
            mask = np.ones(
                (
            )
            if batched_step_result.action_mask is not None:
                mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
-            feed_dict[self.model.action_masks] = mask
+            feed_dict[self.action_masks] = mask
        return feed_dict

    def make_empty_memory(self, num_agents):
        Gets current model step.
        :return: current model step.
        """
-        step = self.sess.run(self.model.global_step)
+        step = self.sess.run(self.global_step)
        return step

    def increment_step(self, n_steps):
        out_dict = {
-            "global_step": self.model.global_step,
-            "increment_step": self.model.increment_step,
+            "global_step": self.global_step,
+            "increment_step": self.increment_step_op,
-        feed_dict = {self.model.steps_to_increment: n_steps}
+        feed_dict = {self.steps_to_increment: n_steps}
        return self.sess.run(out_dict, feed_dict=feed_dict)["global_step"]

    def get_inference_vars(self):
        """
        if self.use_vec_obs and self.normalize:
            self.sess.run(
-                self.model.update_normalization,
-                feed_dict={self.model.vector_in: vector_obs},
+                self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
-    def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
-        feed_dict: Dict[tf.Tensor, Any] = {
-            self.model.batch_size: batch.num_experiences,
-            self.model.sequence_length: 1,  # We want to feed data in batch-wise, not time-wise.
-        }
+    @property
+    def use_vis_obs(self):
+        return self.vis_obs_size > 0
-        if self.use_vec_obs:
-            feed_dict[self.model.vector_in] = batch["vector_obs"]
-        if self.model.vis_obs_size > 0:
-            for i in range(len(self.model.visual_in)):
-                _obs = batch["visual_obs%d" % i]
-                feed_dict[self.model.visual_in[i]] = _obs
-        if self.use_recurrent:
-            feed_dict[self.model.memory_in] = batch["memory"]
-        if not self.use_continuous_act and self.use_recurrent:
-            feed_dict[self.model.prev_action] = batch["prev_action"]
-        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
-        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
+    @property
+    def use_vec_obs(self):
+        return self.vec_obs_size > 0
-        return value_estimates
+    def _initialize_tensorflow_references(self):
+        self.value_heads: Dict[str, tf.Tensor] = {}
+        self.normalization_steps: Optional[tf.Variable] = None
+        self.running_mean: Optional[tf.Variable] = None
+        self.running_variance: Optional[tf.Variable] = None
+        self.update_normalization_op: Optional[tf.Operation] = None
+        self.value: Optional[tf.Tensor] = None
+        self.all_log_probs: tf.Tensor = None
+        self.log_probs: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.action_oh: tf.Tensor = None
+        self.output_pre: Optional[tf.Tensor] = None
+        self.output: Optional[tf.Tensor] = None
+        self.selected_actions: Optional[tf.Tensor] = None
+        self.action_holder: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.memory_in: Optional[tf.Tensor] = None
+        self.memory_out: Optional[tf.Tensor] = None
-    def get_value_estimates(
-        self, next_obs: List[np.ndarray], agent_id: str, done: bool
-    ) -> Dict[str, float]:
-        """
-        Generates value estimates for bootstrapping.
-        :param experience: AgentExperience to be used for bootstrapping.
-        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
-        :return: The value estimate dictionary with key being the name of the reward signal and the value the
-        corresponding value estimate.
-        """
-
-        feed_dict: Dict[tf.Tensor, Any] = {
-            self.model.batch_size: 1,
-            self.model.sequence_length: 1,
-        }
-        vec_vis_obs = SplitObservations.from_observations(next_obs)
-        for i in range(len(vec_vis_obs.visual_observations)):
-            feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
-
-        if self.use_vec_obs:
-            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
-        if self.use_recurrent:
-            feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
-        if not self.use_continuous_act and self.use_recurrent:
-            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
-                [agent_id]
+    def create_input_placeholders(self):
+        with self.graph.as_default():
+            self.global_step, self.increment_step_op, self.steps_to_increment = (
+                ModelUtils.create_global_steps()
+            )
+            self.visual_in = ModelUtils.create_visual_input_placeholders(
+                self.brain.camera_resolutions
-        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
-
-        value_estimates = {k: float(v) for k, v in value_estimates.items()}
-
-        # If we're done, reassign all of the value estimates that need terminal states.
-        if done:
-            for k in value_estimates:
-                if self.reward_signals[k].use_terminal_states:
-                    value_estimates[k] = 0.0
-
-        return value_estimates
-
-    @property
-    def vis_obs_size(self):
-        return self.model.vis_obs_size
-
-    @property
-    def vec_obs_size(self):
-        return self.model.vec_obs_size
+            self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)
+            if self.normalize:
+                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
+                self.update_normalization_op = normalization_tensors.update_op
+                self.normalization_steps = normalization_tensors.steps
+                self.running_mean = normalization_tensors.running_mean
+                self.running_variance = normalization_tensors.running_variance
+                self.processed_vector_in = ModelUtils.normalize_vector_obs(
+                    self.vector_in,
+                    self.running_mean,
+                    self.running_variance,
+                    self.normalization_steps,
+                )
+            else:
+                self.processed_vector_in = self.vector_in
+                self.update_normalization_op = None
-    @property
-    def use_vis_obs(self):
-        return self.model.vis_obs_size > 0
+            self.batch_size_ph = tf.placeholder(
+                shape=None, dtype=tf.int32, name="batch_size"
+            )
+            self.sequence_length_ph = tf.placeholder(
+                shape=None, dtype=tf.int32, name="sequence_length"
+            )
+            self.mask_input = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="masks"
+            )
+            # Only needed for PPO, but needed for BC module
+            self.epsilon = tf.placeholder(
+                shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
+            )
+            self.mask = tf.cast(self.mask_input, tf.int32)
-    @property
-    def use_vec_obs(self):
-        return self.model.vec_obs_size > 0
+            tf.Variable(
+                int(self.brain.vector_action_space_type == "continuous"),
+                name="is_continuous_control",
+                trainable=False,
+                dtype=tf.int32,
+            )
+            tf.Variable(
+                self._version_number_,
+                name="version_number",
+                trainable=False,
+                dtype=tf.int32,
+            )
+            tf.Variable(
+                self.m_size, name="memory_size", trainable=False, dtype=tf.int32
+            )
+            if self.brain.vector_action_space_type == "continuous":
+                tf.Variable(
+                    self.act_size[0],
+                    name="action_output_shape",
+                    trainable=False,
+                    dtype=tf.int32,
+                )
+            else:
+                tf.Variable(
+                    sum(self.act_size),
+                    name="action_output_shape",
+                    trainable=False,
+                    dtype=tf.int32,
+                )
--- a/ml-agents/mlagents/trainers/trainer.py
+++ b/ml-agents/mlagents/trainers/trainer.py

 from collections import deque

-from mlagents_envs.exception import UnityException
 from mlagents_envs.timers import set_gauge
 from mlagents.model_serialization import export_policy_model, SerializationSettings
 from mlagents.trainers.tf_policy import TFPolicy
 from mlagents.trainers.brain import BrainParameters
 from mlagents.trainers.policy import Policy
+from mlagents.trainers.exception import UnityTrainerException
-
-
-class UnityTrainerException(UnityException):
-    """
-    Related to errors with the Trainer.
-    """
-
-    pass


 class Trainer(abc.ABC):
--- a/ml-agents/mlagents/trainers/trainer_util.py
+++ b/ml-agents/mlagents/trainers/trainer_util.py

 from mlagents.trainers.meta_curriculum import MetaCurriculum
 from mlagents.trainers.exception import TrainerConfigError
-from mlagents.trainers.trainer import Trainer, UnityTrainerException
+from mlagents.trainers.trainer import Trainer
+from mlagents.trainers.exception import UnityTrainerException
 from mlagents.trainers.ppo.trainer import PPOTrainer
 from mlagents.trainers.sac.trainer import SACTrainer
 from mlagents.trainers.ghost.trainer import GhostTrainer
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
-    :param multi_gpu: Whether to use multi-GPU training
    :return:
    """
    if "default" not in trainer_config and brain_name not in trainer_config:
            load_model,
            seed,
            run_id,
-            multi_gpu,
        )
    elif trainer_type == "sac":
        trainer = SACTrainer(
--- a/com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs.meta
+++ b/com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs.meta
 fileFormatVersion: 2
-guid: b651f66c75a1646c6ab48de06d0e13ef
+guid: a5e0cbcbc514b473399c262dd37541ea
 MonoImporter:
  externalObjects: {}
  serializedVersion: 2
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs.meta
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs.meta
 fileFormatVersion: 2
-guid: 50f710d360a49461cad67ff5e6bcefe1
+guid: f2902496c0120472b90269f94a0aec7e
 MonoImporter:
  externalObjects: {}
  serializedVersion: 2
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
    /// <summary>
    /// Responsible for writing demonstration data to stream (usually a file stream).
    /// </summary>
-    public class DemonstrationStore
+    public class DemonstrationWriter
    {
        public const int MetaDataBytes = 32; // Number of bytes allocated to metadata in demo file.

        WriteAdapter m_WriteAdapter = new WriteAdapter();

        /// <summary>
-        /// Create a DemonstrationStore that will write to the specified stream.
+        /// Create a DemonstrationWriter that will write to the specified stream.
-        public DemonstrationStore(Stream stream)
+        public DemonstrationWriter(Stream stream)
        {
            m_Writer = stream;
        }
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs.meta
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs.meta
 fileFormatVersion: 2
-guid: a79c7ccb2cd042b5b1e710b9588d921b
+guid: ebaf7878a8cc74ee3aae07daf9e1b6f2
 MonoImporter:
  externalObjects: {}
  serializedVersion: 2
--- a/com.unity.ml-agents/Runtime/Demonstrations.meta
+++ b/com.unity.ml-agents/Runtime/Demonstrations.meta
+fileFormatVersion: 2
+guid: 85e02c21d231b4f5fa0c5f87e5f907a2
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
--- a/ml-agents/mlagents/trainers/ppo/optimizer.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer.py
+import logging
+from typing import Optional, Any, Dict
+
+import numpy as np
+from mlagents.tf_utils import tf
+from mlagents_envs.timers import timed
+from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.common.tf_optimizer import TFOptimizer
+from mlagents.trainers.buffer import AgentBuffer
+
+
+logger = logging.getLogger("mlagents.trainers")
+
+
+class PPOOptimizer(TFOptimizer):
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
+        The PPO optimizer has a value estimator and a loss function.
+        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
+        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
+        """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
+
+        with policy.graph.as_default():
+            with tf.variable_scope("optimizer/"):
+                super().__init__(policy, trainer_params)
+
+                lr = float(trainer_params["learning_rate"])
+                lr_schedule = LearningRateSchedule(
+                    trainer_params.get("learning_rate_schedule", "linear")
+                )
+                h_size = int(trainer_params["hidden_units"])
+                epsilon = float(trainer_params["epsilon"])
+                beta = float(trainer_params["beta"])
+                max_step = float(trainer_params["max_steps"])
+                num_layers = int(trainer_params["num_layers"])
+                vis_encode_type = EncoderType(
+                    trainer_params.get("vis_encode_type", "simple")
+                )
+                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
+
+                self.stream_names = list(self.reward_signals.keys())
+
+                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
+                self.grads = None
+                self.update_batch: Optional[tf.Operation] = None
+
+                self.stats_name_to_update_name = {
+                    "Losses/Value Loss": "value_loss",
+                    "Losses/Policy Loss": "policy_loss",
+                    "Policy/Learning Rate": "learning_rate",
+                }
+                if self.policy.use_recurrent:
+                    self.m_size = self.policy.m_size
+                    self.memory_in = tf.placeholder(
+                        shape=[None, self.m_size],
+                        dtype=tf.float32,
+                        name="recurrent_value_in",
+                    )
+
+                if num_layers < 1:
+                    num_layers = 1
+                if policy.use_continuous_act:
+                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
+                else:
+                    self._create_dc_critic(h_size, num_layers, vis_encode_type)
+
+                self.learning_rate = ModelUtils.create_learning_rate(
+                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                )
+                self._create_losses(
+                    self.policy.log_probs,
+                    self.old_log_probs,
+                    self.value_heads,
+                    self.policy.entropy,
+                    beta,
+                    epsilon,
+                    lr,
+                    max_step,
+                )
+                self._create_ppo_optimizer_ops()
+
+            self.update_dict.update(
+                {
+                    "value_loss": self.value_loss,
+                    "policy_loss": self.abs_policy_loss,
+                    "update_batch": self.update_batch,
+                    "learning_rate": self.learning_rate,
+                }
+            )
+
+            self.policy.initialize_or_load()
+
+    def _create_cc_critic(
+        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
+    ) -> None:
+        """
+        Creates Continuous control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: The type of visual encoder to use.
+        """
+        hidden_stream = ModelUtils.create_observation_streams(
+            self.policy.visual_in,
+            self.policy.processed_vector_in,
+            1,
+            h_size,
+            num_layers,
+            vis_encode_type,
+        )[0]
+
+        if self.policy.use_recurrent:
+            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
+                hidden_stream,
+                self.memory_in,
+                self.policy.sequence_length_ph,
+                name="lstm_value",
+            )
+            self.memory_out = memory_value_out
+        else:
+            hidden_value = hidden_stream
+
+        self.value_heads, self.value = ModelUtils.create_value_heads(
+            self.stream_names, hidden_value
+        )
+        self.all_old_log_probs = tf.placeholder(
+            shape=[None, 1], dtype=tf.float32, name="old_probabilities"
+        )
+
+        self.old_log_probs = tf.reduce_sum(
+            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
+        )
+
+    def _create_dc_critic(
+        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
+    ) -> None:
+        """
+        Creates Discrete control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: The type of visual encoder to use.
+        """
+        hidden_stream = ModelUtils.create_observation_streams(
+            self.policy.visual_in,
+            self.policy.processed_vector_in,
+            1,
+            h_size,
+            num_layers,
+            vis_encode_type,
+        )[0]
+
+        if self.policy.use_recurrent:
+            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
+                hidden_stream,
+                self.memory_in,
+                self.policy.sequence_length_ph,
+                name="lstm_value",
+            )
+            self.memory_out = memory_value_out
+        else:
+            hidden_value = hidden_stream
+
+        self.value_heads, self.value = ModelUtils.create_value_heads(
+            self.stream_names, hidden_value
+        )
+
+        self.all_old_log_probs = tf.placeholder(
+            shape=[None, sum(self.policy.act_size)],
+            dtype=tf.float32,
+            name="old_probabilities",
+        )
+        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
+            self.all_old_log_probs, self.policy.action_masks, self.policy.act_size
+        )
+
+        action_idx = [0] + list(np.cumsum(self.policy.act_size))
+
+        self.old_log_probs = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        -tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=self.policy.action_oh[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                            logits=old_normalized_logits[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.policy.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+            keepdims=True,
+        )
+
+    def _create_losses(
+        self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
+    ):
+        """
+        Creates training-specific Tensorflow ops for PPO models.
+        :param probs: Current policy probabilities
+        :param old_probs: Past policy probabilities
+        :param value_heads: Value estimate tensors from each value stream
+        :param beta: Entropy regularization strength
+        :param entropy: Current policy entropy
+        :param epsilon: Value for policy-divergence threshold
+        :param lr: Learning rate
+        :param max_step: Total number of training steps.
+        """
+        self.returns_holders = {}
+        self.old_values = {}
+        for name in value_heads.keys():
+            returns_holder = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_returns".format(name)
+            )
+            old_value = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
+            )
+            self.returns_holders[name] = returns_holder
+            self.old_values[name] = old_value
+        self.advantage = tf.placeholder(
+            shape=[None], dtype=tf.float32, name="advantages"
+        )
+        advantage = tf.expand_dims(self.advantage, -1)
+
+        decay_epsilon = tf.train.polynomial_decay(
+            epsilon, self.policy.global_step, max_step, 0.1, power=1.0
+        )
+        decay_beta = tf.train.polynomial_decay(
+            beta, self.policy.global_step, max_step, 1e-5, power=1.0
+        )
+
+        value_losses = []
+        for name, head in value_heads.items():
+            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
+                tf.reduce_sum(head, axis=1) - self.old_values[name],
+                -decay_epsilon,
+                decay_epsilon,
+            )
+            v_opt_a = tf.squared_difference(
+                self.returns_holders[name], tf.reduce_sum(head, axis=1)
+            )
+            v_opt_b = tf.squared_difference(
+                self.returns_holders[name], clipped_value_estimate
+            )
+            value_loss = tf.reduce_mean(
+                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
+                    1
+                ]
+            )
+            value_losses.append(value_loss)
+        self.value_loss = tf.reduce_mean(value_losses)
+
+        r_theta = tf.exp(probs - old_probs)
+        p_opt_a = r_theta * advantage
+        p_opt_b = (
+            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
+            * advantage
+        )
+        self.policy_loss = -tf.reduce_mean(
+            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
+        )
+        # For cleaner stats reporting
+        self.abs_policy_loss = tf.abs(self.policy_loss)
+
+        self.loss = (
+            self.policy_loss
+            + 0.5 * self.value_loss
+            - decay_beta
+            * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
+        )
+
+    def _create_ppo_optimizer_ops(self):
+        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
+        self.grads = self.tf_optimizer.compute_gradients(self.loss)
+        self.update_batch = self.tf_optimizer.minimize(self.loss)
+
+    @timed
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Performs update on model.
+        :param mini_batch: Batch of experiences.
+        :param num_sequences: Number of sequences to process.
+        :return: Results of update.
+        """
+        feed_dict = self._construct_feed_dict(batch, num_sequences)
+        stats_needed = self.stats_name_to_update_name
+        update_stats = {}
+        # Collect feed dicts for all reward signals.
+        for _, reward_signal in self.reward_signals.items():
+            feed_dict.update(
+                reward_signal.prepare_update(self.policy, batch, num_sequences)
+            )
+            stats_needed.update(reward_signal.stats_name_to_update_name)
+
+        update_vals = self._execute_model(feed_dict, self.update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        return update_stats
+
+    def _construct_feed_dict(
+        self, mini_batch: AgentBuffer, num_sequences: int
+    ) -> Dict[tf.Tensor, Any]:
+        # Do an optional burn-in for memories
+        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
+        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask = np.tile(burn_in_mask, num_sequences)
+        feed_dict = {
+            self.policy.batch_size_ph: num_sequences,
+            self.policy.sequence_length_ph: self.policy.sequence_length,
+            self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
+            self.advantage: mini_batch["advantages"],
+            self.all_old_log_probs: mini_batch["action_probs"],
+        }
+        for name in self.reward_signals:
+            feed_dict[self.returns_holders[name]] = mini_batch[
+                "{}_returns".format(name)
+            ]
+            feed_dict[self.old_values[name]] = mini_batch[
+                "{}_value_estimates".format(name)
+            ]
+
+        if self.policy.output_pre is not None and "actions_pre" in mini_batch:
+            feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
+        else:
+            feed_dict[self.policy.action_holder] = mini_batch["actions"]
+            if self.policy.use_recurrent:
+                feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
+            feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
+        if "vector_obs" in mini_batch:
+            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
+        if self.policy.vis_obs_size > 0:
+            for i, _ in enumerate(self.policy.visual_in):
+                feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
+        if self.policy.use_recurrent:
+            feed_dict[self.policy.memory_in] = [
+                mini_batch["memory"][i]
+                for i in range(
+                    0, len(mini_batch["memory"]), self.policy.sequence_length
+                )
+            ]
+            feed_dict[self.memory_in] = self._make_zero_mem(
+                self.m_size, mini_batch.num_experiences
+            )
+        return feed_dict
--- a/ml-agents/mlagents/trainers/sac/network.py
+++ b/ml-agents/mlagents/trainers/sac/network.py
+import logging
+from typing import Dict, Optional
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.models import ModelUtils, EncoderType
+
+LOG_STD_MAX = 2
+LOG_STD_MIN = -20
+EPSILON = 1e-6  # Small value to avoid divide by zero
+DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05
+CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.
+
+LOGGER = logging.getLogger("mlagents.trainers")
+
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACNetwork:
+    """
+    Base class for an SAC network. Implements methods for creating the actor and critic heads.
+    """
+
+    def __init__(
+        self,
+        policy=None,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        self.normalize = normalize
+        self.use_recurrent = use_recurrent
+        self.num_layers = num_layers
+        self.stream_names = stream_names
+        self.h_size = h_size
+        self.activ_fn = ModelUtils.swish
+
+        self.sequence_length_ph = tf.placeholder(
+            shape=None, dtype=tf.int32, name="sac_sequence_length"
+        )
+
+        self.policy_memory_in: Optional[tf.Tensor] = None
+        self.policy_memory_out: Optional[tf.Tensor] = None
+        self.value_memory_in: Optional[tf.Tensor] = None
+        self.value_memory_out: Optional[tf.Tensor] = None
+        self.q1: Optional[tf.Tensor] = None
+        self.q2: Optional[tf.Tensor] = None
+        self.q1_p: Optional[tf.Tensor] = None
+        self.q2_p: Optional[tf.Tensor] = None
+        self.q1_memory_in: Optional[tf.Tensor] = None
+        self.q2_memory_in: Optional[tf.Tensor] = None
+        self.q1_memory_out: Optional[tf.Tensor] = None
+        self.q2_memory_out: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.external_action_in: Optional[tf.Tensor] = None
+        self.log_sigma_sq: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.deterministic_output: Optional[tf.Tensor] = None
+        self.normalized_logprobs: Optional[tf.Tensor] = None
+        self.action_probs: Optional[tf.Tensor] = None
+        self.output_oh: Optional[tf.Tensor] = None
+        self.output_pre: Optional[tf.Tensor] = None
+
+        self.value_vars = None
+        self.q_vars = None
+        self.critic_vars = None
+        self.policy_vars = None
+
+        self.q1_heads: Dict[str, tf.Tensor] = None
+        self.q2_heads: Dict[str, tf.Tensor] = None
+        self.q1_pheads: Dict[str, tf.Tensor] = None
+        self.q2_pheads: Dict[str, tf.Tensor] = None
+
+        self.policy = policy
+
+    def get_vars(self, scope):
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
+
+    def join_scopes(self, scope_1, scope_2):
+        """
+        Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't
+        exist, don't add any backslashes)
+        """
+        if not scope_1:
+            return scope_2
+        if not scope_2:
+            return scope_1
+        else:
+            return "/".join(filter(None, [scope_1, scope_2]))
+
+    def create_value_heads(self, stream_names, hidden_input):
+        """
+        Creates one value estimator head for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        """
+        self.value_heads = {}
+        for name in stream_names:
+            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
+            self.value_heads[name] = value
+        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+
+    def _create_cc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
+        if create_qs:
+            hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)
+            hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_q,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+            )
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_qp,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+            )
+            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
+        self.critic_vars = self.get_vars(scope)
+
+    def _create_dc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars("/".join([scope, "value"]))
+
+        if create_qs:
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                num_outputs=sum(self.policy.act_size),
+            )
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+                num_outputs=sum(self.policy.act_size),
+            )
+            self.q_vars = self.get_vars(scope)
+        self.critic_vars = self.get_vars(scope)
+
+    def create_sac_value_head(
+        self, stream_names, hidden_input, num_layers, h_size, scope
+    ):
+        """
+        Creates one value estimator head for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        :param num_layers: Number of hidden layers for value network
+        :param h_size: size of hidden layers for value network
+        :param scope: TF scope for value network.
+        """
+        with tf.variable_scope(scope):
+            value_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
+            )
+            if self.use_recurrent:
+                value_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    value_hidden,
+                    self.value_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_value",
+                )
+                self.value_memory_out = memory_out
+            self.create_value_heads(stream_names, value_hidden)
+
+    def create_q_heads(
+        self,
+        stream_names,
+        hidden_input,
+        num_layers,
+        h_size,
+        scope,
+        reuse=False,
+        num_outputs=1,
+    ):
+        """
+        Creates two q heads for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        :param num_layers: Number of hidden layers for Q network
+        :param h_size: size of hidden layers for Q network
+        :param scope: TF scope for Q network.
+        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
+        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
+        """
+        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
+            q1_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
+            )
+            if self.use_recurrent:
+                q1_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    q1_hidden,
+                    self.q1_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q1",
+                )
+                self.q1_memory_out = memory_out
+
+            q1_heads = {}
+            for name in stream_names:
+                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
+                q1_heads[name] = _q1
+
+            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
+        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
+            q2_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
+            )
+            if self.use_recurrent:
+                q2_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    q2_hidden,
+                    self.q2_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q2",
+                )
+                self.q2_memory_out = memory_out
+
+            q2_heads = {}
+            for name in stream_names:
+                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
+                q2_heads[name] = _q2
+
+            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
+
+        return q1_heads, q2_heads, q1, q2
+
+
+class SACTargetNetwork(SACNetwork):
+    """
+    Instantiation for the SAC target network. Only contains a single
+    value estimator and is updated from the Policy Network.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        with tf.variable_scope(TARGET_SCOPE):
+            self.visual_in = ModelUtils.create_visual_input_placeholders(
+                policy.brain.camera_resolutions
+            )
+            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
+            if self.policy.normalize:
+                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
+                self.update_normalization_op = normalization_tensors.update_op
+                self.normalization_steps = normalization_tensors.steps
+                self.running_mean = normalization_tensors.running_mean
+                self.running_variance = normalization_tensors.running_variance
+                self.processed_vector_in = ModelUtils.normalize_vector_obs(
+                    self.vector_in,
+                    self.running_mean,
+                    self.running_variance,
+                    self.normalization_steps,
+                )
+            else:
+                self.processed_vector_in = self.vector_in
+                self.update_normalization_op = None
+
+            if self.policy.use_recurrent:
+                self.memory_in = tf.placeholder(
+                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
+                )
+                self.value_memory_in = self.memory_in
+            hidden_streams = ModelUtils.create_observation_streams(
+                self.visual_in,
+                self.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["critic/value/"],
+            )
+        if self.policy.use_continuous_act:
+            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        else:
+            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        if self.use_recurrent:
+            self.memory_out = tf.concat(
+                self.value_memory_out, axis=1
+            )  # Needed for Barracuda to work
+
+    def copy_normalization(self, mean, variance, steps):
+        """
+        Copies the mean, variance, and steps into the normalizers of the
+        input of this SACNetwork. Used to copy the normalizer from the policy network
+        to the target network.
+        param mean: Tensor containing the mean.
+        param variance: Tensor containing the variance
+        param steps: Tensor containing the number of steps.
+        """
+        update_mean = tf.assign(self.running_mean, mean)
+        update_variance = tf.assign(self.running_variance, variance)
+        update_norm_step = tf.assign(self.normalization_steps, steps)
+        return tf.group([update_mean, update_variance, update_norm_step])
+
+
+class SACPolicyNetwork(SACNetwork):
+    """
+    Instantiation for SAC policy network. Contains a dual Q estimator,
+    a value estimator, and a reference to the actual policy network.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        if self.policy.use_recurrent:
+            self._create_memory_ins(m_size)
+
+        hidden_critic = self._create_observation_in(vis_encode_type)
+        self.policy.output = self.policy.output
+        # Use the sequence length of the policy
+        self.sequence_length_ph = self.policy.sequence_length_ph
+
+        if self.policy.use_continuous_act:
+            self._create_cc_critic(hidden_critic, POLICY_SCOPE)
+
+        else:
+            self._create_dc_critic(hidden_critic, POLICY_SCOPE)
+
+        if self.use_recurrent:
+            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
+            self.memory_out = tf.concat(mem_outs, axis=1)
+
+    def _create_memory_ins(self, m_size):
+        """
+        Creates the memory input placeholders for LSTM.
+        :param m_size: the total size of the memory.
+        """
+        self.memory_in = tf.placeholder(
+            shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
+        )
+
+        # Re-break-up for each network
+        num_mems = 3
+        input_size = self.memory_in.get_shape().as_list()[1]
+        mem_ins = []
+        for i in range(num_mems):
+            _start = input_size // num_mems * i
+            _end = input_size // num_mems * (i + 1)
+            mem_ins.append(self.memory_in[:, _start:_end])
+        self.value_memory_in = mem_ins[0]
+        self.q1_memory_in = mem_ins[1]
+        self.q2_memory_in = mem_ins[2]
+
+    def _create_observation_in(self, vis_encode_type):
+        """
+        Creates the observation inputs, and a CNN if needed,
+        :param vis_encode_type: Type of CNN encoder.
+        :param share_ac_cnn: Whether or not to share the actor and critic CNNs.
+        :return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
+        once and thrown away.
+        """
+        with tf.variable_scope(POLICY_SCOPE):
+            hidden_streams = ModelUtils.create_observation_streams(
+                self.policy.visual_in,
+                self.policy.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["critic/value/"],
+            )
+        hidden_critic = hidden_streams[0]
+        return hidden_critic
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Any, Mapping
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
+from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
+from mlagents.trainers.common.tf_optimizer import TFOptimizer
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.buffer import AgentBuffer
+from mlagents_envs.timers import timed
+
+EPSILON = 1e-6  # Small value to avoid divide by zero
+
+LOGGER = logging.getLogger("mlagents.trainers")
+
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACOptimizer(TFOptimizer):
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Takes a Unity environment and model-specific hyper-parameters and returns the
+        appropriate PPO agent model for the environment.
+        :param brain: Brain parameters used to generate specific network graph.
+        :param lr: Learning rate.
+        :param lr_schedule: Learning rate decay schedule.
+        :param h_size: Size of hidden layers
+        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
+            set higher to explore more.
+        :return: a sub-class of PPOAgent tailored to the environment.
+        :param max_step: Total number of training steps.
+        :param normalize: Whether to normalize vector observation input.
+        :param use_recurrent: Whether to use an LSTM layer in the network.
+        :param num_layers: Number of hidden layers between encoded input and policy & value layers
+        :param tau: Strength of soft-Q update.
+        :param m_size: Size of brain memory.
+        """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
+
+        with policy.graph.as_default():
+            with tf.variable_scope(""):
+                super().__init__(policy, trainer_params)
+                lr = float(trainer_params["learning_rate"])
+                lr_schedule = LearningRateSchedule(
+                    trainer_params.get("learning_rate_schedule", "constant")
+                )
+                self.policy = policy
+                self.act_size = self.policy.act_size
+                h_size = int(trainer_params["hidden_units"])
+                max_step = float(trainer_params["max_steps"])
+                num_layers = int(trainer_params["num_layers"])
+                vis_encode_type = EncoderType(
+                    trainer_params.get("vis_encode_type", "simple")
+                )
+                self.tau = trainer_params.get("tau", 0.005)
+                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
+
+                # Non-exposed SAC parameters
+                self.discrete_target_entropy_scale = (
+                    0.2
+                )  # Roughly equal to e-greedy 0.05
+                self.continuous_target_entropy_scale = 1.0
+
+                self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
+                stream_names = list(self.reward_signals.keys())
+                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
+                self.gammas = [
+                    _val["gamma"] for _val in trainer_params["reward_signals"].values()
+                ]
+                self.use_dones_in_backup = {
+                    name: tf.Variable(1.0) for name in stream_names
+                }
+                self.disable_use_dones = {
+                    name: self.use_dones_in_backup[name].assign(0.0)
+                    for name in stream_names
+                }
+
+                if num_layers < 1:
+                    num_layers = 1
+
+                self.target_init_op: List[tf.Tensor] = []
+                self.target_update_op: List[tf.Tensor] = []
+                self.update_batch_policy: Optional[tf.Operation] = None
+                self.update_batch_value: Optional[tf.Operation] = None
+                self.update_batch_entropy: Optional[tf.Operation] = None
+
+                self.policy_network = SACPolicyNetwork(
+                    policy=self.policy,
+                    m_size=self.policy.m_size,  # 3x policy.m_size
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                self.target_network = SACTargetNetwork(
+                    policy=self.policy,
+                    m_size=self.policy.m_size,  # 1x policy.m_size
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
+                self.m_size = 3 * self.policy.m_size
+                self._create_inputs_and_outputs()
+                self.learning_rate = ModelUtils.create_learning_rate(
+                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                )
+                self._create_losses(
+                    self.policy_network.q1_heads,
+                    self.policy_network.q2_heads,
+                    lr,
+                    int(max_step),
+                    stream_names,
+                    discrete=not self.policy.use_continuous_act,
+                )
+                self._create_sac_optimizer_ops()
+
+                self.selected_actions = (
+                    self.policy.selected_actions
+                )  # For GAIL and other reward signals
+                if self.policy.normalize:
+                    target_update_norm = self.target_network.copy_normalization(
+                        self.policy.running_mean,
+                        self.policy.running_variance,
+                        self.policy.normalization_steps,
+                    )
+                    # Update the normalization of the optimizer when the policy does.
+                    self.policy.update_normalization_op = tf.group(
+                        [self.policy.update_normalization_op, target_update_norm]
+                    )
+
+                self.policy.initialize_or_load()
+
+        self.stats_name_to_update_name = {
+            "Losses/Value Loss": "value_loss",
+            "Losses/Policy Loss": "policy_loss",
+            "Losses/Q1 Loss": "q1_loss",
+            "Losses/Q2 Loss": "q2_loss",
+            "Policy/Entropy Coeff": "entropy_coef",
+            "Policy/Learning Rate": "learning_rate",
+        }
+
+        self.update_dict = {
+            "value_loss": self.total_value_loss,
+            "policy_loss": self.policy_loss,
+            "q1_loss": self.q1_loss,
+            "q2_loss": self.q2_loss,
+            "entropy_coef": self.ent_coef,
+            "entropy": self.policy.entropy,
+            "update_batch": self.update_batch_policy,
+            "update_value": self.update_batch_value,
+            "update_entropy": self.update_batch_entropy,
+            "learning_rate": self.learning_rate,
+        }
+
+    def _create_inputs_and_outputs(self) -> None:
+        """
+        Assign the higher-level SACModel's inputs and outputs to those of its policy or
+        target network.
+        """
+        self.vector_in = self.policy.vector_in
+        self.visual_in = self.policy.visual_in
+        self.next_vector_in = self.target_network.vector_in
+        self.next_visual_in = self.target_network.visual_in
+        self.action_holder = self.policy.action_holder
+        self.sequence_length_ph = self.policy.sequence_length_ph
+        self.next_sequence_length_ph = self.target_network.sequence_length_ph
+        if not self.policy.use_continuous_act:
+            self.action_masks = self.policy_network.action_masks
+        else:
+            self.output_pre = self.policy_network.output_pre
+
+        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
+        self.value = tf.identity(
+            self.policy_network.value, name="value_estimate_unused"
+        )
+        self.value_heads = self.policy_network.value_heads
+        self.dones_holder = tf.placeholder(
+            shape=[None], dtype=tf.float32, name="dones_holder"
+        )
+
+        if self.policy.use_recurrent:
+            self.memory_in = self.policy_network.memory_in
+            self.memory_out = self.policy_network.memory_out
+            if not self.policy.use_continuous_act:
+                self.prev_action = self.policy_network.prev_action
+            self.next_memory_in = self.target_network.memory_in
+
+    def _create_losses(
+        self,
+        q1_streams: Dict[str, tf.Tensor],
+        q2_streams: Dict[str, tf.Tensor],
+        lr: tf.Tensor,
+        max_step: int,
+        stream_names: List[str],
+        discrete: bool = False,
+    ) -> None:
+        """
+        Creates training-specific Tensorflow ops for SAC models.
+        :param q1_streams: Q1 streams from policy network
+        :param q1_streams: Q2 streams from policy network
+        :param lr: Learning rate
+        :param max_step: Total number of training steps.
+        :param stream_names: List of reward stream names.
+        :param discrete: Whether or not to use discrete action losses.
+        """
+
+        if discrete:
+            self.target_entropy = [
+                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
+                for i in self.act_size
+            ]
+            discrete_action_probs = tf.exp(self.policy.all_log_probs)
+            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
+        else:
+            self.target_entropy = (
+                -1
+                * self.continuous_target_entropy_scale
+                * np.prod(self.act_size[0]).astype(np.float32)
+            )
+
+        self.rewards_holders = {}
+        self.min_policy_qs = {}
+
+        for name in stream_names:
+            if discrete:
+                _branched_mpq1 = self._apply_as_branches(
+                    self.policy_network.q1_pheads[name] * discrete_action_probs
+                )
+                branched_mpq1 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keep_dims=True)
+                        for _br in _branched_mpq1
+                    ]
+                )
+                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
+
+                _branched_mpq2 = self._apply_as_branches(
+                    self.policy_network.q2_pheads[name] * discrete_action_probs
+                )
+                branched_mpq2 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keep_dims=True)
+                        for _br in _branched_mpq2
+                    ]
+                )
+                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
+
+                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
+            else:
+                self.min_policy_qs[name] = tf.minimum(
+                    self.policy_network.q1_pheads[name],
+                    self.policy_network.q2_pheads[name],
+                )
+
+            rewards_holder = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
+            )
+            self.rewards_holders[name] = rewards_holder
+
+        q1_losses = []
+        q2_losses = []
+        # Multiple q losses per stream
+        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+        for i, name in enumerate(stream_names):
+            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+
+            q_backup = tf.stop_gradient(
+                _expanded_rewards
+                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+                * self.gammas[i]
+                * self.target_network.value_heads[name]
+            )
+
+            if discrete:
+                # We need to break up the Q functions by branch, and update them individually.
+                branched_q1_stream = self._apply_as_branches(
+                    self.policy.action_oh * q1_streams[name]
+                )
+                branched_q2_stream = self._apply_as_branches(
+                    self.policy.action_oh * q2_streams[name]
+                )
+
+                # Reduce each branch into scalar
+                branched_q1_stream = [
+                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
+                    for _branch in branched_q1_stream
+                ]
+                branched_q2_stream = [
+                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
+                    for _branch in branched_q2_stream
+                ]
+
+                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
+                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
+
+            else:
+                q1_stream = q1_streams[name]
+                q2_stream = q2_streams[name]
+
+            _q1_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q1_stream)
+            )
+
+            _q2_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q2_stream)
+            )
+
+            q1_losses.append(_q1_loss)
+            q2_losses.append(_q2_loss)
+
+        self.q1_loss = tf.reduce_mean(q1_losses)
+        self.q2_loss = tf.reduce_mean(q2_losses)
+
+        # Learn entropy coefficient
+        if discrete:
+            # Create a log_ent_coef for each branch
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
+                    np.float32
+                ),
+                trainable=True,
+            )
+        else:
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log(self.init_entcoef).astype(np.float32),
+                trainable=True,
+            )
+
+        self.ent_coef = tf.exp(self.log_ent_coef)
+        if discrete:
+            # We also have to do a different entropy and target_entropy per branch.
+            branched_per_action_ent = self._apply_as_branches(per_action_entropy)
+            branched_ent_sums = tf.stack(
+                [
+                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
+                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
+                ],
+                axis=1,
+            )
+            self.entropy_loss = -tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.reduce_mean(
+                    self.log_ent_coef
+                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+                    axis=1,
+                )
+            )
+
+            # Same with policy loss, we have to do the loss per branch and average them,
+            # so that larger branches don't get more weight.
+            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
+            branched_q_term = self._apply_as_branches(
+                discrete_action_probs * self.policy_network.q1_p
+            )
+
+            branched_policy_loss = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
+                    for i, (_lp, _qt) in enumerate(
+                        zip(branched_per_action_ent, branched_q_term)
+                    )
+                ]
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
+            )
+
+            # Do vbackup entropy bonus per branch as well.
+            branched_ent_bonus = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
+                    for i, _lp in enumerate(branched_per_action_ent)
+                ]
+            )
+            value_losses = []
+            for name in stream_names:
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_mean(branched_ent_bonus, axis=0)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+
+        else:
+            self.entropy_loss = -tf.reduce_mean(
+                self.log_ent_coef
+                * tf.to_float(self.policy.mask)
+                * tf.stop_gradient(
+                    tf.reduce_sum(
+                        self.policy.all_log_probs + self.target_entropy,
+                        axis=1,
+                        keep_dims=True,
+                    )
+                )
+            )
+            batch_policy_loss = tf.reduce_mean(
+                self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
+                axis=1,
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * batch_policy_loss
+            )
+
+            value_losses = []
+            for name in stream_names:
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+        self.value_loss = tf.reduce_mean(value_losses)
+
+        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
+
+        self.entropy = self.policy_network.entropy
+
+    def _apply_as_branches(self, concat_logits: tf.Tensor) -> List[tf.Tensor]:
+        """
+        Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per
+        action branch
+        """
+        action_idx = [0] + list(np.cumsum(self.act_size))
+        branches_logits = [
+            concat_logits[:, action_idx[i] : action_idx[i + 1]]
+            for i in range(len(self.act_size))
+        ]
+        return branches_logits
+
+    def _create_sac_optimizer_ops(self) -> None:
+        """
+        Creates the Adam optimizers and update ops for SAC, including
+        the policy, value, and entropy updates, as well as the target network update.
+        """
+        policy_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_policy_opt"
+        )
+        entropy_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_entropy_opt"
+        )
+        value_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_value_opt"
+        )
+
+        self.target_update_op = [
+            tf.assign(target, (1 - self.tau) * target + self.tau * source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+        LOGGER.debug("value_vars")
+        self.print_all_vars(self.policy_network.value_vars)
+        LOGGER.debug("targvalue_vars")
+        self.print_all_vars(self.target_network.value_vars)
+        LOGGER.debug("critic_vars")
+        self.print_all_vars(self.policy_network.critic_vars)
+        LOGGER.debug("q_vars")
+        self.print_all_vars(self.policy_network.q_vars)
+        LOGGER.debug("policy_vars")
+        policy_vars = self.policy.get_trainable_variables()
+        self.print_all_vars(policy_vars)
+
+        self.target_init_op = [
+            tf.assign(target, source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+
+        self.update_batch_policy = policy_optimizer.minimize(
+            self.policy_loss, var_list=policy_vars
+        )
+
+        # Make sure policy is updated first, then value, then entropy.
+        with tf.control_dependencies([self.update_batch_policy]):
+            self.update_batch_value = value_optimizer.minimize(
+                self.total_value_loss, var_list=self.policy_network.critic_vars
+            )
+            # Add entropy coefficient optimization operation
+            with tf.control_dependencies([self.update_batch_value]):
+                self.update_batch_entropy = entropy_optimizer.minimize(
+                    self.entropy_loss, var_list=self.log_ent_coef
+                )
+
+    def print_all_vars(self, variables):
+        for _var in variables:
+            LOGGER.debug(_var)
+
+    @timed
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Updates model using buffer.
+        :param num_sequences: Number of trajectories in batch.
+        :param batch: Experience mini-batch.
+        :param update_target: Whether or not to update target value network
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        :return: Output from update process.
+        """
+        feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
+        stats_needed = self.stats_name_to_update_name
+        update_stats: Dict[str, float] = {}
+        update_vals = self._execute_model(feed_dict, self.update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        # Update target network. By default, target update happens at every policy update.
+        self.sess.run(self.target_update_op)
+        return update_stats
+
+    def update_reward_signals(
+        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
+    ) -> Dict[str, float]:
+        """
+        Only update the reward signals.
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        """
+        # Collect feed dicts for all reward signals.
+        feed_dict: Dict[tf.Tensor, Any] = {}
+        update_dict: Dict[str, tf.Tensor] = {}
+        update_stats: Dict[str, float] = {}
+        stats_needed: Dict[str, str] = {}
+        if reward_signal_minibatches:
+            self.add_reward_signal_dicts(
+                feed_dict,
+                update_dict,
+                stats_needed,
+                reward_signal_minibatches,
+                num_sequences,
+            )
+        update_vals = self._execute_model(feed_dict, update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        return update_stats
+
+    def add_reward_signal_dicts(
+        self,
+        feed_dict: Dict[tf.Tensor, Any],
+        update_dict: Dict[str, tf.Tensor],
+        stats_needed: Dict[str, str],
+        reward_signal_minibatches: Mapping[str, Dict],
+        num_sequences: int,
+    ) -> None:
+        """
+        Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
+        :param feed_dict: Feed dict needed update
+        :param update_dit: Update dict that needs update
+        :param stats_needed: Stats needed to get from the update.
+        :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
+            indexed by name.
+        """
+        for name, r_batch in reward_signal_minibatches.items():
+            feed_dict.update(
+                self.reward_signals[name].prepare_update(
+                    self.policy, r_batch, num_sequences
+                )
+            )
+            update_dict.update(self.reward_signals[name].update_dict)
+            stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
+
+    def _construct_feed_dict(
+        self, policy: TFPolicy, batch: AgentBuffer, num_sequences: int
+    ) -> Dict[tf.Tensor, Any]:
+        """
+        Builds the feed dict for updating the SAC model.
+        :param model: The model to update. May be different when, e.g. using multi-GPU.
+        :param batch: Mini-batch to use to update.
+        :param num_sequences: Number of LSTM sequences in batch.
+        """
+        # Do an optional burn-in for memories
+        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
+        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask = np.tile(burn_in_mask, num_sequences)
+        feed_dict = {
+            policy.batch_size_ph: num_sequences,
+            policy.sequence_length_ph: self.policy.sequence_length,
+            self.next_sequence_length_ph: self.policy.sequence_length,
+            self.policy.mask_input: batch["masks"] * burn_in_mask,
+        }
+        for name in self.reward_signals:
+            feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]
+
+        if self.policy.use_continuous_act:
+            feed_dict[policy.action_holder] = batch["actions"]
+        else:
+            feed_dict[policy.action_holder] = batch["actions"]
+            if self.policy.use_recurrent:
+                feed_dict[policy.prev_action] = batch["prev_action"]
+            feed_dict[policy.action_masks] = batch["action_mask"]
+        if self.policy.use_vec_obs:
+            feed_dict[policy.vector_in] = batch["vector_obs"]
+            feed_dict[self.next_vector_in] = batch["next_vector_in"]
+        if self.policy.vis_obs_size > 0:
+            for i, _ in enumerate(policy.visual_in):
+                _obs = batch["visual_obs%d" % i]
+                feed_dict[policy.visual_in[i]] = _obs
+            for i, _ in enumerate(self.next_visual_in):
+                _obs = batch["next_visual_obs%d" % i]
+                feed_dict[self.next_visual_in[i]] = _obs
+        if self.policy.use_recurrent:
+            feed_dict[policy.memory_in] = [
+                batch["memory"][i]
+                for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+            ]
+            feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
+                self.m_size, batch.num_experiences
+            )
+            feed_dict[self.target_network.memory_in] = self._make_zero_mem(
+                self.m_size // 3, batch.num_experiences
+            )
+        feed_dict[self.dones_holder] = batch["done"]
+        return feed_dict
--- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py
+++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py
+import pytest
+
+import numpy as np
+from mlagents.tf_utils import tf
+
+import yaml
+
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.models import EncoderType, ModelUtils
+from mlagents.trainers.exception import UnityTrainerException
+from mlagents.trainers.brain import BrainParameters, CameraResolution
+from mlagents.trainers.tests import mock_brain as mb
+from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
+
+
+@pytest.fixture
+def dummy_config():
+    return yaml.safe_load(
+        """
+        trainer: ppo
+        batch_size: 32
+        beta: 5.0e-3
+        buffer_size: 512
+        epsilon: 0.2
+        hidden_units: 128
+        lambd: 0.95
+        learning_rate: 3.0e-4
+        max_steps: 5.0e4
+        normalize: true
+        num_epoch: 5
+        num_layers: 2
+        time_horizon: 64
+        sequence_length: 64
+        summary_freq: 1000
+        use_recurrent: false
+        normalize: true
+        memory_size: 8
+        curiosity_strength: 0.0
+        curiosity_enc_size: 1
+        summary_path: test
+        model_path: test
+        reward_signals:
+          extrinsic:
+            strength: 1.0
+            gamma: 0.99
+        """
+    )
+
+
+VECTOR_ACTION_SPACE = [2]
+VECTOR_OBS_SPACE = 8
+DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
+BUFFER_INIT_SAMPLES = 32
+NUM_AGENTS = 12
+
+
+def create_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
+    mock_brain = mb.setup_mock_brain(
+        use_discrete,
+        use_visual,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+        discrete_action_space=DISCRETE_ACTION_SPACE,
+    )
+
+    trainer_parameters = dummy_config
+    model_path = "testmodel"
+    trainer_parameters["model_path"] = model_path
+    trainer_parameters["keep_checkpoints"] = 3
+    trainer_parameters["use_recurrent"] = use_rnn
+    policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
+    return policy
+
+
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+def test_policy_evaluate(dummy_config, rnn, visual, discrete):
+    # Test evaluate
+    tf.reset_default_graph()
+    policy = create_policy_mock(
+        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
+    )
+    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
+
+    run_out = policy.evaluate(step, list(step.agent_id))
+    if discrete:
+        run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
+    else:
+        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
+
+
+def test_normalization(dummy_config):
+    brain_params = BrainParameters(
+        brain_name="test_brain",
+        vector_observation_space_size=1,
+        camera_resolutions=[],
+        vector_action_space_size=[2],
+        vector_action_descriptions=[],
+        vector_action_space_type=0,
+    )
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+
+    time_horizon = 6
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        vec_obs_size=1,
+        num_vis_obs=0,
+        action_space=[2],
+    )
+    # Change half of the obs to 0
+    for i in range(3):
+        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
+    policy = policy = NNPolicy(0, brain_params, dummy_config, False, False)
+
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])
+
+    # Check that the running mean and variance is correct
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+
+    assert steps == 6
+    assert mean[0] == 0.5
+    # Note: variance is divided by number of steps, and initialized to 1 to avoid
+    # divide by 0. The right answer is 0.25
+    assert (variance[0] - 1) / steps == 0.25
+
+    # Make another update, this time with all 1's
+    time_horizon = 10
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        vec_obs_size=1,
+        num_vis_obs=0,
+        action_space=[2],
+    )
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])
+
+    # Check that the running mean and variance is correct
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+
+    assert steps == 16
+    assert mean[0] == 0.8125
+    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
+
+
+def test_min_visual_size():
+    # Make sure each EncoderType has an entry in MIS_RESOLUTION_FOR_ENCODER
+    assert set(ModelUtils.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)
+
+    for encoder_type in EncoderType:
+        with tf.Graph().as_default():
+            good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
+            good_res = CameraResolution(
+                width=good_size, height=good_size, num_channels=3
+            )
+            vis_input = ModelUtils.create_visual_input(good_res, "test_min_visual_size")
+            ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
+            enc_func = ModelUtils.get_encoder_for_type(encoder_type)
+            enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
+
+        # Anything under the min size should raise an exception. If not, decrease the min size!
+        with pytest.raises(Exception):
+            with tf.Graph().as_default():
+                bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
+                bad_res = CameraResolution(
+                    width=bad_size, height=bad_size, num_channels=3
+                )
+                vis_input = ModelUtils.create_visual_input(
+                    bad_res, "test_min_visual_size"
+                )
+
+                with pytest.raises(UnityTrainerException):
+                    # Make sure we'd hit a friendly error during model setup time.
+                    ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
+
+                enc_func = ModelUtils.get_encoder_for_type(encoder_type)
+                enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
+
+
+if __name__ == "__main__":
+    pytest.main()
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs
+using System.IO.Abstractions;
+using System.Text.RegularExpressions;
+using UnityEngine;
+using System.IO;
+
+namespace MLAgents
+{
+    /// <summary>
+    /// Demonstration Recorder Component.
+    /// </summary>
+    [RequireComponent(typeof(Agent))]
+    [AddComponentMenu("ML Agents/Demonstration Recorder", (int)MenuGroup.Default)]
+    public class DemonstrationRecorder : MonoBehaviour
+    {
+        [Tooltip("Whether or not to record demonstrations.")]
+        public bool record;
+
+        [Tooltip("Base demonstration file name. Will have numbers appended to make unique.")]
+        public string demonstrationName;
+
+        [Tooltip("Base directory to write the demo files. If null, will use {Application.dataPath}/Demonstrations.")]
+        public string demonstrationDirectory;
+
+        DemonstrationWriter m_DemoWriter;
+        internal const int MaxNameLength = 16;
+
+        const string k_ExtensionType = ".demo";
+        IFileSystem m_FileSystem;
+
+        Agent m_Agent;
+
+        void OnEnable()
+        {
+            m_Agent = GetComponent<Agent>();
+        }
+
+        void Update()
+        {
+            if (record)
+            {
+                LazyInitialize();
+            }
+        }
+
+        /// <summary>
+        /// Creates demonstration store for use in recording.
+        /// Has no effect if the demonstration store was already created.
+        /// </summary>
+        internal DemonstrationWriter LazyInitialize(IFileSystem fileSystem = null)
+        {
+            if (m_DemoWriter != null)
+            {
+                return m_DemoWriter;
+            }
+
+            if (m_Agent == null)
+            {
+                m_Agent = GetComponent<Agent>();
+            }
+
+            m_FileSystem = fileSystem ?? new FileSystem();
+            var behaviorParams = GetComponent<BehaviorParameters>();
+            if (string.IsNullOrEmpty(demonstrationName))
+            {
+                demonstrationName = behaviorParams.behaviorName;
+            }
+            if (string.IsNullOrEmpty(demonstrationDirectory))
+            {
+                demonstrationDirectory = Path.Combine(Application.dataPath, "Demonstrations");
+            }
+
+            demonstrationName = SanitizeName(demonstrationName, MaxNameLength);
+            var filePath = MakeDemonstrationFilePath(m_FileSystem, demonstrationDirectory, demonstrationName);
+            var stream = m_FileSystem.File.Create(filePath);
+            m_DemoWriter = new DemonstrationWriter(stream);
+
+            m_DemoWriter.Initialize(
+                demonstrationName,
+                behaviorParams.brainParameters,
+                behaviorParams.fullyQualifiedBehaviorName
+            );
+
+            AddDemonstrationWriterToAgent(m_DemoWriter);
+
+            return m_DemoWriter;
+        }
+
+        /// <summary>
+        /// Removes all characters except alphanumerics from demonstration name.
+        /// Shorten name if it is longer than the maxNameLength.
+        /// </summary>
+        internal static string SanitizeName(string demoName, int maxNameLength)
+        {
+            var rgx = new Regex("[^a-zA-Z0-9 -]");
+            demoName = rgx.Replace(demoName, "");
+            // If the string is too long, it will overflow the metadata.
+            if (demoName.Length > maxNameLength)
+            {
+                demoName = demoName.Substring(0, maxNameLength);
+            }
+            return demoName;
+        }
+
+        /// <summary>
+        /// Gets a unique path for the demonstrationName in the demonstrationDirectory.
+        /// </summary>
+        /// <param name="fileSystem"></param>
+        /// <param name="demonstrationDirectory"></param>
+        /// <param name="demonstrationName"></param>
+        /// <returns></returns>
+        internal static string MakeDemonstrationFilePath(
+            IFileSystem fileSystem, string demonstrationDirectory, string demonstrationName
+        )
+        {
+            // Create the directory if it doesn't already exist
+            if (!fileSystem.Directory.Exists(demonstrationDirectory))
+            {
+                fileSystem.Directory.CreateDirectory(demonstrationDirectory);
+            }
+
+            var literalName = demonstrationName;
+            var filePath = Path.Combine(demonstrationDirectory, literalName + k_ExtensionType);
+            var uniqueNameCounter = 0;
+            while (fileSystem.File.Exists(filePath))
+            {
+                // TODO should we use a timestamp instead of a counter here? This loops an increasing number of times
+                // as the number of demos increases.
+                literalName = demonstrationName + "_" + uniqueNameCounter;
+                filePath = Path.Combine(demonstrationDirectory, literalName + k_ExtensionType);
+                uniqueNameCounter++;
+            }
+
+            return filePath;
+        }
+
+        /// <summary>
+        /// Close the DemonstrationWriter and remove it from the Agent.
+        /// Has no effect if the DemonstrationWriter is already closed (or wasn't opened)
+        /// </summary>
+        public void Close()
+        {
+            if (m_DemoWriter != null)
+            {
+                RemoveDemonstrationWriterFromAgent(m_DemoWriter);
+
+                m_DemoWriter.Close();
+                m_DemoWriter = null;
+            }
+        }
+
+        /// <summary>
+        /// Clean up the DemonstrationWriter when shutting down or destroying the Agent.
+        /// </summary>
+        void OnDestroy()
+        {
+            Close();
+        }
+
+        /// <summary>
+        /// Add additional DemonstrationWriter to the Agent. It is still up to the user to Close this
+        /// DemonstrationWriters when recording is done.
+        /// </summary>
+        /// <param name="demoWriter"></param>
+        public void AddDemonstrationWriterToAgent(DemonstrationWriter demoWriter)
+        {
+            m_Agent.DemonstrationWriters.Add(demoWriter);
+        }
+
+        /// <summary>
+        /// Remove additional DemonstrationWriter to the Agent. It is still up to the user to Close this
+        /// DemonstrationWriters when recording is done.
+        /// </summary>
+        /// <param name="demoWriter"></param>
+        public void RemoveDemonstrationWriterFromAgent(DemonstrationWriter demoWriter)
+        {
+            m_Agent.DemonstrationWriters.Remove(demoWriter);
+        }
+    }
+}
--- a/ml-agents/mlagents/trainers/common/init.py
+++ b/ml-agents/mlagents/trainers/common/init.py
--- a/ml-agents/mlagents/trainers/common/nn_policy.py
+++ b/ml-agents/mlagents/trainers/common/nn_policy.py
+import logging
+import numpy as np
+from typing import Any, Dict, Optional, List
+
+from mlagents.tf_utils import tf
+
+from mlagents_envs.timers import timed
+from mlagents_envs.base_env import BatchedStepResult
+from mlagents.trainers.brain import BrainParameters
+from mlagents.trainers.models import EncoderType
+from mlagents.trainers.models import ModelUtils
+from mlagents.trainers.tf_policy import TFPolicy
+
+logger = logging.getLogger("mlagents.trainers")
+
+EPSILON = 1e-6  # Small value to avoid divide by zero
+
+
+class NNPolicy(TFPolicy):
+    def __init__(
+        self,
+        seed: int,
+        brain: BrainParameters,
+        trainer_params: Dict[str, Any],
+        is_training: bool,
+        load: bool,
+        tanh_squash: bool = False,
+        reparameterize: bool = False,
+        condition_sigma_on_obs: bool = True,
+        create_tf_graph: bool = True,
+    ):
+        """
+        Policy that uses a multilayer perceptron to map the observations to actions. Could
+        also use a CNN to encode visual input prior to the MLP. Supports discrete and
+        continuous action spaces, as well as recurrent networks.
+        :param seed: Random seed.
+        :param brain: Assigned BrainParameters object.
+        :param trainer_params: Defined training parameters.
+        :param is_training: Whether the model should be trained.
+        :param load: Whether a pre-trained model will be loaded or a new one created.
+        :param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
+        :param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
+        """
+        super().__init__(seed, brain, trainer_params, load)
+        self.grads = None
+        self.update_batch: Optional[tf.Operation] = None
+        num_layers = trainer_params["num_layers"]
+        self.h_size = trainer_params["hidden_units"]
+        if num_layers < 1:
+            num_layers = 1
+        self.num_layers = num_layers
+        self.vis_encode_type = EncoderType(
+            trainer_params.get("vis_encode_type", "simple")
+        )
+        self.tanh_squash = tanh_squash
+        self.reparameterize = reparameterize
+        self.condition_sigma_on_obs = condition_sigma_on_obs
+        self.trainable_variables: List[tf.Variable] = []
+
+        # Non-exposed parameters; these aren't exposed because they don't have a
+        # good explanation and usually shouldn't be touched.
+        self.log_std_min = -20
+        self.log_std_max = 2
+        if create_tf_graph:
+            self.create_tf_graph()
+
+    def get_trainable_variables(self) -> List[tf.Variable]:
+        """
+        Returns a List of the trainable variables in this policy. if create_tf_graph hasn't been called,
+        returns empty list.
+        """
+        return self.trainable_variables
+
+    def create_tf_graph(self) -> None:
+        """
+        Builds the tensorflow graph needed for this policy.
+        """
+        with self.graph.as_default():
+            tf.set_random_seed(self.seed)
+            _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
+            if len(_vars) > 0:
+                # We assume the first thing created in the graph is the Policy. If
+                # already populated, don't create more tensors.
+                return
+
+            self.create_input_placeholders()
+            encoded = self._create_encoder(
+                self.visual_in,
+                self.processed_vector_in,
+                self.h_size,
+                self.num_layers,
+                self.vis_encode_type,
+            )
+            if self.use_continuous_act:
+                self._create_cc_actor(
+                    encoded,
+                    self.tanh_squash,
+                    self.reparameterize,
+                    self.condition_sigma_on_obs,
+                )
+            else:
+                self._create_dc_actor(encoded)
+            self.trainable_variables = tf.get_collection(
+                tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
+            )
+            self.trainable_variables += tf.get_collection(
+                tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
+            )  # LSTMs need to be root scope for Barracuda export
+
+        self.inference_dict: Dict[str, tf.Tensor] = {
+            "action": self.output,
+            "log_probs": self.all_log_probs,
+            "entropy": self.entropy,
+        }
+        if self.use_continuous_act:
+            self.inference_dict["pre_action"] = self.output_pre
+        if self.use_recurrent:
+            self.inference_dict["memory_out"] = self.memory_out
+
+        # We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
+        # it will re-load the full graph
+        self._initialize_graph()
+
+    @timed
+    def evaluate(
+        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
+    ) -> Dict[str, Any]:
+        """
+        Evaluates policy for the agent experiences provided.
+        :param batched_step_result: BatchedStepResult object containing inputs.
+        :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
+        :return: Outputs from network as defined by self.inference_dict.
+        """
+        feed_dict = {
+            self.batch_size_ph: batched_step_result.n_agents(),
+            self.sequence_length_ph: 1,
+        }
+        if self.use_recurrent:
+            if not self.use_continuous_act:
+                feed_dict[self.prev_action] = self.retrieve_previous_action(
+                    global_agent_ids
+                )
+            feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
+        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
+        run_out = self._execute_model(feed_dict, self.inference_dict)
+        return run_out
+
+    def _create_encoder(
+        self,
+        visual_in: List[tf.Tensor],
+        vector_in: tf.Tensor,
+        h_size: int,
+        num_layers: int,
+        vis_encode_type: EncoderType,
+    ) -> tf.Tensor:
+        """
+        Creates an encoder for visual and vector observations.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: Type of visual encoder to use if visual input.
+        :return: The hidden layer (tf.Tensor) after the encoder.
+        """
+        with tf.variable_scope("policy"):
+            encoded = ModelUtils.create_observation_streams(
+                self.visual_in,
+                self.processed_vector_in,
+                1,
+                h_size,
+                num_layers,
+                vis_encode_type,
+            )[0]
+        return encoded
+
+    def _create_cc_actor(
+        self,
+        encoded: tf.Tensor,
+        tanh_squash: bool = False,
+        reparameterize: bool = False,
+        condition_sigma_on_obs: bool = True,
+    ) -> None:
+        """
+        Creates Continuous control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: Type of visual encoder to use if visual input.
+        :param tanh_squash: Whether to use a tanh function, or a clipped output.
+        :param reparameterize: Whether we are using the resampling trick to update the policy.
+        """
+        if self.use_recurrent:
+            self.memory_in = tf.placeholder(
+                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
+            )
+            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
+                encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
+            )
+
+            self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
+        else:
+            hidden_policy = encoded
+
+        with tf.variable_scope("policy"):
+            mu = tf.layers.dense(
+                hidden_policy,
+                self.act_size[0],
+                activation=None,
+                name="mu",
+                kernel_initializer=ModelUtils.scaled_init(0.01),
+                reuse=tf.AUTO_REUSE,
+            )
+
+            # Policy-dependent log_sigma
+            if condition_sigma_on_obs:
+                log_sigma = tf.layers.dense(
+                    hidden_policy,
+                    self.act_size[0],
+                    activation=None,
+                    name="log_sigma",
+                    kernel_initializer=ModelUtils.scaled_init(0.01),
+                )
+            else:
+                log_sigma = tf.get_variable(
+                    "log_sigma",
+                    [self.act_size[0]],
+                    dtype=tf.float32,
+                    initializer=tf.zeros_initializer(),
+                )
+            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min, self.log_std_max)
+
+            sigma = tf.exp(log_sigma)
+
+            epsilon = tf.random_normal(tf.shape(mu))
+
+            sampled_policy = mu + sigma * epsilon
+
+            # Stop gradient if we're not doing the resampling trick
+            if not reparameterize:
+                sampled_policy_probs = tf.stop_gradient(sampled_policy)
+            else:
+                sampled_policy_probs = sampled_policy
+
+            # Compute probability of model output.
+            _gauss_pre = -0.5 * (
+                ((sampled_policy_probs - mu) / (sigma + EPSILON)) ** 2
+                + 2 * log_sigma
+                + np.log(2 * np.pi)
+            )
+            all_probs = _gauss_pre
+            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)
+
+        if tanh_squash:
+            self.output_pre = tf.tanh(sampled_policy)
+
+            # Squash correction
+            all_probs -= tf.reduce_sum(
+                tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
+            )
+            self.output = tf.identity(self.output_pre, name="action")
+        else:
+            self.output_pre = sampled_policy
+            # Clip and scale output to ensure actions are always within [-1, 1] range.
+            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
+            self.output = tf.identity(output_post, name="action")
+
+        self.selected_actions = tf.stop_gradient(self.output)
+
+        self.all_log_probs = tf.identity(all_probs, name="action_probs")
+
+        single_dim_entropy = 0.5 * tf.reduce_mean(
+            tf.log(2 * np.pi * np.e) + 2 * log_sigma
+        )
+        # Make entropy the right shape
+        self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy
+
+        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
+        self.log_probs = tf.reduce_sum(
+            (tf.identity(self.all_log_probs)), axis=1, keepdims=True
+        )
+
+        self.action_holder = tf.placeholder(
+            shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
+        )
+
+    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
+        """
+        Creates Discrete control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: Type of visual encoder to use if visual input.
+        """
+        if self.use_recurrent:
+            self.prev_action = tf.placeholder(
+                shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
+            )
+            prev_action_oh = tf.concat(
+                [
+                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
+                    for i in range(len(self.act_size))
+                ],
+                axis=1,
+            )
+            hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
+
+            self.memory_in = tf.placeholder(
+                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
+            )
+            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
+                hidden_policy,
+                self.memory_in,
+                self.sequence_length_ph,
+                name="lstm_policy",
+            )
+
+            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
+        else:
+            hidden_policy = encoded
+
+        policy_branches = []
+        with tf.variable_scope("policy"):
+            for size in self.act_size:
+                policy_branches.append(
+                    tf.layers.dense(
+                        hidden_policy,
+                        size,
+                        activation=None,
+                        use_bias=False,
+                        kernel_initializer=ModelUtils.scaled_init(0.01),
+                    )
+                )
+
+        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
+
+        self.action_masks = tf.placeholder(
+            shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
+        )
+        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
+            raw_log_probs, self.action_masks, self.act_size
+        )
+
+        self.output = tf.identity(output)
+        self.all_log_probs = tf.identity(normalized_logits, name="action")
+
+        self.action_holder = tf.placeholder(
+            shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
+        )
+        self.action_oh = tf.concat(
+            [
+                tf.one_hot(self.action_holder[:, i], self.act_size[i])
+                for i in range(len(self.act_size))
+            ],
+            axis=1,
+        )
+        self.selected_actions = tf.stop_gradient(self.action_oh)
+
+        action_idx = [0] + list(np.cumsum(self.act_size))
+
+        self.entropy = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=tf.nn.softmax(
+                                self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
+                            ),
+                            logits=self.all_log_probs[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+        )
+
+        self.log_probs = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        -tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
+                            logits=normalized_logits[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+            keepdims=True,
+        )
--- a/ml-agents/mlagents/trainers/common/optimizer.py
+++ b/ml-agents/mlagents/trainers/common/optimizer.py
+import abc
+from typing import Dict
+
+from mlagents.trainers.buffer import AgentBuffer
+
+
+class Optimizer(abc.ABC):
+    """
+    Creates loss functions and auxillary networks (e.g. Q or Value) needed for training.
+    Provides methods to update the Policy.
+    """
+
+    @abc.abstractmethod
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Update the Policy based on the batch that was passed in.
+        :param batch: AgentBuffer that contains the minibatch of data used for this update.
+        :param num_sequences: Number of recurrent sequences found in the minibatch.
+        :return: A Dict containing statistics (name, value) from the update (e.g. loss)
+        """
+        pass
--- a/ml-agents/mlagents/trainers/common/tf_optimizer.py
+++ b/ml-agents/mlagents/trainers/common/tf_optimizer.py
+from typing import Dict, Any, List, Tuple, Optional
+import numpy as np
+
+from mlagents.tf_utils.tf import tf
+from mlagents.trainers.buffer import AgentBuffer
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.common.optimizer import Optimizer
+from mlagents.trainers.trajectory import SplitObservations
+from mlagents.trainers.components.reward_signals.reward_signal_factory import (
+    create_reward_signal,
+)
+from mlagents.trainers.components.bc.module import BCModule
+
+
+class TFOptimizer(Optimizer):  # pylint: disable=W0223
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        self.sess = policy.sess
+        self.policy = policy
+        self.update_dict: Dict[str, tf.Tensor] = {}
+        self.value_heads: Dict[str, tf.Tensor] = {}
+        self.create_reward_signals(trainer_params["reward_signals"])
+        self.memory_in: tf.Tensor = None
+        self.memory_out: tf.Tensor = None
+        self.m_size: int = 0
+        self.bc_module: Optional[BCModule] = None
+        # Create pretrainer if needed
+        if "behavioral_cloning" in trainer_params:
+            BCModule.check_config(trainer_params["behavioral_cloning"])
+            self.bc_module = BCModule(
+                self.policy,
+                policy_learning_rate=trainer_params["learning_rate"],
+                default_batch_size=trainer_params["batch_size"],
+                default_num_epoch=3,
+                **trainer_params["behavioral_cloning"],
+            )
+
+    def get_trajectory_value_estimates(
+        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.batch_size_ph: batch.num_experiences,
+            self.policy.sequence_length_ph: batch.num_experiences,  # We want to feed data in batch-wise, not time-wise.
+        }
+
+        if self.policy.vec_obs_size > 0:
+            feed_dict[self.policy.vector_in] = batch["vector_obs"]
+        if self.policy.vis_obs_size > 0:
+            for i in range(len(self.policy.visual_in)):
+                _obs = batch["visual_obs%d" % i]
+                feed_dict[self.policy.visual_in[i]] = _obs
+        if self.policy.use_recurrent:
+            feed_dict[self.policy.memory_in] = [np.zeros((self.policy.m_size))]
+            feed_dict[self.memory_in] = [np.zeros((self.m_size))]
+        if self.policy.prev_action is not None:
+            feed_dict[self.policy.prev_action] = batch["prev_action"]
+
+        if self.policy.use_recurrent:
+            value_estimates, policy_mem, value_mem = self.sess.run(
+                [self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
+            )
+            prev_action = batch["actions"][-1]
+        else:
+            value_estimates = self.sess.run(self.value_heads, feed_dict)
+            prev_action = None
+            policy_mem = None
+            value_mem = None
+        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
+
+        # We do this in a separate step to feed the memory outs - a further optimization would
+        # be to append to the obs before running sess.run.
+        final_value_estimates = self._get_value_estimates(
+            next_obs, done, policy_mem, value_mem, prev_action
+        )
+
+        return value_estimates, final_value_estimates
+
+    def _get_value_estimates(
+        self,
+        next_obs: List[np.ndarray],
+        done: bool,
+        policy_memory: np.ndarray = None,
+        value_memory: np.ndarray = None,
+        prev_action: np.ndarray = None,
+    ) -> Dict[str, float]:
+        """
+        Generates value estimates for bootstrapping.
+        :param experience: AgentExperience to be used for bootstrapping.
+        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
+        :return: The value estimate dictionary with key being the name of the reward signal and the value the
+        corresponding value estimate.
+        """
+
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.batch_size_ph: 1,
+            self.policy.sequence_length_ph: 1,
+        }
+        vec_vis_obs = SplitObservations.from_observations(next_obs)
+        for i in range(len(vec_vis_obs.visual_observations)):
+            feed_dict[self.policy.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
+
+        if self.policy.vec_obs_size > 0:
+            feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
+        if policy_memory is not None:
+            feed_dict[self.policy.memory_in] = policy_memory
+        if value_memory is not None:
+            feed_dict[self.memory_in] = value_memory
+        if prev_action is not None:
+            feed_dict[self.policy.prev_action] = [prev_action]
+        value_estimates = self.sess.run(self.value_heads, feed_dict)
+
+        value_estimates = {k: float(v) for k, v in value_estimates.items()}
+
+        # If we're done, reassign all of the value estimates that need terminal states.
+        if done:
+            for k in value_estimates:
+                if self.reward_signals[k].use_terminal_states:
+                    value_estimates[k] = 0.0
+
+        return value_estimates
+
+    def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
+        """
+        Create reward signals
+        :param reward_signal_configs: Reward signal config.
+        """
+        self.reward_signals = {}
+        # Create reward signals
+        for reward_signal, config in reward_signal_configs.items():
+            self.reward_signals[reward_signal] = create_reward_signal(
+                self.policy, reward_signal, config
+            )
+            self.update_dict.update(self.reward_signals[reward_signal].update_dict)
+
+    def create_optimizer_op(
+        self, learning_rate: tf.Tensor, name: str = "Adam"
+    ) -> tf.train.Optimizer:
+        return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
+
+    def _execute_model(
+        self, feed_dict: Dict[tf.Tensor, np.ndarray], out_dict: Dict[str, tf.Tensor]
+    ) -> Dict[str, np.ndarray]:
+        """
+        Executes model.
+        :param feed_dict: Input dictionary mapping nodes to input data.
+        :param out_dict: Output dictionary mapping names to nodes.
+        :return: Dictionary mapping names to input data.
+        """
+        network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
+        run_out = dict(zip(list(out_dict.keys()), network_out))
+        return run_out
+
+    def _make_zero_mem(self, m_size: int, length: int) -> List[np.ndarray]:
+        return [
+            np.zeros((m_size), dtype=np.float32)
+            for i in range(0, length, self.policy.sequence_length)
+        ]
--- a/com.unity.ml-agents/Runtime/DemonstrationRecorder.cs
+++ b/com.unity.ml-agents/Runtime/DemonstrationRecorder.cs
-using System.IO.Abstractions;
-using System.Text.RegularExpressions;
-using UnityEngine;
-using System.IO;
-
-namespace MLAgents
-{
-    /// <summary>
-    /// Demonstration Recorder Component.
-    /// </summary>
-    [RequireComponent(typeof(Agent))]
-    [AddComponentMenu("ML Agents/Demonstration Recorder", (int)MenuGroup.Default)]
-    public class DemonstrationRecorder : MonoBehaviour
-    {
-        [Tooltip("Whether or not to record demonstrations.")]
-        public bool record;
-
-        [Tooltip("Base demonstration file name. Will have numbers appended to make unique.")]
-        public string demonstrationName;
-
-        [Tooltip("Base directory to write the demo files. If null, will use {Application.dataPath}/Demonstrations.")]
-        public string demonstrationDirectory;
-
-        DemonstrationStore m_DemoStore;
-        internal const int MaxNameLength = 16;
-
-        const string k_ExtensionType = ".demo";
-        IFileSystem m_FileSystem;
-
-        Agent m_Agent;
-
-        void OnEnable()
-        {
-            m_Agent = GetComponent<Agent>();
-        }
-
-        void Update()
-        {
-            if (record)
-            {
-                LazyInitialize();
-            }
-        }
-
-        /// <summary>
-        /// Creates demonstration store for use in recording.
-        /// Has no effect if the demonstration store was already created.
-        /// </summary>
-        internal DemonstrationStore LazyInitialize(IFileSystem fileSystem = null)
-        {
-            if (m_DemoStore != null)
-            {
-                return m_DemoStore;
-            }
-
-            if (m_Agent == null)
-            {
-                m_Agent = GetComponent<Agent>();
-            }
-
-            m_FileSystem = fileSystem ?? new FileSystem();
-            var behaviorParams = GetComponent<BehaviorParameters>();
-            if (string.IsNullOrEmpty(demonstrationName))
-            {
-                demonstrationName = behaviorParams.behaviorName;
-            }
-            if (string.IsNullOrEmpty(demonstrationDirectory))
-            {
-                demonstrationDirectory = Path.Combine(Application.dataPath, "Demonstrations");
-            }
-
-            demonstrationName = SanitizeName(demonstrationName, MaxNameLength);
-            var filePath = MakeDemonstrationFilePath(m_FileSystem, demonstrationDirectory, demonstrationName);
-            var stream = m_FileSystem.File.Create(filePath);
-            m_DemoStore = new DemonstrationStore(stream);
-
-            m_DemoStore.Initialize(
-                demonstrationName,
-                behaviorParams.brainParameters,
-                behaviorParams.fullyQualifiedBehaviorName
-            );
-
-            AddDemonstrationStoreToAgent(m_DemoStore);
-
-            return m_DemoStore;
-        }
-
-        /// <summary>
-        /// Removes all characters except alphanumerics from demonstration name.
-        /// Shorten name if it is longer than the maxNameLength.
-        /// </summary>
-        internal static string SanitizeName(string demoName, int maxNameLength)
-        {
-            var rgx = new Regex("[^a-zA-Z0-9 -]");
-            demoName = rgx.Replace(demoName, "");
-            // If the string is too long, it will overflow the metadata.
-            if (demoName.Length > maxNameLength)
-            {
-                demoName = demoName.Substring(0, maxNameLength);
-            }
-            return demoName;
-        }
-
-        /// <summary>
-        /// Gets a unique path for the demonstrationName in the demonstrationDirectory.
-        /// </summary>
-        /// <param name="fileSystem"></param>
-        /// <param name="demonstrationDirectory"></param>
-        /// <param name="demonstrationName"></param>
-        /// <returns></returns>
-        internal static string MakeDemonstrationFilePath(
-            IFileSystem fileSystem, string demonstrationDirectory, string demonstrationName
-        )
-        {
-            // Create the directory if it doesn't already exist
-            if (!fileSystem.Directory.Exists(demonstrationDirectory))
-            {
-                fileSystem.Directory.CreateDirectory(demonstrationDirectory);
-            }
-
-            var literalName = demonstrationName;
-            var filePath = Path.Combine(demonstrationDirectory, literalName + k_ExtensionType);
-            var uniqueNameCounter = 0;
-            while (fileSystem.File.Exists(filePath))
-            {
-                // TODO should we use a timestamp instead of a counter here? This loops an increasing number of times
-                // as the number of demos increases.
-                literalName = demonstrationName + "_" + uniqueNameCounter;
-                filePath = Path.Combine(demonstrationDirectory, literalName + k_ExtensionType);
-                uniqueNameCounter++;
-            }
-
-            return filePath;
-        }
-
-        /// <summary>
-        /// Close the DemonstrationStore and remove it from the Agent.
-        /// Has no effect if the DemonstrationStore is already closed (or wasn't opened)
-        /// </summary>
-        public void Close()
-        {
-            if (m_DemoStore != null)
-            {
-                RemoveDemonstrationStoreFromAgent(m_DemoStore);
-
-                m_DemoStore.Close();
-                m_DemoStore = null;
-            }
-        }
-
-        /// <summary>
-        /// Clean up the DemonstrationStore when shutting down or destroying the Agent.
-        /// </summary>
-        void OnDestroy()
-        {
-            Close();
-        }
-
-        /// <summary>
-        /// Add additional DemonstrationStore to the Agent. It is still up to the user to Close this
-        /// DemonstrationStores when recording is done.
-        /// </summary>
-        /// <param name="demoStore"></param>
-        public void AddDemonstrationStoreToAgent(DemonstrationStore demoStore)
-        {
-            m_Agent.DemonstrationStores.Add(demoStore);
-        }
-
-        /// <summary>
-        /// Remove additional DemonstrationStore to the Agent. It is still up to the user to Close this
-        /// DemonstrationStores when recording is done.
-        /// </summary>
-        /// <param name="demoStore"></param>
-        public void RemoveDemonstrationStoreFromAgent(DemonstrationStore demoStore)
-        {
-            m_Agent.DemonstrationStores.Remove(demoStore);
-        }
-    }
-}
--- a/ml-agents/mlagents/trainers/ppo/models.py
+++ b/ml-agents/mlagents/trainers/ppo/models.py
-import logging
-from typing import Optional
-
-import numpy as np
-from mlagents.tf_utils import tf
-from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
-
-logger = logging.getLogger("mlagents.trainers")
-
-
-class PPOModel(LearningModel):
-    def __init__(
-        self,
-        brain,
-        lr=1e-4,
-        lr_schedule=LearningRateSchedule.LINEAR,
-        h_size=128,
-        epsilon=0.2,
-        beta=1e-3,
-        max_step=5e6,
-        normalize=False,
-        use_recurrent=False,
-        num_layers=2,
-        m_size=None,
-        seed=0,
-        stream_names=None,
-        vis_encode_type=EncoderType.SIMPLE,
-    ):
-        """
-        Takes a Unity environment and model-specific hyper-parameters and returns the
-        appropriate PPO agent model for the environment.
-        :param brain: brain parameters used to generate specific network graph.
-        :param lr: Learning rate.
-        :param lr_schedule: Learning rate decay schedule.
-        :param h_size: Size of hidden layers
-        :param epsilon: Value for policy-divergence threshold.
-        :param beta: Strength of entropy regularization.
-        :param max_step: Total number of training steps.
-        :param normalize: Whether to normalize vector observation input.
-        :param use_recurrent: Whether to use an LSTM layer in the network.
-        :param num_layers Number of hidden layers between encoded input and policy & value layers
-        :param m_size: Size of brain memory.
-        :param seed: Seed to use for initialization of model.
-        :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
-        :return: a sub-class of PPOAgent tailored to the environment.
-        """
-        LearningModel.__init__(
-            self, m_size, normalize, use_recurrent, brain, seed, stream_names
-        )
-
-        self.optimizer: Optional[tf.train.AdamOptimizer] = None
-        self.grads = None
-        self.update_batch: Optional[tf.Operation] = None
-
-        if num_layers < 1:
-            num_layers = 1
-        if brain.vector_action_space_type == "continuous":
-            self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
-            self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
-        else:
-            self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)
-        self.learning_rate = self.create_learning_rate(
-            lr_schedule, lr, self.global_step, max_step
-        )
-        self.create_losses(
-            self.log_probs,
-            self.old_log_probs,
-            self.value_heads,
-            self.entropy,
-            beta,
-            epsilon,
-            lr,
-            max_step,
-        )
-
-    def create_cc_actor_critic(
-        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
-    ) -> None:
-        """
-        Creates Continuous control actor-critic model.
-        :param h_size: Size of hidden linear layers.
-        :param num_layers: Number of hidden linear layers.
-        """
-        hidden_streams = self.create_observation_streams(
-            2, h_size, num_layers, vis_encode_type
-        )
-
-        if self.use_recurrent:
-            self.memory_in = tf.placeholder(
-                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
-            )
-            _half_point = int(self.m_size / 2)
-            hidden_policy, memory_policy_out = self.create_recurrent_encoder(
-                hidden_streams[0],
-                self.memory_in[:, :_half_point],
-                self.sequence_length,
-                name="lstm_policy",
-            )
-
-            hidden_value, memory_value_out = self.create_recurrent_encoder(
-                hidden_streams[1],
-                self.memory_in[:, _half_point:],
-                self.sequence_length,
-                name="lstm_value",
-            )
-            self.memory_out = tf.concat(
-                [memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
-            )
-        else:
-            hidden_policy = hidden_streams[0]
-            hidden_value = hidden_streams[1]
-
-        mu = tf.layers.dense(
-            hidden_policy,
-            self.act_size[0],
-            activation=None,
-            kernel_initializer=LearningModel.scaled_init(0.01),
-            reuse=tf.AUTO_REUSE,
-        )
-
-        self.log_sigma_sq = tf.get_variable(
-            "log_sigma_squared",
-            [self.act_size[0]],
-            dtype=tf.float32,
-            initializer=tf.zeros_initializer(),
-        )
-
-        sigma_sq = tf.exp(self.log_sigma_sq)
-
-        self.epsilon = tf.placeholder(
-            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
-        )
-        # Clip and scale output to ensure actions are always within [-1, 1] range.
-        self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
-        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
-        self.output = tf.identity(output_post, name="action")
-        self.selected_actions = tf.stop_gradient(output_post)
-
-        # Compute probability of model output.
-        all_probs = (
-            -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
-            - 0.5 * tf.log(2.0 * np.pi)
-            - 0.5 * self.log_sigma_sq
-        )
-
-        self.all_log_probs = tf.identity(all_probs, name="action_probs")
-
-        self.entropy = 0.5 * tf.reduce_mean(
-            tf.log(2 * np.pi * np.e) + self.log_sigma_sq
-        )
-
-        self.create_value_heads(self.stream_names, hidden_value)
-
-        self.all_old_log_probs = tf.placeholder(
-            shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
-        )
-
-        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
-        self.log_probs = tf.reduce_sum(
-            (tf.identity(self.all_log_probs)), axis=1, keepdims=True
-        )
-        self.old_log_probs = tf.reduce_sum(
-            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
-        )
-
-    def create_dc_actor_critic(
-        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
-    ) -> None:
-        """
-        Creates Discrete control actor-critic model.
-        :param h_size: Size of hidden linear layers.
-        :param num_layers: Number of hidden linear layers.
-        """
-        hidden_streams = self.create_observation_streams(
-            1, h_size, num_layers, vis_encode_type
-        )
-        hidden = hidden_streams[0]
-
-        if self.use_recurrent:
-            self.prev_action = tf.placeholder(
-                shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
-            )
-            prev_action_oh = tf.concat(
-                [
-                    tf.one_hot(self.prev_action[:, i], self.act_size[i])
-                    for i in range(len(self.act_size))
-                ],
-                axis=1,
-            )
-            hidden = tf.concat([hidden, prev_action_oh], axis=1)
-
-            self.memory_in = tf.placeholder(
-                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
-            )
-            hidden, memory_out = self.create_recurrent_encoder(
-                hidden, self.memory_in, self.sequence_length
-            )
-            self.memory_out = tf.identity(memory_out, name="recurrent_out")
-
-        policy_branches = []
-        for size in self.act_size:
-            policy_branches.append(
-                tf.layers.dense(
-                    hidden,
-                    size,
-                    activation=None,
-                    use_bias=False,
-                    kernel_initializer=LearningModel.scaled_init(0.01),
-                )
-            )
-
-        self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
-
-        self.action_masks = tf.placeholder(
-            shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
-        )
-        output, _, normalized_logits = self.create_discrete_action_masking_layer(
-            self.all_log_probs, self.action_masks, self.act_size
-        )
-
-        self.output = tf.identity(output)
-        self.normalized_logits = tf.identity(normalized_logits, name="action")
-
-        self.create_value_heads(self.stream_names, hidden)
-
-        self.action_holder = tf.placeholder(
-            shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
-        )
-        self.action_oh = tf.concat(
-            [
-                tf.one_hot(self.action_holder[:, i], self.act_size[i])
-                for i in range(len(self.act_size))
-            ],
-            axis=1,
-        )
-        self.selected_actions = tf.stop_gradient(self.action_oh)
-
-        self.all_old_log_probs = tf.placeholder(
-            shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
-        )
-        _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
-            self.all_old_log_probs, self.action_masks, self.act_size
-        )
-
-        action_idx = [0] + list(np.cumsum(self.act_size))
-
-        self.entropy = tf.reduce_sum(
-            (
-                tf.stack(
-                    [
-                        tf.nn.softmax_cross_entropy_with_logits_v2(
-                            labels=tf.nn.softmax(
-                                self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
-                            ),
-                            logits=self.all_log_probs[
-                                :, action_idx[i] : action_idx[i + 1]
-                            ],
-                        )
-                        for i in range(len(self.act_size))
-                    ],
-                    axis=1,
-                )
-            ),
-            axis=1,
-        )
-
-        self.log_probs = tf.reduce_sum(
-            (
-                tf.stack(
-                    [
-                        -tf.nn.softmax_cross_entropy_with_logits_v2(
-                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
-                            logits=normalized_logits[
-                                :, action_idx[i] : action_idx[i + 1]
-                            ],
-                        )
-                        for i in range(len(self.act_size))
-                    ],
-                    axis=1,
-                )
-            ),
-            axis=1,
-            keepdims=True,
-        )
-        self.old_log_probs = tf.reduce_sum(
-            (
-                tf.stack(
-                    [
-                        -tf.nn.softmax_cross_entropy_with_logits_v2(
-                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
-                            logits=old_normalized_logits[
-                                :, action_idx[i] : action_idx[i + 1]
-                            ],
-                        )
-                        for i in range(len(self.act_size))
-                    ],
-                    axis=1,
-                )
-            ),
-            axis=1,
-            keepdims=True,
-        )
-
-    def create_losses(
-        self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
-    ):
-        """
-        Creates training-specific Tensorflow ops for PPO models.
-        :param probs: Current policy probabilities
-        :param old_probs: Past policy probabilities
-        :param value_heads: Value estimate tensors from each value stream
-        :param beta: Entropy regularization strength
-        :param entropy: Current policy entropy
-        :param epsilon: Value for policy-divergence threshold
-        :param lr: Learning rate
-        :param max_step: Total number of training steps.
-        """
-        self.returns_holders = {}
-        self.old_values = {}
-        for name in value_heads.keys():
-            returns_holder = tf.placeholder(
-                shape=[None], dtype=tf.float32, name="{}_returns".format(name)
-            )
-            old_value = tf.placeholder(
-                shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
-            )
-            self.returns_holders[name] = returns_holder
-            self.old_values[name] = old_value
-        self.advantage = tf.placeholder(
-            shape=[None], dtype=tf.float32, name="advantages"
-        )
-        advantage = tf.expand_dims(self.advantage, -1)
-
-        decay_epsilon = tf.train.polynomial_decay(
-            epsilon, self.global_step, max_step, 0.1, power=1.0
-        )
-        decay_beta = tf.train.polynomial_decay(
-            beta, self.global_step, max_step, 1e-5, power=1.0
-        )
-
-        value_losses = []
-        for name, head in value_heads.items():
-            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
-                tf.reduce_sum(head, axis=1) - self.old_values[name],
-                -decay_epsilon,
-                decay_epsilon,
-            )
-            v_opt_a = tf.squared_difference(
-                self.returns_holders[name], tf.reduce_sum(head, axis=1)
-            )
-            v_opt_b = tf.squared_difference(
-                self.returns_holders[name], clipped_value_estimate
-            )
-            value_loss = tf.reduce_mean(
-                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1]
-            )
-            value_losses.append(value_loss)
-        self.value_loss = tf.reduce_mean(value_losses)
-
-        r_theta = tf.exp(probs - old_probs)
-        p_opt_a = r_theta * advantage
-        p_opt_b = (
-            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
-            * advantage
-        )
-        self.policy_loss = -tf.reduce_mean(
-            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
-        )
-        # For cleaner stats reporting
-        self.abs_policy_loss = tf.abs(self.policy_loss)
-
-        self.loss = (
-            self.policy_loss
-            + 0.5 * self.value_loss
-            - decay_beta
-            * tf.reduce_mean(tf.dynamic_partition(entropy, self.mask, 2)[1])
-        )
-
-    def create_ppo_optimizer(self):
-        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
-        self.grads = self.optimizer.compute_gradients(self.loss)
-        self.update_batch = self.optimizer.minimize(self.loss)
--- a/ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py
+++ b/ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py
-import logging
-from typing import Any, Dict, List, Optional
-
-from mlagents.tf_utils import tf
-
-from tensorflow.python.client import device_lib
-from mlagents.trainers.brain import BrainParameters
-from mlagents_envs.timers import timed
-from mlagents.trainers.models import EncoderType, LearningRateSchedule
-from mlagents.trainers.ppo.policy import PPOPolicy
-from mlagents.trainers.ppo.models import PPOModel
-from mlagents.trainers.components.reward_signals import RewardSignal
-from mlagents.trainers.components.reward_signals.reward_signal_factory import (
-    create_reward_signal,
-)
-
-# Variable scope in which created variables will be placed under
-TOWER_SCOPE_NAME = "tower"
-
-logger = logging.getLogger("mlagents.trainers")
-
-
-class MultiGpuPPOPolicy(PPOPolicy):
-    def __init__(
-        self,
-        seed: int,
-        brain: BrainParameters,
-        trainer_params: Dict[str, Any],
-        is_training: bool,
-        load: bool,
-    ):
-        self.towers: List[PPOModel] = []
-        self.devices: List[str] = []
-        self.model: Optional[PPOModel] = None
-        self.total_policy_loss: Optional[tf.Tensor] = None
-        self.reward_signal_towers: List[Dict[str, RewardSignal]] = []
-        self.reward_signals: Dict[str, RewardSignal] = {}
-
-        super().__init__(seed, brain, trainer_params, is_training, load)
-
-    def create_model(
-        self, brain, trainer_params, reward_signal_configs, is_training, load, seed
-    ):
-        """
-        Create PPO models, one on each device
-        :param brain: Assigned Brain object.
-        :param trainer_params: Defined training parameters.
-        :param reward_signal_configs: Reward signal config
-        :param seed: Random seed.
-        """
-        self.devices = get_devices()
-
-        with self.graph.as_default():
-            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
-                for device in self.devices:
-                    with tf.device(device):
-                        self.towers.append(
-                            PPOModel(
-                                brain=brain,
-                                lr=float(trainer_params["learning_rate"]),
-                                lr_schedule=LearningRateSchedule(
-                                    trainer_params.get(
-                                        "learning_rate_schedule", "linear"
-                                    )
-                                ),
-                                h_size=int(trainer_params["hidden_units"]),
-                                epsilon=float(trainer_params["epsilon"]),
-                                beta=float(trainer_params["beta"]),
-                                max_step=float(trainer_params["max_steps"]),
-                                normalize=trainer_params["normalize"],
-                                use_recurrent=trainer_params["use_recurrent"],
-                                num_layers=int(trainer_params["num_layers"]),
-                                m_size=self.m_size,
-                                seed=seed,
-                                stream_names=list(reward_signal_configs.keys()),
-                                vis_encode_type=EncoderType(
-                                    trainer_params.get("vis_encode_type", "simple")
-                                ),
-                            )
-                        )
-                        self.towers[-1].create_ppo_optimizer()
-            self.model = self.towers[0]
-            avg_grads = self.average_gradients([t.grads for t in self.towers])
-            update_batch = self.model.optimizer.apply_gradients(avg_grads)
-
-            avg_value_loss = tf.reduce_mean(
-                tf.stack([model.value_loss for model in self.towers]), 0
-            )
-            avg_policy_loss = tf.reduce_mean(
-                tf.stack([model.policy_loss for model in self.towers]), 0
-            )
-
-        self.inference_dict.update(
-            {
-                "action": self.model.output,
-                "log_probs": self.model.all_log_probs,
-                "value_heads": self.model.value_heads,
-                "value": self.model.value,
-                "entropy": self.model.entropy,
-                "learning_rate": self.model.learning_rate,
-            }
-        )
-        if self.use_continuous_act:
-            self.inference_dict["pre_action"] = self.model.output_pre
-        if self.use_recurrent:
-            self.inference_dict["memory_out"] = self.model.memory_out
-        if (
-            is_training
-            and self.use_vec_obs
-            and trainer_params["normalize"]
-            and not load
-        ):
-            self.inference_dict["update_mean"] = self.model.update_normalization
-
-        self.total_policy_loss = self.model.abs_policy_loss
-        self.update_dict.update(
-            {
-                "value_loss": avg_value_loss,
-                "policy_loss": avg_policy_loss,
-                "update_batch": update_batch,
-            }
-        )
-
-    def create_reward_signals(self, reward_signal_configs):
-        """
-        Create reward signals
-        :param reward_signal_configs: Reward signal config.
-        """
-        with self.graph.as_default():
-            with tf.variable_scope(TOWER_SCOPE_NAME, reuse=tf.AUTO_REUSE):
-                for device_id, device in enumerate(self.devices):
-                    with tf.device(device):
-                        reward_tower = {}
-                        for reward_signal, config in reward_signal_configs.items():
-                            reward_tower[reward_signal] = create_reward_signal(
-                                self, self.towers[device_id], reward_signal, config
-                            )
-                            for k, v in reward_tower[reward_signal].update_dict.items():
-                                self.update_dict[k + "_" + str(device_id)] = v
-                        self.reward_signal_towers.append(reward_tower)
-                for _, reward_tower in self.reward_signal_towers[0].items():
-                    for _, update_key in reward_tower.stats_name_to_update_name.items():
-                        all_reward_signal_stats = tf.stack(
-                            [
-                                self.update_dict[update_key + "_" + str(i)]
-                                for i in range(len(self.towers))
-                            ]
-                        )
-                        mean_reward_signal_stats = tf.reduce_mean(
-                            all_reward_signal_stats, 0
-                        )
-                        self.update_dict.update({update_key: mean_reward_signal_stats})
-
-            self.reward_signals = self.reward_signal_towers[0]
-
-    @timed
-    def update(self, mini_batch, num_sequences):
-        """
-        Updates model using buffer.
-        :param n_sequences: Number of trajectories in batch.
-        :param mini_batch: Experience batch.
-        :return: Output from update process.
-        """
-        feed_dict = {}
-        stats_needed = self.stats_name_to_update_name
-
-        device_batch_size = num_sequences // len(self.devices)
-        device_batches = []
-        for i in range(len(self.devices)):
-            device_batches.append(
-                {
-                    k: v[
-                        i * device_batch_size : i * device_batch_size
-                        + device_batch_size
-                    ]
-                    for (k, v) in mini_batch.items()
-                }
-            )
-
-        for batch, tower, reward_tower in zip(
-            device_batches, self.towers, self.reward_signal_towers
-        ):
-            feed_dict.update(self.construct_feed_dict(tower, batch, num_sequences))
-            stats_needed.update(self.stats_name_to_update_name)
-            for _, reward_signal in reward_tower.items():
-                feed_dict.update(
-                    reward_signal.prepare_update(tower, batch, num_sequences)
-                )
-                stats_needed.update(reward_signal.stats_name_to_update_name)
-
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        update_stats = {}
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        return update_stats
-
-    def average_gradients(self, tower_grads):
-        """
-        Average gradients from all towers
-        :param tower_grads: Gradients from all towers
-        """
-        average_grads = []
-        for grad_and_vars in zip(*tower_grads):
-            grads = [g for g, _ in grad_and_vars if g is not None]
-            if not grads:
-                continue
-            avg_grad = tf.reduce_mean(tf.stack(grads), 0)
-            var = grad_and_vars[0][1]
-            average_grads.append((avg_grad, var))
-        return average_grads
-
-
-def get_devices() -> List[str]:
-    """
-    Get all available GPU devices
-    """
-    local_device_protos = device_lib.list_local_devices()
-    devices = [x.name for x in local_device_protos if x.device_type == "GPU"]
-    return devices
--- a/ml-agents/mlagents/trainers/ppo/policy.py
+++ b/ml-agents/mlagents/trainers/ppo/policy.py
-import logging
-import numpy as np
-from typing import Any, Dict, Optional, List
-
-from mlagents.tf_utils import tf
-
-from mlagents_envs.timers import timed
-from mlagents_envs.base_env import BatchedStepResult
-from mlagents.trainers.brain import BrainParameters
-from mlagents.trainers.models import EncoderType, LearningRateSchedule
-from mlagents.trainers.ppo.models import PPOModel
-from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.components.reward_signals.reward_signal_factory import (
-    create_reward_signal,
-)
-from mlagents.trainers.components.bc.module import BCModule
-
-logger = logging.getLogger("mlagents.trainers")
-
-
-class PPOPolicy(TFPolicy):
-    def __init__(
-        self,
-        seed: int,
-        brain: BrainParameters,
-        trainer_params: Dict[str, Any],
-        is_training: bool,
-        load: bool,
-    ):
-        """
-        Policy for Proximal Policy Optimization Networks.
-        :param seed: Random seed.
-        :param brain: Assigned Brain object.
-        :param trainer_params: Defined training parameters.
-        :param is_training: Whether the model should be trained.
-        :param load: Whether a pre-trained model will be loaded or a new one created.
-        """
-        super().__init__(seed, brain, trainer_params)
-
-        reward_signal_configs = trainer_params["reward_signals"]
-        self.inference_dict: Dict[str, tf.Tensor] = {}
-        self.update_dict: Dict[str, tf.Tensor] = {}
-        self.stats_name_to_update_name = {
-            "Losses/Value Loss": "value_loss",
-            "Losses/Policy Loss": "policy_loss",
-        }
-
-        self.create_model(
-            brain, trainer_params, reward_signal_configs, is_training, load, seed
-        )
-        self.create_reward_signals(reward_signal_configs)
-
-        with self.graph.as_default():
-            self.bc_module: Optional[BCModule] = None
-            # Create pretrainer if needed
-            if "behavioral_cloning" in trainer_params:
-                BCModule.check_config(trainer_params["behavioral_cloning"])
-                self.bc_module = BCModule(
-                    self,
-                    policy_learning_rate=trainer_params["learning_rate"],
-                    default_batch_size=trainer_params["batch_size"],
-                    default_num_epoch=3,
-                    **trainer_params["behavioral_cloning"],
-                )
-
-        if load:
-            self._load_graph()
-        else:
-            self._initialize_graph()
-
-    def create_model(
-        self, brain, trainer_params, reward_signal_configs, is_training, load, seed
-    ):
-        """
-        Create PPO model
-        :param brain: Assigned Brain object.
-        :param trainer_params: Defined training parameters.
-        :param reward_signal_configs: Reward signal config
-        :param seed: Random seed.
-        """
-        with self.graph.as_default():
-            self.model = PPOModel(
-                brain=brain,
-                lr=float(trainer_params["learning_rate"]),
-                lr_schedule=LearningRateSchedule(
-                    trainer_params.get("learning_rate_schedule", "linear")
-                ),
-                h_size=int(trainer_params["hidden_units"]),
-                epsilon=float(trainer_params["epsilon"]),
-                beta=float(trainer_params["beta"]),
-                max_step=float(trainer_params["max_steps"]),
-                normalize=trainer_params["normalize"],
-                use_recurrent=trainer_params["use_recurrent"],
-                num_layers=int(trainer_params["num_layers"]),
-                m_size=self.m_size,
-                seed=seed,
-                stream_names=list(reward_signal_configs.keys()),
-                vis_encode_type=EncoderType(
-                    trainer_params.get("vis_encode_type", "simple")
-                ),
-            )
-            self.model.create_ppo_optimizer()
-
-        self.inference_dict.update(
-            {
-                "action": self.model.output,
-                "log_probs": self.model.all_log_probs,
-                "entropy": self.model.entropy,
-                "learning_rate": self.model.learning_rate,
-            }
-        )
-        if self.use_continuous_act:
-            self.inference_dict["pre_action"] = self.model.output_pre
-        if self.use_recurrent:
-            self.inference_dict["memory_out"] = self.model.memory_out
-
-        self.total_policy_loss = self.model.abs_policy_loss
-        self.update_dict.update(
-            {
-                "value_loss": self.model.value_loss,
-                "policy_loss": self.total_policy_loss,
-                "update_batch": self.model.update_batch,
-            }
-        )
-
-    def create_reward_signals(self, reward_signal_configs):
-        """
-        Create reward signals
-        :param reward_signal_configs: Reward signal config.
-        """
-        self.reward_signals = {}
-        with self.graph.as_default():
-            # Create reward signals
-            for reward_signal, config in reward_signal_configs.items():
-                self.reward_signals[reward_signal] = create_reward_signal(
-                    self, self.model, reward_signal, config
-                )
-                self.update_dict.update(self.reward_signals[reward_signal].update_dict)
-
-    @timed
-    def evaluate(
-        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
-    ) -> Dict[str, Any]:
-        """
-        Evaluates policy for the agent experiences provided.
-        :param batched_step_result: BatchedStepResult object containing inputs.
-        :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
-        :return: Outputs from network as defined by self.inference_dict.
-        """
-        feed_dict = {
-            self.model.batch_size: batched_step_result.n_agents(),
-            self.model.sequence_length: 1,
-        }
-        epsilon = None
-        if self.use_recurrent:
-            if not self.use_continuous_act:
-                feed_dict[self.model.prev_action] = self.retrieve_previous_action(
-                    global_agent_ids
-                )
-            feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
-        if self.use_continuous_act:
-            epsilon = np.random.normal(
-                size=(batched_step_result.n_agents(), self.model.act_size[0])
-            )
-            feed_dict[self.model.epsilon] = epsilon
-        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
-        run_out = self._execute_model(feed_dict, self.inference_dict)
-        return run_out
-
-    @timed
-    def update(self, mini_batch, num_sequences):
-        """
-        Performs update on model.
-        :param mini_batch: Batch of experiences.
-        :param num_sequences: Number of sequences to process.
-        :return: Results of update.
-        """
-        feed_dict = self.construct_feed_dict(self.model, mini_batch, num_sequences)
-        stats_needed = self.stats_name_to_update_name
-        update_stats = {}
-        # Collect feed dicts for all reward signals.
-        for _, reward_signal in self.reward_signals.items():
-            feed_dict.update(
-                reward_signal.prepare_update(self.model, mini_batch, num_sequences)
-            )
-            stats_needed.update(reward_signal.stats_name_to_update_name)
-
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        return update_stats
-
-    def construct_feed_dict(self, model, mini_batch, num_sequences):
-        feed_dict = {
-            model.batch_size: num_sequences,
-            model.sequence_length: self.sequence_length,
-            model.mask_input: mini_batch["masks"],
-            model.advantage: mini_batch["advantages"],
-            model.all_old_log_probs: mini_batch["action_probs"],
-        }
-        for name in self.reward_signals:
-            feed_dict[model.returns_holders[name]] = mini_batch[
-                "{}_returns".format(name)
-            ]
-            feed_dict[model.old_values[name]] = mini_batch[
-                "{}_value_estimates".format(name)
-            ]
-
-        if self.use_continuous_act:
-            feed_dict[model.output_pre] = mini_batch["actions_pre"]
-        else:
-            feed_dict[model.action_holder] = mini_batch["actions"]
-            if self.use_recurrent:
-                feed_dict[model.prev_action] = mini_batch["prev_action"]
-            feed_dict[model.action_masks] = mini_batch["action_mask"]
-        if self.use_vec_obs:
-            feed_dict[model.vector_in] = mini_batch["vector_obs"]
-        if self.model.vis_obs_size > 0:
-            for i, _ in enumerate(self.model.visual_in):
-                feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
-        if self.use_recurrent:
-            mem_in = [
-                mini_batch["memory"][i]
-                for i in range(0, len(mini_batch["memory"]), self.sequence_length)
-            ]
-            feed_dict[model.memory_in] = mem_in
-        return feed_dict
--- a/ml-agents/mlagents/trainers/sac/models.py
+++ b/ml-agents/mlagents/trainers/sac/models.py
--- a/ml-agents/mlagents/trainers/sac/policy.py
+++ b/ml-agents/mlagents/trainers/sac/policy.py
-import logging
-from typing import Dict, Any, Optional, Mapping, List
-import numpy as np
-from mlagents.tf_utils import tf
-
-from mlagents_envs.timers import timed
-from mlagents_envs.base_env import BatchedStepResult
-from mlagents.trainers.brain import BrainParameters
-from mlagents.trainers.models import EncoderType, LearningRateSchedule
-from mlagents.trainers.sac.models import SACModel
-from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.components.reward_signals.reward_signal_factory import (
-    create_reward_signal,
-)
-from mlagents.trainers.components.reward_signals import RewardSignal
-from mlagents.trainers.components.bc.module import BCModule
-
-logger = logging.getLogger("mlagents.trainers")
-
-
-class SACPolicy(TFPolicy):
-    def __init__(
-        self,
-        seed: int,
-        brain: BrainParameters,
-        trainer_params: Dict[str, Any],
-        is_training: bool,
-        load: bool,
-    ) -> None:
-        """
-        Policy for Proximal Policy Optimization Networks.
-        :param seed: Random seed.
-        :param brain: Assigned Brain object.
-        :param trainer_params: Defined training parameters.
-        :param is_training: Whether the model should be trained.
-        :param load: Whether a pre-trained model will be loaded or a new one created.
-        """
-        super().__init__(seed, brain, trainer_params)
-
-        reward_signal_configs = {}
-        for key, rsignal in trainer_params["reward_signals"].items():
-            if type(rsignal) is dict:
-                reward_signal_configs[key] = rsignal
-
-        self.inference_dict: Dict[str, tf.Tensor] = {}
-        self.update_dict: Dict[str, tf.Tensor] = {}
-        self.create_model(
-            brain, trainer_params, reward_signal_configs, is_training, load, seed
-        )
-        self.create_reward_signals(reward_signal_configs)
-
-        self.stats_name_to_update_name = {
-            "Losses/Value Loss": "value_loss",
-            "Losses/Policy Loss": "policy_loss",
-            "Losses/Q1 Loss": "q1_loss",
-            "Losses/Q2 Loss": "q2_loss",
-            "Policy/Entropy Coeff": "entropy_coef",
-        }
-
-        with self.graph.as_default():
-            # Create pretrainer if needed
-            self.bc_module: Optional[BCModule] = None
-            if "behavioral_cloning" in trainer_params:
-                BCModule.check_config(trainer_params["behavioral_cloning"])
-                self.bc_module = BCModule(
-                    self,
-                    policy_learning_rate=trainer_params["learning_rate"],
-                    default_batch_size=trainer_params["batch_size"],
-                    default_num_epoch=1,
-                    samples_per_update=trainer_params["batch_size"],
-                    **trainer_params["behavioral_cloning"],
-                )
-                # SAC-specific setting - we don't want to do a whole epoch each update!
-                if "samples_per_update" in trainer_params["behavioral_cloning"]:
-                    logger.warning(
-                        "Pretraining: Samples Per Update is not a valid setting for SAC."
-                    )
-                    self.bc_module.samples_per_update = 1
-
-        if load:
-            self._load_graph()
-        else:
-            self._initialize_graph()
-            self.sess.run(self.model.target_init_op)
-
-        # Disable terminal states for certain reward signals to avoid survivor bias
-        for name, reward_signal in self.reward_signals.items():
-            if not reward_signal.use_terminal_states:
-                self.sess.run(self.model.disable_use_dones[name])
-
-    def create_model(
-        self,
-        brain: BrainParameters,
-        trainer_params: Dict[str, Any],
-        reward_signal_configs: Dict[str, Any],
-        is_training: bool,
-        load: bool,
-        seed: int,
-    ) -> None:
-        with self.graph.as_default():
-            self.model = SACModel(
-                brain,
-                lr=float(trainer_params["learning_rate"]),
-                lr_schedule=LearningRateSchedule(
-                    trainer_params.get("learning_rate_schedule", "constant")
-                ),
-                h_size=int(trainer_params["hidden_units"]),
-                init_entcoef=float(trainer_params["init_entcoef"]),
-                max_step=float(trainer_params["max_steps"]),
-                normalize=trainer_params["normalize"],
-                use_recurrent=trainer_params["use_recurrent"],
-                num_layers=int(trainer_params["num_layers"]),
-                m_size=self.m_size,
-                seed=seed,
-                stream_names=list(reward_signal_configs.keys()),
-                tau=float(trainer_params["tau"]),
-                gammas=[_val["gamma"] for _val in reward_signal_configs.values()],
-                vis_encode_type=EncoderType(
-                    trainer_params.get("vis_encode_type", "simple")
-                ),
-            )
-            self.model.create_sac_optimizers()
-
-        self.inference_dict.update(
-            {
-                "action": self.model.output,
-                "log_probs": self.model.all_log_probs,
-                "entropy": self.model.entropy,
-                "learning_rate": self.model.learning_rate,
-            }
-        )
-        if self.use_continuous_act:
-            self.inference_dict["pre_action"] = self.model.output_pre
-        if self.use_recurrent:
-            self.inference_dict["memory_out"] = self.model.memory_out
-
-        self.update_dict.update(
-            {
-                "value_loss": self.model.total_value_loss,
-                "policy_loss": self.model.policy_loss,
-                "q1_loss": self.model.q1_loss,
-                "q2_loss": self.model.q2_loss,
-                "entropy_coef": self.model.ent_coef,
-                "entropy": self.model.entropy,
-                "update_batch": self.model.update_batch_policy,
-                "update_value": self.model.update_batch_value,
-                "update_entropy": self.model.update_batch_entropy,
-            }
-        )
-
-    def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
-        """
-        Create reward signals
-        :param reward_signal_configs: Reward signal config.
-        """
-        self.reward_signals: Dict[str, RewardSignal] = {}
-        with self.graph.as_default():
-            # Create reward signals
-            for reward_signal, config in reward_signal_configs.items():
-                if type(config) is dict:
-                    self.reward_signals[reward_signal] = create_reward_signal(
-                        self, self.model, reward_signal, config
-                    )
-
-    def evaluate(
-        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
-    ) -> Dict[str, np.ndarray]:
-        """
-        Evaluates policy for the agent experiences provided.
-        :param batched_step_result: BatchedStepResult object containing inputs.
-        :return: Outputs from network as defined by self.inference_dict.
-        """
-        feed_dict = {
-            self.model.batch_size: batched_step_result.n_agents(),
-            self.model.sequence_length: 1,
-        }
-        if self.use_recurrent:
-            if not self.use_continuous_act:
-                feed_dict[self.model.prev_action] = self.retrieve_previous_action(
-                    global_agent_ids
-                )
-            feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
-
-        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
-        run_out = self._execute_model(feed_dict, self.inference_dict)
-        return run_out
-
-    @timed
-    def update(
-        self, mini_batch: Dict[str, Any], num_sequences: int
-    ) -> Dict[str, float]:
-        """
-        Updates model using buffer.
-        :param num_sequences: Number of trajectories in batch.
-        :param mini_batch: Experience batch.
-        :param update_target: Whether or not to update target value network
-        :param reward_signal_mini_batches: Minibatches to use for updating the reward signals,
-            indexed by name. If none, don't update the reward signals.
-        :return: Output from update process.
-        """
-        feed_dict = self.construct_feed_dict(self.model, mini_batch, num_sequences)
-        stats_needed = self.stats_name_to_update_name
-        update_stats: Dict[str, float] = {}
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        # Update target network. By default, target update happens at every policy update.
-        self.sess.run(self.model.target_update_op)
-        return update_stats
-
-    def update_reward_signals(
-        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
-    ) -> Dict[str, float]:
-        """
-        Only update the reward signals.
-        :param reward_signal_mini_batches: Minibatches to use for updating the reward signals,
-            indexed by name. If none, don't update the reward signals.
-        """
-        # Collect feed dicts for all reward signals.
-        feed_dict: Dict[tf.Tensor, Any] = {}
-        update_dict: Dict[str, tf.Tensor] = {}
-        update_stats: Dict[str, float] = {}
-        stats_needed: Dict[str, str] = {}
-        if reward_signal_minibatches:
-            self.add_reward_signal_dicts(
-                feed_dict,
-                update_dict,
-                stats_needed,
-                reward_signal_minibatches,
-                num_sequences,
-            )
-        update_vals = self._execute_model(feed_dict, update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        return update_stats
-
-    def add_reward_signal_dicts(
-        self,
-        feed_dict: Dict[tf.Tensor, Any],
-        update_dict: Dict[str, tf.Tensor],
-        stats_needed: Dict[str, str],
-        reward_signal_minibatches: Mapping[str, Dict],
-        num_sequences: int,
-    ) -> None:
-        """
-        Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
-        :param feed_dict: Feed dict needed update
-        :param update_dit: Update dict that needs update
-        :param stats_needed: Stats needed to get from the update.
-        :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
-            indexed by name.
-        """
-        for name, r_mini_batch in reward_signal_minibatches.items():
-            feed_dict.update(
-                self.reward_signals[name].prepare_update(
-                    self.model, r_mini_batch, num_sequences
-                )
-            )
-            update_dict.update(self.reward_signals[name].update_dict)
-            stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
-
-    def construct_feed_dict(
-        self, model: SACModel, mini_batch: Dict[str, Any], num_sequences: int
-    ) -> Dict[tf.Tensor, Any]:
-        """
-        Builds the feed dict for updating the SAC model.
-        :param model: The model to update. May be different when, e.g. using multi-GPU.
-        :param mini_batch: Mini-batch to use to update.
-        :param num_sequences: Number of LSTM sequences in mini_batch.
-        """
-        feed_dict = {
-            self.model.batch_size: num_sequences,
-            self.model.sequence_length: self.sequence_length,
-            self.model.next_sequence_length: self.sequence_length,
-            self.model.mask_input: mini_batch["masks"],
-        }
-        for name in self.reward_signals:
-            feed_dict[model.rewards_holders[name]] = mini_batch[
-                "{}_rewards".format(name)
-            ]
-
-        if self.use_continuous_act:
-            feed_dict[model.action_holder] = mini_batch["actions"]
-        else:
-            feed_dict[model.action_holder] = mini_batch["actions"]
-            if self.use_recurrent:
-                feed_dict[model.prev_action] = mini_batch["prev_action"]
-            feed_dict[model.action_masks] = mini_batch["action_mask"]
-        if self.use_vec_obs:
-            feed_dict[model.vector_in] = mini_batch["vector_obs"]
-            feed_dict[model.next_vector_in] = mini_batch["next_vector_in"]
-        if self.model.vis_obs_size > 0:
-            for i, _ in enumerate(model.visual_in):
-                _obs = mini_batch["visual_obs%d" % i]
-                feed_dict[model.visual_in[i]] = _obs
-            for i, _ in enumerate(model.next_visual_in):
-                _obs = mini_batch["next_visual_obs%d" % i]
-                feed_dict[model.next_visual_in[i]] = _obs
-        if self.use_recurrent:
-            mem_in = [
-                mini_batch["memory"][i]
-                for i in range(0, len(mini_batch["memory"]), self.sequence_length)
-            ]
-            # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
-            offset = 1 if self.sequence_length > 1 else 0
-            next_mem_in = [
-                mini_batch["memory"][i][
-                    : self.m_size // 4
-                ]  # only pass value part of memory to target network
-                for i in range(offset, len(mini_batch["memory"]), self.sequence_length)
-            ]
-            feed_dict[model.memory_in] = mem_in
-            feed_dict[model.next_memory_in] = next_mem_in
-        feed_dict[model.dones_holder] = mini_batch["done"]
-        return feed_dict
--- a/ml-agents/mlagents/trainers/tests/test_multigpu.py
+++ b/ml-agents/mlagents/trainers/tests/test_multigpu.py
-from unittest import mock
-import pytest
-
-from mlagents.tf_utils import tf
-import yaml
-
-from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy
-from mlagents.trainers.tests.mock_brain import create_mock_brainparams
-
-
-@pytest.fixture
-def dummy_config():
-    return yaml.safe_load(
-        """
-        trainer: ppo
-        batch_size: 32
-        beta: 5.0e-3
-        buffer_size: 512
-        epsilon: 0.2
-        hidden_units: 128
-        lambd: 0.95
-        learning_rate: 3.0e-4
-        max_steps: 5.0e4
-        normalize: true
-        num_epoch: 5
-        num_layers: 2
-        time_horizon: 64
-        sequence_length: 64
-        summary_freq: 1000
-        use_recurrent: false
-        memory_size: 8
-        curiosity_strength: 0.0
-        curiosity_enc_size: 1
-        reward_signals:
-          extrinsic:
-            strength: 1.0
-            gamma: 0.99
-        """
-    )
-
-
-@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
-def test_create_model(mock_get_devices, dummy_config):
-    tf.reset_default_graph()
-    mock_get_devices.return_value = [
-        "/device:GPU:0",
-        "/device:GPU:1",
-        "/device:GPU:2",
-        "/device:GPU:3",
-    ]
-
-    trainer_parameters = dummy_config
-    trainer_parameters["model_path"] = ""
-    trainer_parameters["keep_checkpoints"] = 3
-    brain = create_mock_brainparams()
-
-    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
-    assert len(policy.towers) == len(mock_get_devices.return_value)
-
-
-@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
-def test_average_gradients(mock_get_devices, dummy_config):
-    tf.reset_default_graph()
-    mock_get_devices.return_value = [
-        "/device:GPU:0",
-        "/device:GPU:1",
-        "/device:GPU:2",
-        "/device:GPU:3",
-    ]
-
-    trainer_parameters = dummy_config
-    trainer_parameters["model_path"] = ""
-    trainer_parameters["keep_checkpoints"] = 3
-    brain = create_mock_brainparams()
-    with tf.Session() as sess:
-        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
-        var = tf.Variable(0)
-        tower_grads = [
-            [(tf.constant(0.1), var)],
-            [(tf.constant(0.2), var)],
-            [(tf.constant(0.3), var)],
-            [(tf.constant(0.4), var)],
-        ]
-        avg_grads = policy.average_gradients(tower_grads)
-
-        init = tf.global_variables_initializer()
-        sess.run(init)
-        run_out = sess.run(avg_grads)
-    assert run_out == [(0.25, 0)]
-
-
-@mock.patch("mlagents.trainers.tf_policy.TFPolicy._execute_model")
-@mock.patch("mlagents.trainers.ppo.policy.PPOPolicy.construct_feed_dict")
-@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
-def test_update(
-    mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config
-):
-    tf.reset_default_graph()
-    mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
-    mock_construct_feed_dict.return_value = {}
-    mock_execute_model.return_value = {
-        "value_loss": 0.1,
-        "policy_loss": 0.3,
-        "update_batch": None,
-    }
-
-    trainer_parameters = dummy_config
-    trainer_parameters["model_path"] = ""
-    trainer_parameters["keep_checkpoints"] = 3
-    brain = create_mock_brainparams()
-    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
-    mock_mini_batch = mock.Mock()
-    mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
-    run_out = policy.update(mock_mini_batch, 1)
-
-    assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
-    assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
-    assert run_out["Losses/Value Loss"] == 0.1
-    assert run_out["Losses/Policy Loss"] == 0.3
-
-
-if __name__ == "__main__":
-    pytest.main()
--- a//com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs.meta
+++ b//com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs.meta
--- a//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs.meta
+++ b//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs.meta
--- a//com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs
+++ b//com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs
--- a//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
+++ b//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
--- a//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs.meta
+++ b//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs.meta