
Merge branch 'master' into sensitivity

/sensitivity
Andrew Cohen, 4 years ago
Current commit
06e4356c
64 files changed, with 977 insertions and 561 deletions
  1. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)
  2. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)
  3. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)
  4. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)
  5. com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)
  6. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)
  7. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)
  8. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)
  9. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)
  10. com.unity.ml-agents/CHANGELOG.md (5)
  11. docs/Getting-Started.md (2)
  12. docs/Training-Configuration-File.md (2)
  13. docs/Training-on-Microsoft-Azure.md (2)
  14. docs/Using-Docker.md (4)
  15. docs/Using-Tensorboard.md (2)
  16. docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)
  17. ml-agents-envs/setup.py (2)
  18. ml-agents/mlagents/model_serialization.py (2)
  19. ml-agents/mlagents/trainers/agent_processor.py (5)
  20. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)
  21. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)
  22. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)
  23. ml-agents/mlagents/trainers/env_manager.py (29)
  24. ml-agents/mlagents/trainers/ghost/trainer.py (11)
  25. ml-agents/mlagents/trainers/optimizer/optimizer.py (3)
  26. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)
  27. ml-agents/mlagents/trainers/policy/policy.py (166)
  28. ml-agents/mlagents/trainers/policy/tf_policy.py (334)
  29. ml-agents/mlagents/trainers/ppo/optimizer.py (2)
  30. ml-agents/mlagents/trainers/ppo/trainer.py (21)
  31. ml-agents/mlagents/trainers/sac/network.py (4)
  32. ml-agents/mlagents/trainers/sac/optimizer.py (6)
  33. ml-agents/mlagents/trainers/sac/trainer.py (17)
  34. ml-agents/mlagents/trainers/settings.py (12)
  35. ml-agents/mlagents/trainers/stats.py (17)
  36. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (2)
  37. ml-agents/mlagents/trainers/tests/test_bcmodule.py (7)
  38. ml-agents/mlagents/trainers/tests/test_distributions.py (2)
  39. ml-agents/mlagents/trainers/tests/test_models.py (2)
  40. ml-agents/mlagents/trainers/tests/test_nn_policy.py (17)
  41. ml-agents/mlagents/trainers/tests/test_ppo.py (15)
  42. ml-agents/mlagents/trainers/tests/test_reward_signals.py (6)
  43. ml-agents/mlagents/trainers/tests/test_sac.py (13)
  44. ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)
  45. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  46. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (3)
  47. ml-agents/mlagents/trainers/trainer/rl_trainer.py (4)
  48. ml-agents/mlagents/trainers/trainer/trainer.py (9)
  49. ml-agents/mlagents/trainers/trainer_controller.py (36)
  50. ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (2)
  51. ml-agents/mlagents/trainers/tf/models.py (13)
  52. ml-agents/mlagents/trainers/tf/distributions.py (2)
  53. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs (147)
  54. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs.meta (11)
  55. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs (27)
  56. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs.meta (11)
  57. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs (62)
  58. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs.meta (11)
  59. ml-agents/mlagents/trainers/tf/__init__.py (0)
  60. ml-agents/mlagents/trainers/policy/nn_policy.py (285)
  61. /ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (0)
  62. /ml-agents/mlagents/trainers/tf/models.py (0)
  63. /ml-agents/mlagents/trainers/tf/distributions.py (0)

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)


var t = go.transform;
return new Pose { rotation = t.rotation, position = t.position };
}
internal ArticulationBody[] Bodies => m_Bodies;
}
}
#endif // UNITY_2020_1_OR_NEWER

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new ArticulationBodyPoseExtractor(RootBody);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
numJointObservations += ArticulationBodyJointExtractor.NumObservations(
poseExtractor.Bodies[i], Settings
);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)


string m_SensorName;
PoseExtractor m_PoseExtractor;
IJointExtractor[] m_JointExtractors;
PhysicsSensorSettings m_Settings;
/// <summary>

/// <param name="sensorName"></param>
public PhysicsBodySensor(Rigidbody rootBody, GameObject rootGameObject, PhysicsSensorSettings settings, string sensorName=null)
{
m_PoseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
var poseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var rigidBodies = poseExtractor.Bodies;
if (rigidBodies != null)
{
m_JointExtractors = new IJointExtractor[rigidBodies.Length - 1]; // skip the root
for (var i = 1; i < rigidBodies.Length; i++)
{
var jointExtractor = new RigidBodyJointExtractor(rigidBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
m_PoseExtractor = new ArticulationBodyPoseExtractor(rootBody);
var poseExtractor = new ArticulationBodyPoseExtractor(rootBody);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var articBodies = poseExtractor.Bodies;
if (articBodies != null)
{
m_JointExtractors = new IJointExtractor[articBodies.Length - 1]; // skip the root
for (var i = 1; i < articBodies.Length; i++)
{
var jointExtractor = new ArticulationBodyJointExtractor(articBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
}
#endif

public int Write(ObservationWriter writer)
{
var numWritten = writer.WritePoses(m_Settings, m_PoseExtractor);
foreach (var jointExtractor in m_JointExtractors)
{
numWritten += jointExtractor.Write(m_Settings, writer, numWritten);
}
return numWritten;
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)


public bool UseLocalSpaceLinearVelocity;
/// <summary>
/// Whether to use joint-specific positions and angles as observations.
/// </summary>
public bool UseJointPositionsAndAngles;
/// <summary>
/// Whether to use the joint forces and torques that are applied by the solver as observations.
/// </summary>
public bool UseJointForces;
/// <summary>
/// Creates a PhysicsSensorSettings with reasonable default values.
/// </summary>
/// <returns></returns>

public bool UseLocalSpace
{
get { return UseLocalSpaceTranslations || UseLocalSpaceRotations || UseLocalSpaceLinearVelocity; }
}
/// <summary>
/// The number of floats needed to represent a given number of transforms.
/// </summary>
/// <param name="numTransforms"></param>
/// <returns></returns>
public int TransformSize(int numTransforms)
{
int obsPerTransform = 0;
obsPerTransform += UseModelSpaceTranslations ? 3 : 0;
obsPerTransform += UseModelSpaceRotations ? 4 : 0;
obsPerTransform += UseLocalSpaceTranslations ? 3 : 0;
obsPerTransform += UseLocalSpaceRotations ? 4 : 0;
obsPerTransform += UseModelSpaceLinearVelocity ? 3 : 0;
obsPerTransform += UseLocalSpaceLinearVelocity ? 3 : 0;
return numTransforms * obsPerTransform;
}
}
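
To make the size arithmetic in `TransformSize` (and the equivalent `GetNumPoseObservations` added in `PoseExtractor` below) concrete, here is a small Python restatement. It is only an illustration, not part of the C# package, and the dictionary keys are invented for the example:

```python
# Hypothetical Python restatement of the observation-size arithmetic above:
# each enabled translation or velocity option adds 3 floats per pose, each
# enabled rotation option adds 4 floats (a quaternion), and the total scales
# with the number of poses in the hierarchy.
def num_pose_observations(num_poses: int, settings: dict) -> int:
    obs_per_pose = 0
    obs_per_pose += 3 if settings.get("model_space_translations") else 0
    obs_per_pose += 4 if settings.get("model_space_rotations") else 0
    obs_per_pose += 3 if settings.get("local_space_translations") else 0
    obs_per_pose += 4 if settings.get("local_space_rotations") else 0
    obs_per_pose += 3 if settings.get("model_space_linear_velocity") else 0
    obs_per_pose += 3 if settings.get("local_space_linear_velocity") else 0
    return num_poses * obs_per_pose

# e.g. model-space translations + rotations for a 5-pose hierarchy: (3 + 4) * 5 = 35 floats
print(num_pose_observations(5, {"model_space_translations": True, "model_space_rotations": True}))
```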

com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)


}
}
/// <summary>
/// Compute the number of floats needed to represent the poses for the given PhysicsSensorSettings.
/// </summary>
/// <param name="settings"></param>
/// <returns></returns>
public int GetNumPoseObservations(PhysicsSensorSettings settings)
{
int obsPerPose = 0;
obsPerPose += settings.UseModelSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseModelSpaceRotations ? 4 : 0;
obsPerPose += settings.UseLocalSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseLocalSpaceRotations ? 4 : 0;
obsPerPose += settings.UseModelSpaceLinearVelocity ? 3 : 0;
obsPerPose += settings.UseLocalSpaceLinearVelocity ? 3 : 0;
return NumPoses * obsPerPose;
}
internal void DrawModelSpace(Vector3 offset)
{

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)


var body = m_Bodies[index];
return new Pose { rotation = body.rotation, position = body.position };
}
internal Rigidbody[] Bodies => m_Bodies;
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new RigidBodyPoseExtractor(RootBody, gameObject);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
var body = poseExtractor.Bodies[i];
var joint = body?.GetComponent<Joint>();
numJointObservations += RigidBodyJointExtractor.NumObservations(body, joint, Settings);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

var leafArticBody = leafGameObj.AddComponent<ArticulationBody>();
leafGameObj.transform.SetParent(middleGamObj.transform);
leafGameObj.transform.localPosition = new Vector3(4.2f, 0f, 0f);
leafArticBody.jointType = ArticulationJointType.RevoluteJoint;
leafArticBody.jointType = ArticulationJointType.PrismaticJoint;
leafArticBody.linearLockZ = ArticulationDofLock.LimitedMotion;
leafArticBody.zDrive = new ArticulationDrive
{
lowerLimit = -3,
upperLimit = 1
};
#if UNITY_2020_2_OR_NEWER
// ArticulationBody.velocity is read-only in 2020.1

#endif
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointForces = true,
UseJointPositionsAndAngles = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
// revolute
0f, 1f, // joint1.position (sin and cos)
0f, // joint1.force
// prismatic
0.5f, // joint2.position (interpolate between limits)
0f, // joint2.force
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

0f, -1f, 1f // Leaf vel
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointPositionsAndAngles = true,
UseJointForces = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
0f, 0f, 0f, // joint1.force
0f, 0f, 0f, // joint1.torque
0f, 0f, 0f, // joint2.force
0f, 0f, 0f, // joint2.torque
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}

com.unity.ml-agents/CHANGELOG.md (5)


### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into two stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
### Minor Changes
#### com.unity.ml-agents (C#)

docs/Getting-Started.md (2)


TensorBoard. From the command line run:
```sh
tensorboard --logdir=results
tensorboard --logdir results
```
Then navigate to `localhost:6006` in your browser to view the TensorBoard

docs/Training-Configuration-File.md (2)


| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
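
For reference, the Generalized Advantage Estimate that the `lambd` entry above parameterizes (from the linked GAE paper) is

$$
\hat{A}_t = \sum_{l=0}^{\infty} (\gamma\lambda)^l \delta_{t+l},
\qquad \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t),
$$

so `lambd = 0` reduces to the one-step TD advantage (lower variance, higher bias) and `lambd = 1` to the discounted return minus the value baseline (higher variance, lower bias).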
### SAC-specific Configurations

docs/Training-on-Microsoft-Azure.md (2)


2. Unless you started the training as a background process, connect to your VM
from another terminal instance.
3. Run the following command from your terminal
`tensorboard --logdir=summaries --host 0.0.0.0`
`tensorboard --logdir results --host 0.0.0.0`
4. You should now be able to open a browser and navigate to
`<Your_VM_IP_Address>:6060` to view the TensorBoard report.

docs/Using-Docker.md (4)


http://localhost:6006:
```sh
docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it <container-name> tensorboard --logdir /unity-volume/results --host 0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir /unity-volume/results --host 0.0.0.0
```
For more details on Tensorboard, check out the documentation about

docs/Using-Tensorboard.md (2)


1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=results --port=6006`
1. From the command line run: `tensorboard --logdir results --port 6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).

docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)


### Observing Training Progress
Once training with `learn.py` has started as described in the previous section, the `ml-agents` folder will
contain a `summaries` directory. To observe the training process in more detail,
contain a `results` directory. To observe the training process in more detail,
`tensorboard --logdir=summaries`
`tensorboard --logdir results`
then navigate to `localhost:6006`

ml-agents-envs/setup.py (2)


install_requires=[
"cloudpickle",
"grpcio>=1.11.0",
"numpy>=1.14.1,<2.0",
"numpy>=1.14.1,<1.19.0",
"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",

ml-agents/mlagents/model_serialization.py (2)


from tensorflow.python.framework import graph_util
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc
if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):
# ONNX is only tested on 1.12.0 and later

ml-agents/mlagents/trainers/agent_processor.py (5)


EnvironmentStats,
)
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)


"""
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor).
:param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to.
:param policy: The Policy object (e.g. TFPolicy) that this Reward Signal will apply to.
:param settings: Settings parameters for this Reward Signal, including gamma and strength.
:return: A RewardSignal object.
"""

ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)


from typing import List, Tuple
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)


from mlagents.tf_utils import tf
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
EPSILON = 1e-7

ml-agents/mlagents/trainers/env_manager.py (29)


)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.logging_util import get_logger

class EnvManager(ABC):
def __init__(self):
self.policies: Dict[BehaviorName, TFPolicy] = {}
self.policies: Dict[BehaviorName, Policy] = {}
self.first_step_infos: List[EnvironmentStep] = None
self.first_step_infos: List[EnvironmentStep] = []
def set_policy(self, brain_name: BehaviorName, policy: TFPolicy) -> None:
def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None:
self.policies[brain_name] = policy
if brain_name in self.agent_managers:
self.agent_managers[brain_name].policy = policy

def close(self):
pass
def advance(self):
def get_steps(self) -> List[EnvironmentStep]:
"""
Updates the policies, steps the environments, and returns the step information from the environments.
Calling code should pass the returned EnvironmentSteps to process_steps() after calling this.
:return: The list of EnvironmentSteps
"""
if self.first_step_infos is not None:
if self.first_step_infos:
self.first_step_infos = None
self.first_step_infos = []
for brain_name in self.training_behaviors:
for brain_name in self.agent_managers.keys():
_policy = None
try:
# We make sure to empty the policy queue before continuing to produce steps.

except AgentManagerQueue.Empty:
if _policy is not None:
self.set_policy(brain_name, _policy)
# Step the environment
# policy_queue contains Policy, but we need a TFPolicy here
self.set_policy(brain_name, _policy) # type: ignore
# Step the environments
return new_step_infos
def process_steps(self, new_step_infos: List[EnvironmentStep]) -> int:
# Add to AgentProcessor
num_step_infos = self._process_step_infos(new_step_infos)
return num_step_infos
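
To summarize the split shown above, here is a minimal sketch of the new two-stage calling pattern; the wrapper function name is made up, and `TrainerController` later in this diff does essentially the same thing:

```python
from mlagents.trainers.env_manager import EnvManager  # as modified above


def pump_environments(env_manager: EnvManager) -> int:
    # Stage 1: update policies from their queues, step the environments,
    # and collect the resulting EnvironmentSteps.
    new_step_infos = env_manager.get_steps()
    # The caller can inspect new_step_infos here, e.g. to register behavior
    # names it has not seen before, before any trajectories are processed.
    # Stage 2: hand the steps to the AgentManagers for trajectory assembly.
    return env_manager.process_steps(new_step_infos)
```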

ml-agents/mlagents/trainers/ghost/trainer.py (11)


# ## ML-Agent Learning (Ghost Trainer)
from collections import defaultdict
from typing import Deque, Dict, DefaultDict, List, cast
from typing import Deque, Dict, DefaultDict, List
import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory

for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
try:
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
policy = internal_policy_queue.get_nowait()
self.current_policy_snapshot[brain_name] = policy.get_weights()
except AgentManagerQueue.Empty:
pass

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
) -> Policy:
"""
Creates policy with the wrapped trainer's create_policy function
The first policy encountered sets the wrapped

return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to GhostTrainer.

self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
self.policies[name_behavior_id] = policy
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name

ml-agents/mlagents/trainers/optimizer/optimizer.py (3)


Provides methods to update the Policy.
"""
def __init__(self):
self.reward_signals = {}
@abc.abstractmethod
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)


class TFOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
super().__init__()
self.sess = policy.sess
self.policy = policy
self.update_dict: Dict[str, tf.Tensor] = {}

Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
# Create reward signals
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later

ml-agents/mlagents/trainers/policy/policy.py (166)


from abc import ABC, abstractmethod
from abc import abstractmethod
from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.exception import UnityException
from mlagents.model_serialization import SerializationSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
class Policy(ABC):
@abstractmethod
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Policy:
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
self.behavior_spec = behavior_spec
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self._keep_checkpoints = self.trainer_settings.keep_checkpoints
self.use_continuous_act = behavior_spec.is_action_continuous()
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None
self.load = load
self.h_size = self.network_settings.hidden_units
num_layers = self.network_settings.num_layers
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.m_size = 0
self.sequence_length = 1
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
raise NotImplementedError
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
pass
@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def get_current_step(self):
pass
@abstractmethod
def checkpoint(self, checkpoint_path: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def save(self, output_filepath: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def load_weights(self, values: List[np.ndarray]) -> None:
pass
@abstractmethod
def get_weights(self) -> List[np.ndarray]:
return []
@abstractmethod
def init_load_weights(self) -> None:
pass
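
For orientation, here is a minimal sketch (not part of this PR) of what a concrete subclass of the new `Policy` base class has to supply, based on the abstract methods listed above; the class name and the trivial method bodies are invented for illustration:

```python
from typing import List
import numpy as np

from mlagents.trainers.policy import Policy  # the base class shown above


class NoopPolicy(Policy):
    """Hypothetical do-nothing subclass; real subclasses (e.g. TFPolicy) build a model here."""

    def __init__(self, seed, behavior_spec, trainer_settings, model_path):
        super().__init__(seed, behavior_spec, trainer_settings, model_path)
        self._step = 0  # step counter stands in for a real training step variable

    def update_normalization(self, vector_obs: np.ndarray) -> None:
        pass  # no observation normalizer in this sketch

    def increment_step(self, n_steps):
        self._step += n_steps
        return self._step

    def get_current_step(self):
        return self._step

    def checkpoint(self, checkpoint_path: str, settings) -> None:
        pass  # nothing to write for a stateless policy

    def save(self, output_filepath: str, settings) -> None:
        pass

    def init_load_weights(self) -> None:
        pass

    def load_weights(self, values: List[np.ndarray]) -> None:
        pass

    def get_weights(self) -> List[np.ndarray]:
        return []
```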

ml-agents/mlagents/trainers/policy/tf_policy.py (334)


from typing import Any, Dict, List, Optional, Tuple
import abc
from mlagents_envs.timers import timed
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents.tf_utils import tf

from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers.tf.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
logger = get_logger(__name__)

# determines compatibility with inference in Barracuda.
MODEL_FORMAT_VERSION = 2
EPSILON = 1e-6 # Small value to avoid divide by zero
class UnityPolicyException(UnityException):
"""

trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Initializes the policy.

:param model_path: Where to load/save the model.
:param load: If True, load model from model_path. Otherwise, create new model.
"""
self.m_size = 0
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
super().__init__(
seed,
behavior_spec,
trainer_settings,
model_path,
load,
tanh_squash,
reparameterize,
condition_sigma_on_obs,
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.behavior_spec = behavior_spec
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.use_recurrent = self.network_settings.memory is not None
self.memory_dict: Dict[str, np.ndarray] = {}
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = self.network_settings.normalize
self.use_continuous_act = behavior_spec.is_action_continuous()
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints
self.seed = seed
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
self.load = load
self.grads = None
self.update_batch: Optional[tf.Operation] = None
self.trainable_variables: List[tf.Variable] = []
if create_tf_graph:
self.create_tf_graph()
@abc.abstractmethod
pass
return self.trainable_variables
@abc.abstractmethod
def create_tf_graph(self):
def create_tf_graph(self) -> None:
pass
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
encoded = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.num_layers,
self.vis_encode_type,
)
if self.use_continuous_act:
self._create_cc_actor(
encoded,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
)
else:
self._create_dc_actor(encoded)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output, self.vector_in)), axis=1
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
self.inference_dict = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self._initialize_graph()
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("policy"):
encoded = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
return encoded
@staticmethod
def _convert_version_string(version_string: str) -> Tuple[int, ...]:

def _initialize_graph(self):
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
logger.info(f"Loading model from {model_path}.")
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt is None:

feed_dict[assign_ph] = value
self.sess.run(self.assign_ops, feed_dict=feed_dict)
@timed
:param decision_requests: DecisionSteps input to network.
:return: Output from policy based on self.inference_dict.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
raise UnityPolicyException("The evaluate function was not implemented.")
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0

mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
feed_dict[self.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
def get_current_step(self):
"""

trainable=False,
dtype=tf.int32,
)
def _create_cc_actor(
self,
encoded: tf.Tensor,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed_dict a value into this tensor to get the
# right one-hot encoding, so we can't do identity on it.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs

ml-agents/mlagents/trainers/ppo/optimizer.py (2)


import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils, EncoderType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer

ml-agents/mlagents/trainers/ppo/trainer.py (21)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory

)
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""

:param behavior_spec: specifications for policy construction
:return policy
"""
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
model_path=self.artifact_path,
load=self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
self.optimizer = PPOOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/sac/network.py (4)


from typing import Dict, Optional
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import EncoderType
LOG_STD_MAX = 2
LOG_STD_MIN = -20

self._create_memory_ins(m_size)
hidden_critic = self._create_observation_in(vis_encode_type)
self.policy.output = self.policy.output
# Use the sequence length of the policy
self.sequence_length_ph = self.policy.sequence_length_ph

ml-agents/mlagents/trainers/sac/optimizer.py (6)


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer

# Non-exposed SAC parameters
self.discrete_target_entropy_scale = (
0.2
) # Roughly equal to e-greedy 0.05
0.2 # Roughly equal to e-greedy 0.05
)
self.continuous_target_entropy_scale = 1.0
stream_names = list(self.reward_signals.keys())

ml-agents/mlagents/trainers/sac/trainer.py (17)


from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations

self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
tanh_squash=True,

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
self.optimizer = SACOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

max(1, self.step / self.reward_signal_steps_per_update)
)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/settings.py (12)


from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser
from mlagents.trainers.cli_utils import load_config
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.models import ScheduleType, EncoderType
from mlagents_envs import logging_util
from mlagents_envs.side_channel.environment_parameters_channel import (

class ExportableSettings:
def as_dict(self):
return cattr.unstructure(self)
class EncoderType(Enum):
SIMPLE = "simple"
NATURE_CNN = "nature_cnn"
RESNET = "resnet"
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"
@attr.s(auto_attribs=True)

ml-agents/mlagents/trainers/stats.py (17)


from collections import defaultdict
from enum import Enum
from typing import List, Dict, NamedTuple, Any
from typing import List, Dict, NamedTuple, Any, Optional
import numpy as np
import abc
import csv

) -> None:
if property_type == StatsPropertyType.HYPERPARAMETERS:
assert isinstance(value, dict)
text = self._dict_to_tensorboard("Hyperparameters", value)
summary = self._dict_to_tensorboard("Hyperparameters", value)
self.summary_writers[category].add_summary(text, 0)
if summary is not None:
self.summary_writers[category].add_summary(summary, 0)
elif property_type == StatsPropertyType.SALIENCY:
self._maybe_create_summary_writer(category)

#with tf.Session(config=generate_session_config()) as sess:
# hist_op = tf.summary.histogram(category, value)
# hist = sess.run(hist_op)
def _dict_to_tensorboard(self, name: str, input_dict: Dict[str, Any]) -> str:
def _dict_to_tensorboard(
self, name: str, input_dict: Dict[str, Any]
) -> Optional[bytes]:
"""
Convert a dict to a Tensorboard-encoded string.
:param name: The name of the text.

s = sess.run(s_op)
return s
except Exception:
logger.warning("Could not write text summary for Tensorboard.")
return ""
logger.warning(
f"Could not write {name} summary for Tensorboard: {input_dict}"
)
return None
class CSVWriter(StatsWriter):

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (2)


import tempfile
import pytest
import mlagents.trainers.tensorflow_to_barracuda as tf2bc
import mlagents.trainers.tf.tensorflow_to_barracuda as tf2bc
from mlagents.trainers.tests.test_nn_policy import create_policy_mock
from mlagents.trainers.settings import TrainerSettings
from mlagents.tf_utils import tf

ml-agents/mlagents/trainers/tests/test_bcmodule.py (7)


import numpy as np
import os
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
TrainerSettings,

trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(
policy = TFPolicy(
False,
"test",
False,
tanhresample,

assert isinstance(item, np.float32)
old_learning_rate = bc_module.current_lr
stats = bc_module.update()
_ = bc_module.update()
assert old_learning_rate == bc_module.current_lr

ml-agents/mlagents/trainers/tests/test_distributions.py (2)


from mlagents.tf_utils import tf
from mlagents.trainers.distributions import (
from mlagents.trainers.tf.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)

ml-agents/mlagents/trainers/tests/test_models.py (2)


import pytest
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.tf_utils import tf
from mlagents_envs.base_env import BehaviorSpec, ActionType

ml-agents/mlagents/trainers/tests/test_nn_policy.py (17)


from mlagents.tf_utils import tf
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, ModelUtils, Tensor3DShape
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.tf.models import ModelUtils, Tensor3DShape
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.settings import TrainerSettings, NetworkSettings, EncoderType
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers import __version__

model_path: str = "",
load: bool = False,
seed: int = 0,
) -> NNPolicy:
) -> TFPolicy:
mock_spec = mb.setup_test_behavior_specs(
use_discrete,
use_visual,

trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(seed, mock_spec, trainer_settings, False, model_path, load)
policy = TFPolicy(
seed, mock_spec, trainer_settings, model_path=model_path, load=load
)
return policy

assert len(cm.output) == 1
def _compare_two_policies(policy1: NNPolicy, policy2: NNPolicy) -> None:
def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
"""
Make sure two policies have the same output for the same input.
"""

# Change half of the obs to 0
for i in range(3):
trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
policy = NNPolicy(
policy = TFPolicy(
False,
"testdir",
False,
)

ml-agents/mlagents/trainers/tests/test_ppo.py (15)


from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

if use_rnn
else None
)
policy = NNPolicy(
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

ppo_optimizer.return_value = mock_optimizer
trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
policy_mock = mock.Mock(spec=NNPolicy)
policy_mock = mock.Mock(spec=TFPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (
5 # 10 hacked because this function is no longer called through trainer

ppo_optimizer.return_value = mock_optimizer
trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy = mock.Mock(spec=TFPolicy)
policy.get_current_step.return_value = 2000
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)

# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(behavior_id, policy)
if __name__ == "__main__":

ml-agents/mlagents/trainers/tests/test_reward_signals.py (6)


import copy
import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG

if use_rnn
else None
)
policy = NNPolicy(
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)

ml-agents/mlagents/trainers/tests/test_sac.py (13)


from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs

if use_rnn
else None
)
policy = NNPolicy(
0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_brain, trainer_settings, "test", False, create_tf_graph=False
)
optimizer = SACOptimizer(policy, trainer_settings)
return optimizer

sac_optimizer.return_value = mock_optimizer
trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy = mock.Mock(spec=TFPolicy)
policy.get_current_step.return_value = 2000
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
trainer.add_policy(behavior_id, policy)

assert trainer.get_step == 2000
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(behavior_id, policy)
def test_advance(dummy_config):

ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)


GAILSettings,
TrainerType,
RewardSignalType,
EncoderType,
ScheduleType,
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)

ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)


}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict, env_stats)
step_mock.return_value = [step_info]
env_manager.advance()
env_manager.process_steps(env_manager.get_steps())
# Test add_experiences
env_manager._step.assert_called_once()

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (3)


tc.advance(env_mock)
env_mock.reset.assert_not_called()
env_mock.advance.assert_called_once()
env_mock.get_steps.assert_called_once()
env_mock.process_steps.assert_called_once()
# May have been called many times due to thread
trainer_mock.advance.call_count > 0

ml-agents/mlagents/trainers/trainer/rl_trainer.py (4)


)
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.components.reward_signals import RewardSignalResult

for agent_id in rewards:
rewards[agent_id] = 0
def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.stats_reporter.add_stat(

9
ml-agents/mlagents/trainers/trainer/trainer.py


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue

self.step: int = 0
self.artifact_path = artifact_path
self.summary_freq = self.trainer_settings.summary_freq
self.policies: Dict[str, TFPolicy] = {}
self.policies: Dict[str, Policy] = {}
@property
def stats_reporter(self):

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
) -> Policy:
"""
Creates policy
"""

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer.
"""

36
ml-agents/mlagents/trainers/trainer_controller.py


from mlagents.tf_utils import tf
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.exception import (
UnityEnvironmentException,
UnityCommunicationException,

self.train_model = train
self.param_manager = param_manager
self.ghost_controller = self.trainer_factory.ghost_controller
self.registered_behavior_ids: Set[str] = set()
self.trainer_threads: List[threading.Thread] = []
self.kill_trainers = False

)
@timed
def _reset_env(self, env: EnvManager) -> None:
def _reset_env(self, env_manager: EnvManager) -> None:
"""Resets the environment.
Returns:

new_config = self.param_manager.get_current_samplers()
env.reset(config=new_config)
env_manager.reset(config=new_config)
# Register any new behavior ids that were generated on the reset.
self._register_new_behaviors(env_manager, env_manager.first_step_infos)
def _not_done_training(self) -> bool:
return (

def start_learning(self, env_manager: EnvManager) -> None:
self._create_output_path(self.output_path)
tf.reset_default_graph()
last_brain_behavior_ids: Set[str] = set()
external_brain_behavior_ids = set(env_manager.training_behaviors.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
self._create_trainers_and_managers(env_manager, new_behavior_ids)
last_brain_behavior_ids = external_brain_behavior_ids
n_steps = self.advance(env_manager)
for _ in range(n_steps):
self.reset_env_if_ready(env_manager)

env.set_env_parameters(self.param_manager.get_current_samplers())
@timed
def advance(self, env: EnvManager) -> int:
def advance(self, env_manager: EnvManager) -> int:
num_steps = env.advance()
new_step_infos = env_manager.get_steps()
self._register_new_behaviors(env_manager, new_step_infos)
num_steps = env_manager.process_steps(new_step_infos)
# Report current lesson for each environment parameter
for (

trainer.advance()
return num_steps
def _register_new_behaviors(
self, env_manager: EnvManager, step_infos: List[EnvironmentStep]
) -> None:
"""
Handle registration (adding trainers and managers) of new behavior ids.
:param env_manager:
:param step_infos:
:return:
"""
step_behavior_ids: Set[str] = set()
for s in step_infos:
step_behavior_ids |= set(s.name_behavior_ids)
new_behavior_ids = step_behavior_ids - self.registered_behavior_ids
self._create_trainers_and_managers(env_manager, new_behavior_ids)
self.registered_behavior_ids |= step_behavior_ids
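Taken together, the _reset_env and advance hunks replace the single env.advance() call with a get_steps()/process_steps() pair, and new behaviors are registered both after a reset (via env_manager.first_step_infos) and from every batch of step infos. A minimal sketch of the resulting advance flow, using only names that appear in this diff:

from typing import List

from mlagents.trainers.env_manager import EnvManager, EnvironmentStep


def advance_sketch(controller, env_manager: EnvManager) -> int:
    # Collect the EnvironmentStep batches produced since the last call...
    new_step_infos: List[EnvironmentStep] = env_manager.get_steps()
    # ...register any behavior ids seen for the first time (the reset path does
    # the same with env_manager.first_step_infos)...
    controller._register_new_behaviors(env_manager, new_step_infos)
    # ...then process the steps and report how many were consumed.
    return env_manager.process_steps(new_step_infos)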
def join_threads(self, timeout_seconds: float = 1.0) -> None:
"""

2
ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py


)
# The following code can be used as an example of calling this API from another module.
# convert() is the main entry point for the converter.
import tensorflow_to_barracuda as tf2bc
import tf.tensorflow_to_barracuda as tf2bc
tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)
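A slightly fuller sketch of that standalone usage. The argparse wiring is illustrative; only the attribute names used in the convert() call above come from this file, and the package-qualified import path (mlagents.trainers.tf.tensorflow_to_barracuda) is assumed from the rename listed at the end of this diff:

import argparse

from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert a frozen TensorFlow graph to a Barracuda .nn file"
    )
    parser.add_argument("source_file")
    parser.add_argument("target_file")
    # Output node names used for trimming; empty means no trimming (assumed default).
    parser.add_argument("--trim-unused-by-output", dest="trim_unused_by_output", default="")
    args = parser.parse_args()
    tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)


if __name__ == "__main__":
    main()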

13
ml-agents/mlagents/trainers/tf/models.py


from enum import Enum
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException

height: int
width: int
num_channels: int
class EncoderType(Enum):
SIMPLE = "simple"
NATURE_CNN = "nature_cnn"
RESNET = "resnet"
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"
class NormalizerTensors(NamedTuple):
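The hunk above deletes the EncoderType and ScheduleType enums that used to live in models.py; they are now defined once in the settings module. A minimal sketch of the updated call-site import, grounded in the import line at the top of this hunk:

# EncoderType and ScheduleType are now defined in mlagents.trainers.settings
# instead of being duplicated inside tf/models.py.
from mlagents.trainers.settings import EncoderType, ScheduleType

encoder = EncoderType.SIMPLE    # other values: NATURE_CNN, RESNET
schedule = ScheduleType.LINEAR  # other value: CONSTANT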

2
ml-agents/mlagents/trainers/tf/distributions.py


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
EPSILON = 1e-6 # Small value to avoid divide by zero

147
com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs


#if UNITY_2020_1_OR_NEWER
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
public class ArticulationBodyJointExtractor : IJointExtractor
{
ArticulationBody m_Body;
public ArticulationBodyJointExtractor(ArticulationBody body)
{
m_Body = body;
}
public int NumObservations(PhysicsSensorSettings settings)
{
return NumObservations(m_Body, settings);
}
public static int NumObservations(ArticulationBody body, PhysicsSensorSettings settings)
{
if (body == null || body.isRoot)
{
return 0;
}
var totalCount = 0;
if (settings.UseJointPositionsAndAngles)
{
switch (body.jointType)
{
case ArticulationJointType.RevoluteJoint:
case ArticulationJointType.SphericalJoint:
// Both RevoluteJoint and SphericalJoint have all angular components.
// We use sine and cosine of the angles for the observations.
totalCount += 2 * body.dofCount;
break;
case ArticulationJointType.FixedJoint:
// Since a FixedJoint can't move, there aren't any interesting observations for it.
break;
case ArticulationJointType.PrismaticJoint:
// One linear component
totalCount += body.dofCount;
break;
}
}
if (settings.UseJointForces)
{
totalCount += body.dofCount;
}
return totalCount;
}
public int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset)
{
if (m_Body == null || m_Body.isRoot)
{
return 0;
}
var currentOffset = offset;
// Write joint positions
if (settings.UseJointPositionsAndAngles)
{
switch (m_Body.jointType)
{
case ArticulationJointType.RevoluteJoint:
case ArticulationJointType.SphericalJoint:
// All joint positions are angular
for (var dofIndex = 0; dofIndex < m_Body.dofCount; dofIndex++)
{
var jointRotationRads = m_Body.jointPosition[dofIndex];
writer[currentOffset++] = Mathf.Sin(jointRotationRads);
writer[currentOffset++] = Mathf.Cos(jointRotationRads);
}
break;
case ArticulationJointType.FixedJoint:
// No observations
break;
case ArticulationJointType.PrismaticJoint:
writer[currentOffset++] = GetPrismaticValue();
break;
}
}
if (settings.UseJointForces)
{
for (var dofIndex = 0; dofIndex < m_Body.dofCount; dofIndex++)
{
// take tanh to keep in [-1, 1]
writer[currentOffset++] = (float) System.Math.Tanh(m_Body.jointForce[dofIndex]);
}
}
return currentOffset - offset;
}
float GetPrismaticValue()
{
// Prismatic joints should have at most one free axis.
bool limited = false;
var drive = m_Body.xDrive;
if (m_Body.linearLockX == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.xDrive;
limited = true;
}
else if (m_Body.linearLockY == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.yDrive;
limited = true;
}
else if (m_Body.linearLockZ == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.zDrive;
limited = true;
}
var jointPos = m_Body.jointPosition[0];
if (limited)
{
// If the axis is limited, interpolate between the limits.
var upperLimit = drive.upperLimit;
var lowerLimit = drive.lowerLimit;
if (upperLimit <= lowerLimit)
{
// Invalid limits (probably equal), so don't try to lerp
return 0;
}
var invLerped = Mathf.InverseLerp(lowerLimit, upperLimit, jointPos);
// Convert [0, 1] -> [-1, 1]
var normalized = 2.0f * invLerped - 1.0f;
return normalized;
}
// take tanh() to keep in [-1, 1]
return (float) System.Math.Tanh(jointPos);
}
}
}
#endif
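GetPrismaticValue() above normalizes the joint position to [-1, 1]: a limited axis is inverse-lerped between its drive limits and rescaled from [0, 1] to [-1, 1], while an unlimited axis is squashed with tanh. A small Python sketch of that arithmetic (the names are illustrative, not part of the C# API):

import math


def normalize_prismatic(joint_pos: float, limited: bool, lower: float = 0.0, upper: float = 0.0) -> float:
    if limited:
        if upper <= lower:
            # Invalid (probably equal) limits, so don't try to lerp.
            return 0.0
        # Mathf.InverseLerp clamps to [0, 1]; then map [0, 1] -> [-1, 1].
        inv_lerped = min(max((joint_pos - lower) / (upper - lower), 0.0), 1.0)
        return 2.0 * inv_lerped - 1.0
    # Unlimited axis: take tanh to keep the value in [-1, 1].
    return math.tanh(joint_pos)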

11
com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs.meta


fileFormatVersion: 2
guid: 238d15f867b9c4ced9cef331b7420b27
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

27
com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs


using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
/// <summary>
/// Interface for generating observations from a physical joint or constraint.
/// </summary>
public interface IJointExtractor
{
/// <summary>
/// Determine the number of observations that would be generated for the particular joint
/// using the provided PhysicsSensorSettings.
/// </summary>
/// <param name="settings"></param>
/// <returns>Number of floats that will be written.</returns>
int NumObservations(PhysicsSensorSettings settings);
/// <summary>
/// Write the observations to the ObservationWriter, starting at the specified offset.
/// </summary>
/// <param name="settings"></param>
/// <param name="writer"></param>
/// <param name="offset"></param>
/// <returns>Number of floats that were written.</returns>
int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset);
}
}

11
com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs.meta


fileFormatVersion: 2
guid: 2d2a01ea194334a4682d5c8cad4a956b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

62
com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs


using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
public class RigidBodyJointExtractor : IJointExtractor
{
Rigidbody m_Body;
Joint m_Joint;
public RigidBodyJointExtractor(Rigidbody body)
{
m_Body = body;
m_Joint = m_Body?.GetComponent<Joint>();
}
public int NumObservations(PhysicsSensorSettings settings)
{
return NumObservations(m_Body, m_Joint, settings);
}
public static int NumObservations(Rigidbody body, Joint joint, PhysicsSensorSettings settings)
{
if(body == null || joint == null)
{
return 0;
}
var numObservations = 0;
if (settings.UseJointForces)
{
// 3 force and 3 torque values
numObservations += 6;
}
return numObservations;
}
public int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset)
{
if (m_Body == null || m_Joint == null)
{
return 0;
}
var currentOffset = offset;
if (settings.UseJointForces)
{
// Take tanh of the forces and torques to ensure they're in [-1, 1]
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.x);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.y);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.z);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.x);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.y);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.z);
}
return currentOffset - offset;
}
}
}

11
com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs.meta


fileFormatVersion: 2
guid: 5014d7ab95c6a44469f447b8a7019746
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

0
ml-agents/mlagents/trainers/tf/__init__.py

285
ml-agents/mlagents/trainers/policy/nn_policy.py


from typing import Any, Dict, Optional, List
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
EPSILON = 1e-6 # Small value to avoid divide by zero
class NNPolicy(TFPolicy):
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_params: TrainerSettings,
is_training: bool,
model_path: str,
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param behavior_spec: Assigned BehaviorSpec object.
:param trainer_params: Defined training parameters.
:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param model_path: Path where the model should be saved and loaded.
:param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
"""
super().__init__(seed, behavior_spec, trainer_params, model_path, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.trainable_variables: List[tf.Variable] = []
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
if create_tf_graph:
self.create_tf_graph()
def get_trainable_variables(self) -> List[tf.Variable]:
"""
Returns a list of the trainable variables in this policy. If create_tf_graph hasn't been called,
returns an empty list.
"""
return self.trainable_variables
def create_tf_graph(self) -> None:
"""
Builds the tensorflow graph needed for this policy.
"""
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
encoded = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.num_layers,
self.vis_encode_type,
)
if self.use_continuous_act:
self._create_cc_actor(
encoded,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output, self.vector_in)), axis=1
)
else:
self._create_dc_actor(encoded)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output_pre, self.vector_in)), axis=1
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
self.inference_dict: Dict[str, tf.Tensor] = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We initialize here to make the Policy usable out of the box. If an optimizer is needed,
# it will reload the full graph.
self._initialize_graph()
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("policy"):
encoded = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
return encoded
def _create_cc_actor(
self,
encoded: tf.Tensor,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep the same names for these tensors, but use new nodes to keep the code parallel with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed a value into this tensor via feed_dict to get the
# right one-hot encoding, so we can't wrap it in tf.identity.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs
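The saliency tensors added in create_tf_graph() above are the batch mean of the squared gradient of the action output (self.output for continuous control, self.output_pre for discrete) with respect to the vector observations. A minimal standalone sketch of that computation, mirroring the tf.gradients/tf.square/tf.reduce_mean pattern used above:

from mlagents.tf_utils import tf


def build_saliency(action_output: tf.Tensor, vector_in: tf.Tensor) -> tf.Tensor:
    # d(action)/d(vector observation), squared so positive and negative
    # contributions don't cancel, then averaged over the batch (axis=1, because
    # tf.gradients wraps the result in a single-element list).
    return tf.reduce_mean(tf.square(tf.gradients(action_output, vector_in)), axis=1)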

/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py → /ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py

/ml-agents/mlagents/trainers/models.py → /ml-agents/mlagents/trainers/tf/models.py

/ml-agents/mlagents/trainers/distributions.py → /ml-agents/mlagents/trainers/tf/distributions.py
