
Merge branch 'main' into develop-superpush-int

/develop/superpush/int
Ervin Teng, 4 years ago
Current commit: c8137dcd
122 changed files with 2291 additions and 1012 deletions
  1. 6
      .github/PULL_REQUEST_TEMPLATE.md
  2. 2
      .github/workflows/pre-commit.yml
  3. 2
      .github/workflows/publish_pypi.yaml
  4. 2
      .github/workflows/pytest.yml
  5. 2
      .yamato/com.unity.ml-agents-performance.yml
  6. 4
      .yamato/com.unity.ml-agents-test.yml
  7. 2
      .yamato/compressed-sensor-test.yml
  8. 2
      .yamato/gym-interface-test.yml
  9. 2
      .yamato/protobuf-generation-test.yml
  10. 2
      .yamato/pytest-gpu.yml
  11. 2
      .yamato/python-ll-api-test.yml
  12. 2
      .yamato/standalone-build-test.yml
  13. 2
      .yamato/standalone-build-webgl-test.yml
  14. 2
      .yamato/training-backcompat-tests.yml
  15. 2
      .yamato/training-int-tests.yml
  16. 4
      ML-Agents-Input-Example/Packages/packages-lock.json
  17. 68
      Project/Assets/ML-Agents/Examples/FoodCollector/Scenes/GridFoodCollector.unity
  18. 2
      Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VisualObs.prefab
  19. 1001
      Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VectorObs.onnx
  20. 5
      Project/Assets/ML-Agents/Examples/Sorter/Prefabs/Area.prefab
  21. 27
      README.md
  22. 16
      com.unity.ml-agents.extensions/Documentation~/Grid-Sensor.md
  23. 2
      com.unity.ml-agents.extensions/Documentation~/Match3.md
  24. 9
      com.unity.ml-agents.extensions/Documentation~/com.unity.ml-agents.extensions.md
  25. 12
      com.unity.ml-agents.extensions/Runtime/Input/InputActionActuator.cs
  26. 6
      com.unity.ml-agents.extensions/Runtime/Input/InputActuatorComponent.cs
  27. 9
      com.unity.ml-agents.extensions/Runtime/Match3/Match3Actuator.cs
  28. 1
      com.unity.ml-agents.extensions/Runtime/Match3/Match3ActuatorComponent.cs
  29. 2
      com.unity.ml-agents.extensions/Runtime/Match3/Match3SensorComponent.cs
  30. 8
      com.unity.ml-agents.extensions/Runtime/Sensors/CountingGridSensor.cs
  31. 266
      com.unity.ml-agents.extensions/Runtime/Sensors/GridSensor.cs
  32. 4
      com.unity.ml-agents.extensions/Runtime/Unity.ML-Agents.Extensions.asmdef
  33. 2
      com.unity.ml-agents.extensions/Tests/Runtime/Input/Unity.ML-Agents.Extensions.Input.Tests.Runtime.asmdef
  34. 2
      com.unity.ml-agents.extensions/package.json
  35. 15
      com.unity.ml-agents/CHANGELOG.md
  36. 4
      com.unity.ml-agents/CONTRIBUTING.md
  37. 4
      com.unity.ml-agents/Documentation~/com.unity.ml-agents.md
  38. 4
      com.unity.ml-agents/Runtime/Academy.cs
  39. 1
      com.unity.ml-agents/Runtime/Actuators/ActionSpec.cs
  40. 2
      com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs
  41. 2
      com.unity.ml-agents/Runtime/Actuators/IDiscreteActionMask.cs
  42. 8
      com.unity.ml-agents/Runtime/Actuators/VectorActuator.cs
  43. 47
      com.unity.ml-agents/Runtime/Agent.cs
  44. 31
      com.unity.ml-agents/Runtime/Analytics/Events.cs
  45. 18
      com.unity.ml-agents/Runtime/Analytics/InferenceAnalytics.cs
  46. 17
      com.unity.ml-agents/Runtime/Analytics/TrainingAnalytics.cs
  47. 9
      com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs
  48. 3
      com.unity.ml-agents/Runtime/Constants.cs
  49. 2
      com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs
  50. 2
      com.unity.ml-agents/Runtime/DiscreteActionMasker.cs
  51. 8
      com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs
  52. 4
      com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs
  53. 20
      com.unity.ml-agents/Runtime/Inference/TensorNames.cs
  54. 10
      com.unity.ml-agents/Runtime/Policies/BarracudaPolicy.cs
  55. 6
      com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs
  56. 10
      com.unity.ml-agents/Runtime/Policies/RemotePolicy.cs
  57. 8
      com.unity.ml-agents/Runtime/Sensors/BufferSensor.cs
  58. 31
      com.unity.ml-agents/Runtime/Sensors/BufferSensorComponent.cs
  59. 38
      com.unity.ml-agents/Runtime/Sensors/IBuiltInSensor.cs
  60. 6
      com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs
  61. 5
      com.unity.ml-agents/Runtime/SideChannels/TrainingAnalyticsSideChannel.cs
  62. 18
      com.unity.ml-agents/Tests/Editor/Analytics/InferenceAnalyticsTests.cs
  63. 38
      com.unity.ml-agents/Tests/Editor/Analytics/TrainingAnalyticsTest.cs
  64. 5
      com.unity.ml-agents/Tests/Editor/Sensor/BufferSensorTest.cs
  65. 86
      config/ppo/Match3.yaml
  66. 1
      config/ppo/PyramidsRND.yaml
  67. 6
      docs/Installation-Anaconda-Windows.md
  68. 8
      docs/Installation.md
  69. 4
      docs/Learning-Environment-Examples.md
  70. 25
      docs/ML-Agents-Overview.md
  71. 2
      docs/Migrating.md
  72. 2
      docs/Training-on-Amazon-Web-Service.md
  73. 4
      docs/Unity-Inference-Engine.md
  74. 2
      docs/localized/KR/docs/Installation-Anaconda-Windows.md
  75. 6
      docs/localized/RU/README.md
  76. 30
      docs/localized/RU/docs/Начало работы.md
  77. 32
      docs/localized/RU/docs/Установка.md
  78. 11
      ml-agents-envs/mlagents_envs/base_env.py
  79. 1
      ml-agents-envs/mlagents_envs/rpc_utils.py
  80. 2
      ml-agents-envs/mlagents_envs/tests/test_envs.py
  81. 11
      ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
  82. 12
      ml-agents/mlagents/trainers/action_info.py
  83. 259
      ml-agents/mlagents/trainers/agent_processor.py
  84. 49
      ml-agents/mlagents/trainers/behavior_id_utils.py
  85. 134
      ml-agents/mlagents/trainers/buffer.py
  86. 14
      ml-agents/mlagents/trainers/ghost/trainer.py
  87. 156
      ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
  88. 28
      ml-agents/mlagents/trainers/policy/policy.py
  89. 67
      ml-agents/mlagents/trainers/policy/torch_policy.py
  90. 42
      ml-agents/mlagents/trainers/ppo/optimizer_torch.py
  91. 7
      ml-agents/mlagents/trainers/ppo/trainer.py
  92. 96
      ml-agents/mlagents/trainers/sac/optimizer_torch.py
  93. 9
      ml-agents/mlagents/trainers/sac/trainer.py
  94. 20
      ml-agents/mlagents/trainers/tests/mock_brain.py
  95. 157
      ml-agents/mlagents/trainers/tests/test_agent_processor.py
  96. 77
      ml-agents/mlagents/trainers/tests/test_buffer.py
  97. 54
      ml-agents/mlagents/trainers/tests/test_trajectory.py
  98. 4
      ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py
  99. 23
      ml-agents/mlagents/trainers/tests/torch/test_networks.py
  100. 4
      ml-agents/mlagents/trainers/tests/torch/test_policy.py

6
.github/PULL_REQUEST_TEMPLATE.md


### Checklist
- [ ] Added tests that prove my fix is effective or that my feature works
- [ ] Updated the [changelog](https://github.com/Unity-Technologies/ml-agents/blob/master/com.unity.ml-agents/CHANGELOG.md) (if applicable)
- [ ] Updated the [documentation](https://github.com/Unity-Technologies/ml-agents/tree/master/docs) (if applicable)
- [ ] Updated the [migration guide](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Migrating.md) (if applicable)
- [ ] Updated the [changelog](https://github.com/Unity-Technologies/ml-agents/blob/main/com.unity.ml-agents/CHANGELOG.md) (if applicable)
- [ ] Updated the [documentation](https://github.com/Unity-Technologies/ml-agents/tree/main/docs) (if applicable)
- [ ] Updated the [migration guide](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Migrating.md) (if applicable)
### Other comments

2
.github/workflows/pre-commit.yml


on:
pull_request:
push:
branches: [master]
branches: [main]
jobs:
pre-commit:

2
.github/workflows/publish_pypi.yaml


package-path: [ml-agents, ml-agents-envs, gym-unity]
steps:
- uses: actions/checkout@master
- uses: actions/checkout@main
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:

2
.github/workflows/pytest.yml


- 'test_requirements.txt'
- '.github/workflows/pytest.yml'
push:
branches: [master]
branches: [main]
jobs:
pytest:

2
.yamato/com.unity.ml-agents-performance.yml


triggers:
cancel_old_ci: true
recurring:
- branch: master
- branch: main
frequency: daily
artifacts:
logs:

4
.yamato/com.unity.ml-agents-test.yml


triggers:
cancel_old_ci: true
recurring:
- branch: master
- branch: main
frequency: daily
{% for package in packages %}

cancel_old_ci: true
{% if platform.name == "linux" %}
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

2
.yamato/compressed-sensor-test.yml


cancel_old_ci: true
{% if editor.extra_test == "sensor" %}
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

2
.yamato/gym-interface-test.yml


cancel_old_ci: true
{% if editor.extra_test == "gym" %}
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

2
.yamato/protobuf-generation-test.yml


triggers:
cancel_old_ci: true
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "protobuf-definitions/**" OR

2
.yamato/pytest-gpu.yml


triggers:
cancel_old_ci: true
recurring:
- branch: master
- branch: main
frequency: daily
artifacts:
logs:

2
.yamato/python-ll-api-test.yml


cancel_old_ci: true
{% if editor.extra_test == "llapi" %}
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

2
.yamato/standalone-build-test.yml


triggers:
cancel_old_ci: true
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

2
.yamato/standalone-build-webgl-test.yml


triggers:
cancel_old_ci: true
recurring:
- branch: master
- branch: main
frequency: weekly
artifacts:
logs:

2
.yamato/training-backcompat-tests.yml


triggers:
cancel_old_ci: true
recurring:
- branch: master
- branch: main
frequency: daily
artifacts:
logs:

2
.yamato/training-int-tests.yml


triggers:
cancel_old_ci: true
expression: |
(pull_request.target eq "master" OR
(pull_request.target eq "main" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR

4
ML-Agents-Input-Example/Packages/packages-lock.json


"url": "https://packages.unity.com"
},
"com.unity.barracuda": {
"version": "1.3.0-preview",
"version": "1.3.1-preview",
"depth": 1,
"source": "registry",
"dependencies": {

"depth": 0,
"source": "local",
"dependencies": {
"com.unity.barracuda": "1.3.0-preview",
"com.unity.barracuda": "1.3.1-preview",
"com.unity.modules.imageconversion": "1.0.0",
"com.unity.modules.jsonserialize": "1.0.0",
"com.unity.modules.physics": "1.0.0",

68
Project/Assets/ML-Agents/Examples/FoodCollector/Scenes/GridFoodCollector.unity


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.4497121, g: 0.49977785, b: 0.57563704, a: 1}
m_IndirectSpecularColor: {r: 0.44971168, g: 0.4997775, b: 0.57563686, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:

propertyPath: m_Name
value: GridFoodCollectorArea
objectReference: {fileID: 0}
- target: {fileID: 4137908820211030, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.x
value: -17.2
objectReference: {fileID: 0}
- target: {fileID: 4259834826122778, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.x
value: -23.9
objectReference: {fileID: 0}
- target: {fileID: 4419274671784554, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.x
value: -8.9
objectReference: {fileID: 0}
- target: {fileID: 4688212428263696, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.x
value: 0

propertyPath: m_LocalEulerAnglesHint.z
value: 0
objectReference: {fileID: 0}
- target: {fileID: 4756368533889646, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.x
value: -30.4
objectReference: {fileID: 0}
- target: {fileID: 4756368533889646, guid: b5339e4b990ade14f992aadf3bf8591b, type: 3}
propertyPath: m_LocalPosition.z
value: -9.9
objectReference: {fileID: 0}
- target: {fileID: 3067525015186813280, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: NumCollidersPerCell
value: 1
objectReference: {fileID: 0}
- target: {fileID: 3067525015186813280, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: EstimatedMaxCollidersPerCell
value: 4
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: ChannelOffsets.Array.size
value: 1
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: ShowGizmos
value: 0
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: ObservationPerCell
value: 6
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: NumberOfObservations
value: 9600
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: m_Enabled
value: 1
objectReference: {fileID: 0}
- target: {fileID: 5837508007780682603, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
propertyPath: rootReference
value:
objectReference: {fileID: 190823801}
--- !u!1 &190823801 stripped
GameObject:
m_CorrespondingSourceObject: {fileID: 1706274796045088, guid: b5339e4b990ade14f992aadf3bf8591b,
type: 3}
m_PrefabInstance: {fileID: 190823800}
m_PrefabAsset: {fileID: 0}
--- !u!1001 &392794583
PrefabInstance:
m_ObjectHideFlags: 0

2
Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VisualObs.prefab


VectorActionDescriptions: []
VectorActionSpaceType: 0
hasUpgradedBrainParametersWithActionSpec: 1
m_Model: {fileID: 11400000, guid: 48d14da88fea74d0693c691c6e3f2e34, type: 3}
m_Model: {fileID: 11400000, guid: 28ccdfd7cb3d941ce8af0ab89e06130a, type: 3}
m_InferenceDevice: 2
m_BehaviorType: 0
m_BehaviorName: Match3VisualObs

1001
Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VectorObs.onnx
File diff suppressed because it is too large

5
Project/Assets/ML-Agents/Examples/Sorter/Prefabs/Area.prefab


m_Script: {fileID: 11500000, guid: dd8012d5925524537b27131fef517017, type: 3}
m_Name:
m_EditorClassIdentifier:
ObservableSize: 23
MaxNumObservables: 20
m_SensorName: BufferSensor
m_ObservableSize: 23
m_MaxNumObservables: 20
--- !u!1 &6000518840957865293
GameObject:
m_ObjectHideFlags: 0

27
README.md


# Unity ML-Agents Toolkit
[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/)
[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/)
[![license badge](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE)

## Features
- 15+ [example Unity environments](docs/Learning-Environment-Examples.md)
- 18+ [example Unity environments](docs/Learning-Environment-Examples.md)
- Built-in support for Imitation Learning through Behavioral Cloning or
Generative Adversarial Imitation Learning
- Built-in support for Imitation Learning through Behavioral Cloning (BC) or
Generative Adversarial Imitation Learning (GAIL)
- Self-play mechanism for training agents in adversarial scenarios
- Easily definable Curriculum Learning scenarios for complex tasks
- Train robust agents using environment randomization

## Releases & Documentation
**Our latest, stable release is `Release 12`. Click
[here](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/Readme.md)
**Our latest, stable release is `Release 13`. Click
[here](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/Readme.md)
The table below lists all our releases, including our `master` branch which is
The table below lists all our releases, including our `main` branch which is
under active development and may be unstable. A few helpful guidelines:
- The [Versioning page](docs/Versioning.md) overviews how we manage our GitHub
releases and the versioning process for each of the ML-Agents components.

| **Version** | **Release Date** | **Source** | **Documentation** | **Download** | **Python Package** | **Unity Package** |
|:-------:|:------:|:-------------:|:-------:|:------------:|:------------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) | -- | -- |
| **Release 12** | **December 22, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/release_12)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/release_12.zip)** | **[0.23.0](https://pypi.org/project/mlagents/0.23.0/)** | **[1.7.2](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.7/manual/index.html)** |
| **main (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/main) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/main/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/main.zip) | -- | -- |
| **Release 13** | **February 17, 2021** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/release_13)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/release_13.zip)** | **[0.24.0](https://pypi.org/project/mlagents/0.24.0/)** | **[1.8.0](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.8/manual/index.html)** |
| **Release 12** | December 22, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_12) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_12.zip) | [0.23.0](https://pypi.org/project/mlagents/0.23.0/) | [1.7.2](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.7/manual/index.html) |
| **Release 11** | December 21, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_11) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_11_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_11.zip) | [0.23.0](https://pypi.org/project/mlagents/0.23.0/) | [1.7.0](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.7/manual/index.html) |
| **Release 10** | November 18, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_10) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_10_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_10.zip) | [0.22.0](https://pypi.org/project/mlagents/0.22.0/) | [1.6.0](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.6/manual/index.html) |
| **Verified Package 1.0.6** | **November 16, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/com.unity.ml-agents_1.0.6)** | **[docs](https://github.com/Unity-Technologies/ml-agents/blob/release_2_verified_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/com.unity.ml-agents_1.0.6.zip)** | **[0.16.1](https://pypi.org/project/mlagents/0.16.1/)** | **[1.0.6](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.0/manual/index.html)** |

| **Release 7** | September 16, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_7) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_7_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_7.zip) | [0.20.0](https://pypi.org/project/mlagents/0.20.0/) | [1.4.0](https://docs.unity3d.com/Packages/com.unity.ml-agents@1.4/manual/index.html) |
If you are a researcher interested in a discussion of Unity as an AI platform,
see a pre-print of our

([multi-armed bandit](https://blogs.unity3d.com/2017/06/26/unity-ai-themed-blog-entries/)
and
[Q-learning](https://blogs.unity3d.com/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/))
### More from Unity
- [Unity Robotics](https://github.com/Unity-Technologies/Unity-Robotics-Hub)
- [Unity Computer Vision](https://unity.com/computer-vision)
- [Unity Game Simulation](https://unity.com/products/game-simulation)
## Community and Feedback

16
com.unity.ml-agents.extensions/Documentation~/Grid-Sensor.md


# Contribution
An image can be thought of as a matrix of a predefined width (W) and a height (H) and each pixel can be thought of as simply an array of length 3 (in the case of RGB), `[Red, Green, Blue]` holding the different channel information of the color (channel) intensities at that pixel location. Thus an image is just a 3 dimensional matrix of size WxHx3. A Grid Observation can be thought of as a generalization of this setup where in place of a pixel there is a "cell" which is an array of length N representing different channel intensities at that cell position. From a Convolutional Neural Network point of view, the introduction of multiple channels in an "image" isn't a new concept. One such example is using an RGB-Depth image which is used in several robotics applications. The distinction of Grid Observations is what the data within the channels represents. Instead of limiting the channels to color intensities, the channels within a cell of a Grid Observation generalize to any data that can be represented by a single number (float or int).
Before jumping into the details of the Grid Sensor, it is important to note the agent's performance and qualitatively different behavior compared to raycasts. Unity ML-Agents comes with a suite of example environments. One in particular, the [Food Collector](https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Examples.md#food-collector), has been the focus of the Grid Sensor development.
The Food Collector environment can be described as:
* Set-up: A multi-agent environment where agents compete to collect food.
* Goal: The agents must learn to collect as many green food spheres as possible while avoiding red spheres.
* Agents: The environment contains 5 agents with the same Behavior Parameters.
When applying the Grid Sensor to this environment, in place of the Raycast Vector Sensor or the Camera Sensor, a Mean Reward of 40-50 is observed. This performance is on par with what is seen by agents trained with RayCasts, but a side-by-side comparison of trained agents shows a qualitative difference in behavior. A deeper study and interpretation of the qualitative differences between agents trained with Raycasts and Vector Sensors versus Grid Sensors is left to future studies.
<img src="images/gridobs-vs-vectorobs.gif" align="middle" width="3000"/>
## Overview
There are three main phases to the observation process of the Grid Sensor:

### Channel Based
The Channel Based Grid Observations represent observations in a normalized form between 0 and 1. To distinguish between categorical and continuous data, one would use the ChannelDepth array to signify the ranges that the values in the `channelValues` array could take. If one sets ChannelDepth[i] to be 1, it is assumed that the value of `channelValues[i]` is already normalized. Else ChannelDepth[i] represents the total number of possible values that `channelValues[i]` can take and will be used for normalization.
The Channel Based Grid Observations are perhaps the simplest in terms of usability and similarity with other machine learning applications. Each grid is of size WxHxC where C is the number of channels. To distinguish between categorical and continuous data, one would use the ChannelDepth array to signify the ranges that the values in the `channelValues` array could take. If one sets ChannelDepth[i] to be 1, it is assumed that the value of `channelValues[i]` is already normalized. Else ChannelDepth[i] represents the total number of possible values that `channelValues[i]` can take.
As the "enemy" is in the second position of the observed tags, its value can be normalized by:
For ObjectType, "weapon", "enemy" will be represented respectively as:
```
weapon = DetectableObjects.IndexOfTag("weapon")/ChannelDepth[0] = 1/2 = 0.5;

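To make the ChannelDepth normalization described above concrete, here is a small standalone C# sketch. It is an illustration only, not the shipped GridSensor API; the DetectableObjects tags and ChannelDepth value are taken from the example above, and the 1-based type index follows the `1/2 = 0.5` arithmetic shown there.

```csharp
using System;

// Illustration of the channel-based normalization described in Grid-Sensor.md.
// Assumes DetectableObjects = { "weapon", "enemy" } and ChannelDepth[0] = 2;
// type indices are 1-based so that 0 can mean "no detectable object in this cell".
static class ChannelNormalizationExample
{
    static readonly string[] DetectableObjects = { "weapon", "enemy" };
    static readonly int[] ChannelDepth = { 2 };

    static float NormalizedTagValue(string tag)
    {
        var typeIndex = Array.IndexOf(DetectableObjects, tag) + 1; // 1-based type index
        return (float)typeIndex / ChannelDepth[0];                 // normalize by the channel depth
    }

    static void Main()
    {
        Console.WriteLine(NormalizedTagValue("weapon")); // 1/2 = 0.5
        Console.WriteLine(NormalizedTagValue("enemy"));  // 2/2 = 1.0
    }
}
```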
2
com.unity.ml-agents.extensions/Documentation~/Match3.md


This implementation includes:
* C# implementation catered toward a Match-3 setup including concepts around encoding for moves based on [Human Like Playtesting with Deep Learning](https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning)
* An example Match-3 scene with ML-Agents implemented (located under /Project/Assets/ML-Agents/Examples/Match3). More information on the Match-3 example [here](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/docs/Learning-Environment-Examples.md#match-3).
* An example Match-3 scene with ML-Agents implemented (located under /Project/Assets/ML-Agents/Examples/Match3). More information on the Match-3 example [here](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/docs/Learning-Environment-Examples.md#match-3).
### Feedback
If you are a Match-3 developer and are trying to leverage ML-Agents for this scenario, [we want to hear from you](https://forms.gle/TBsB9jc8WshgzViU9). Additionally, we are also looking for interested Match-3 teams to speak with us for 45 minutes. If you are interested, please indicate that in the [form](https://forms.gle/TBsB9jc8WshgzViU9). If selected, we will provide gift cards as a token of appreciation.

9
com.unity.ml-agents.extensions/Documentation~/com.unity.ml-agents.extensions.md


recommended ways to install the package:
### Local Installation
[Clone the repository](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/Installation.md#clone-the-ml-agents-toolkit-repository-optional) and follow the
[Local Installation for Development](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/Installation.md#advanced-local-installation-for-development-1)
[Clone the repository](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/Installation.md#clone-the-ml-agents-toolkit-repository-optional) and follow the
[Local Installation for Development](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/Installation.md#advanced-local-installation-for-development-1)
![Package Manager git URL](https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/images/unity_package_manager_git_url.png)
![Package Manager git URL](https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/images/unity_package_manager_git_url.png)
In the dialog that appears, enter
```
git+https://github.com/Unity-Technologies/ml-agents.git?path=com.unity.ml-agents.extensions

- No way to customize the action space of the `InputActuatorComponent`
## Need Help?
The main [README](https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/README.md) contains links for contacting the team or getting support.
The main [README](https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/README.md) contains links for contacting the team or getting support.

12
com.unity.ml-agents.extensions/Runtime/Input/InputActionActuator.cs


/// <see cref="Agent"/>'s <see cref="BehaviorParameters"/> indicate that the Agent is running in Heuristic Mode,
/// this Actuator will write actions from the <see cref="InputSystem"/> to the <see cref="ActionBuffers"/> object.
/// </summary>
public class InputActionActuator : IActuator, IHeuristicProvider
public class InputActionActuator : IActuator, IHeuristicProvider, IBuiltInActuator
{
readonly BehaviorParameters m_BehaviorParameters;
readonly InputAction m_Action;

/// <param name="adaptor">The <see cref="IRLActionInputAdaptor"/> that will convert data between ML-Agents
/// and the <see cref="InputSystem"/>.</param>
public InputActionActuator(InputDevice inputDevice, BehaviorParameters behaviorParameters,
InputAction action,
IRLActionInputAdaptor adaptor)
InputAction action,
IRLActionInputAdaptor adaptor)
{
m_BehaviorParameters = behaviorParameters;
Name = $"InputActionActuator-{action.name}";

Profiler.BeginSample("InputActionActuator.Heuristic");
m_InputAdaptor.WriteToHeuristic(m_Action, actionBuffersOut);
Profiler.EndSample();
}
/// <inheritdoc/>
public BuiltInActuatorType GetBuiltInActuatorType()
{
return BuiltInActuatorType.InputActionActuator;
}
}
}

6
com.unity.ml-agents.extensions/Runtime/Input/InputActuatorComponent.cs


/// <see cref="InputActionActuator"/>s.
/// </summary>
[RequireComponent(typeof(PlayerInput), typeof(IInputActionAssetProvider))]
[AddComponentMenu("ML Agents/Input Actuator", (int)MenuGroup.Actuators)]
public class InputActuatorComponent : ActuatorComponent
{
InputActionAsset m_InputAsset;

}
var inputControlScheme = new InputControlScheme(
mlAgentsControlSchemeName,
mlAgentsControlSchemeName,
deviceRequirements);
return inputControlScheme;

var builder = new InputControlLayout.Builder()
.WithName(layoutName)
.WithFormat(mlAgentsLayoutFormat);
for(var i = 0; i < defaultMap.actions.Count; i++)
for (var i = 0; i < defaultMap.actions.Count; i++)
{
var action = defaultMap.actions[i];
builder.AddControl(action.name)

}, layoutName);
}
}

9
com.unity.ml-agents.extensions/Runtime/Match3/Match3Actuator.cs


/// Actuator for a Match3 game. It translates valid moves (defined by AbstractBoard.IsMoveValid())
/// in action masks, and applies the action to the board via AbstractBoard.MakeMove().
/// </summary>
public class Match3Actuator : IActuator, IHeuristicProvider
public class Match3Actuator : IActuator, IHeuristicProvider, IBuiltInActuator
{
protected AbstractBoard m_Board;
protected System.Random m_Random;

/// <inheritdoc/>
public void ResetData()
{
}
/// <inheritdoc/>
public BuiltInActuatorType GetBuiltInActuatorType()
{
return BuiltInActuatorType.Match3Actuator;
}
IEnumerable<int> InvalidMoveIndices()

{
return 1;
}
}
}

1
com.unity.ml-agents.extensions/Runtime/Match3/Match3ActuatorComponent.cs


/// <summary>
/// Actuator component for a Match3 game. Generates a Match3Actuator at runtime.
/// </summary>
[AddComponentMenu("ML Agents/Match 3 Actuator", (int)MenuGroup.Actuators)]
public class Match3ActuatorComponent : ActuatorComponent
{
/// <summary>

2
com.unity.ml-agents.extensions/Runtime/Match3/Match3SensorComponent.cs


using Unity.MLAgents.Sensors;
using UnityEngine;
namespace Unity.MLAgents.Extensions.Match3
{

[AddComponentMenu("ML Agents/Match 3 Sensor", (int)MenuGroup.Sensors)]
public class Match3SensorComponent : SensorComponent
{
/// <summary>

8
com.unity.ml-agents.extensions/Runtime/Sensors/CountingGridSensor.cs


this.ChannelDepth = channelDepth;
if (DetectableObjects.Length != ChannelDepth.Length)
throw new UnityAgentsException("The channels of a CountingGridSensor is equal to the number of detectableObjects");
this.gridDepthType = GridDepthType.Channel;
this.gridDepthType = gridDepthType;
this.CellScaleX = cellScaleX;
this.CellScaleZ = cellScaleZ;
this.GridNumSideX = gridWidth;

/// <param name="foundColliders">The array of colliders</param>
/// <param name="cellIndex">The cell index the collider is in</param>
/// <param name="cellCenter">the center of the cell the collider is in</param>
protected override void ParseColliders(Collider[] foundColliders, int cellIndex, Vector3 cellCenter)
protected override void ParseColliders(Collider[] foundColliders, int numFound, int cellIndex, Vector3 cellCenter)
for (int i = 0; i < foundColliders.Length; i++)
for (int i = 0; i < numFound; i++)
{
currentColliderGo = foundColliders[i].gameObject;

closestColliderPoint = foundColliders[i].ClosestPointOnBounds(cellCenter);
LoadObjectData(currentColliderGo, cellIndex,
Vector3.Distance(closestColliderPoint, transform.position) / SphereRadius);
Vector3.Distance(closestColliderPoint, transform.position) * InverseSphereRadius);
}
}

266
com.unity.ml-agents.extensions/Runtime/Sensors/GridSensor.cs


using UnityEngine;
using UnityEngine.Assertions;
using Unity.MLAgents.Sensors;
using UnityEngine.Profiling;
namespace Unity.MLAgents.Extensions.Sensors
{

[AddComponentMenu("ML Agents/Grid Sensor", (int)MenuGroup.Sensors)]
public class GridSensor : SensorComponent, ISensor, IBuiltInSensor
{
/// <summary>

[Tooltip("The reference of the root of the agent. This is used to disambiguate objects with the same tag as the agent. Defaults to current GameObject")]
public GameObject rootReference;
[Header("Collider Buffer Properties")]
[Tooltip("The absolute max size of the Collider buffer used in the non-allocating Physics calls. In other words" +
" the Collider buffer will never grow beyond this number even if there are more Colliders in the Grid Cell.")]
public int MaxColliderBufferSize = 500;
[Tooltip(
"The Estimated Max Number of Colliders to expect per cell. This number is used to " +
"pre-allocate an array of Colliders in order to take advantage of the OverlapBoxNonAlloc " +
"Physics API. If the number of colliders found is >= InitialColliderBufferSize the array " +
"will be resized to double its current size. The hard coded absolute size is 500.")]
public int InitialColliderBufferSize = 4;
Collider[] m_ColliderBuffer;
float[] m_ChannelBuffer;
//
// Hidden Parameters
//

/// <summary>
/// Radius of grid, used for normalizing the distance.
/// </summary>
protected float SphereRadius;
protected float InverseSphereRadius;
/// <summary>
/// Total Number of cells (width*height)

NumCells = GridNumSideX * GridNumSideZ;
float sphereRadiusX = (CellScaleX * GridNumSideX) / Mathf.Sqrt(2);
float sphereRadiusZ = (CellScaleZ * GridNumSideZ) / Mathf.Sqrt(2);
SphereRadius = Mathf.Max(sphereRadiusX, sphereRadiusZ);
InverseSphereRadius = 1.0f / Mathf.Max(sphereRadiusX, sphereRadiusZ);
ChannelOffsets = new int[ChannelDepth.Length];
DiffNumSideZX = (GridNumSideZ - GridNumSideX);
OffsetGridNumSide = (GridNumSideZ - 1f) / 2f;

InitDepthType();
InitCellPoints();
InitPerceptionBuffer();
m_ColliderBuffer = new Collider[Math.Min(MaxColliderBufferSize, InitialColliderBufferSize)];
// Default root reference to current game object
if (rootReference == null)
rootReference = gameObject;

m_perceptionTexture2D = new Texture2D(GridNumSideX, GridNumSideZ, TextureFormat.RGB24, false);
}
/// <inheritdoc cref="ISensor.Reset"/>
void ISensor.Reset() { }
public void Reset()
public void ClearPerceptionBuffer()
{
if (m_PerceptionBuffer != null)
{

else
{
m_PerceptionBuffer = new float[NumberOfObservations];
m_ColliderBuffer = new Collider[Math.Min(MaxColliderBufferSize, InitialColliderBufferSize)];
}
if (ShowGizmos)

{
return BuiltInSensorType.GridSensor;
}
/// <summary>
/// GetCompressedObservation - Calls Perceive then puts the data stored on the perception buffer

/// <returns>A float[] containing all of the information collected from the gridsensor</returns>
public float[] Perceive()
{
Reset();
if (m_ColliderBuffer == null)
{
return Array.Empty<float>();
}
ClearPerceptionBuffer();
// TODO: make these part of the class
Collider[] foundColliders = null;
Vector3 cellCenter = Vector3.zero;
var halfCellScale = new Vector3(CellScaleX / 2f, CellScaleY, CellScaleZ / 2f);
Vector3 halfCellScale = new Vector3(CellScaleX / 2f, CellScaleY, CellScaleZ / 2f);
for (int cellIndex = 0; cellIndex < NumCells; cellIndex++)
for (var cellIndex = 0; cellIndex < NumCells; cellIndex++)
int numFound;
Vector3 cellCenter;
cellCenter = transform.TransformPoint(CellPoints[cellIndex]);
foundColliders = Physics.OverlapBox(cellCenter, halfCellScale, transform.rotation, ObserveMask);
Transform transform1;
cellCenter = (transform1 = transform).TransformPoint(CellPoints[cellIndex]);
numFound = BufferResizingOverlapBoxNonAlloc(cellCenter, halfCellScale, transform1.rotation);
foundColliders = Physics.OverlapBox(cellCenter, halfCellScale, Quaternion.identity, ObserveMask);
numFound = BufferResizingOverlapBoxNonAlloc(cellCenter, halfCellScale, Quaternion.identity);
if (foundColliders != null && foundColliders.Length > 0)
if (numFound > 0)
ParseColliders(foundColliders, cellIndex, cellCenter);
ParseColliders(m_ColliderBuffer, numFound, cellIndex, cellCenter);
return m_PerceptionBuffer;
}
return m_PerceptionBuffer;
/// <summary>
/// This method attempts to perform the Physics.OverlapBoxNonAlloc and will double the size of the Collider buffer
/// if the number of Colliders in the buffer after the call is equal to the length of the buffer.
/// </summary>
/// <param name="cellCenter"></param>
/// <param name="halfCellScale"></param>
/// <param name="rotation"></param>
/// <returns></returns>
int BufferResizingOverlapBoxNonAlloc(Vector3 cellCenter, Vector3 halfCellScale, Quaternion rotation)
{
int numFound;
// Since we can only get a fixed number of results, requery
// until we're sure we can hold them all (or until we hit the max size).
while (true)
{
numFound = Physics.OverlapBoxNonAlloc(cellCenter, halfCellScale, m_ColliderBuffer, rotation, ObserveMask);
if (numFound == m_ColliderBuffer.Length && m_ColliderBuffer.Length < MaxColliderBufferSize)
{
m_ColliderBuffer = new Collider[Math.Min(MaxColliderBufferSize, m_ColliderBuffer.Length * 2)];
InitialColliderBufferSize = m_ColliderBuffer.Length;
}
else
{
break;
}
}
return numFound;
}
/// <summary>

/// <param name="numFound">Number of colliders found.</param>
protected virtual void ParseColliders(Collider[] foundColliders, int cellIndex, Vector3 cellCenter)
protected virtual void ParseColliders(Collider[] foundColliders, int numFound, int cellIndex, Vector3 cellCenter)
GameObject currentColliderGo = null;
Profiler.BeginSample("GridSensor.ParseColliders");
Vector3 closestColliderPoint = Vector3.zero;
float distance = float.MaxValue;
float currentDistance = 0f;
var minDistanceSquared = float.MaxValue;
for (int i = 0; i < foundColliders.Length; i++)
for (var i = 0; i < numFound; i++)
currentColliderGo = foundColliders[i].gameObject;
var currentColliderGo = foundColliders[i].gameObject;
if (currentColliderGo == rootReference)
if (ReferenceEquals(currentColliderGo, rootReference))
closestColliderPoint = foundColliders[i].ClosestPointOnBounds(cellCenter);
currentDistance = Vector3.Distance(closestColliderPoint, rootReference.transform.position);
var closestColliderPoint = foundColliders[i].ClosestPointOnBounds(cellCenter);
var currentDistanceSquared = (closestColliderPoint - rootReference.transform.position).sqrMagnitude;
if ((Array.IndexOf(DetectableObjects, currentColliderGo.tag) > -1) && (currentDistance < distance))
var index = -1;
for (var ii = 0; ii < DetectableObjects.Length; ii++)
{
if (currentColliderGo.CompareTag(DetectableObjects[ii]))
{
index = ii;
break;
}
}
if (index > -1 && currentDistanceSquared < minDistanceSquared)
distance = currentDistance;
minDistanceSquared = currentDistanceSquared;
if (closestColliderGo != null)
LoadObjectData(closestColliderGo, cellIndex, distance / SphereRadius);
if (!ReferenceEquals(closestColliderGo, null))
LoadObjectData(closestColliderGo, cellIndex, (float)Math.Sqrt(minDistanceSquared) * InverseSphereRadius);
Profiler.EndSample();
}
/// <summary>

/// </example>
protected virtual float[] GetObjectData(GameObject currentColliderGo, float typeIndex, float normalizedDistance)
{
float[] channelValues = new float[ChannelDepth.Length];
channelValues[0] = typeIndex;
return channelValues;
if (m_ChannelBuffer == null)
{
m_ChannelBuffer = new float[ChannelDepth.Length];
}
Array.Clear(m_ChannelBuffer, 0, m_ChannelBuffer.Length);
m_ChannelBuffer[0] = typeIndex;
return m_ChannelBuffer;
}
/// <summary>

/// </summary>
/// <param name="currentColliderGo">The game object that was found colliding with a certain cell</param>
/// <param name="cellIndex">The index of the current cell</param>
/// <param name="normalized_distance">A float between 0 and 1 describing the ratio of
/// <param name="normalizedDistance">A float between 0 and 1 describing the ratio of
protected virtual void LoadObjectData(GameObject currentColliderGo, int cellIndex, float normalized_distance)
protected virtual void LoadObjectData(GameObject currentColliderGo, int cellIndex, float normalizedDistance)
for (int i = 0; i < DetectableObjects.Length; i++)
Profiler.BeginSample("GridSensor.LoadObjectData");
var channelHotVals = new ArraySegment<float>(m_PerceptionBuffer, cellIndex * ObservationPerCell, ObservationPerCell);
for (var i = 0; i < DetectableObjects.Length; i++)
if (currentColliderGo != null && currentColliderGo.CompareTag(DetectableObjects[i]))
for (var ii = 0; ii < channelHotVals.Count; ii++)
{
m_PerceptionBuffer[channelHotVals.Offset + ii] = 0f;
}
if (!ReferenceEquals(currentColliderGo, null) && currentColliderGo.CompareTag(DetectableObjects[i]))
float[] channelValues = GetObjectData(currentColliderGo, (float)i + 1, normalized_distance);
float[] channelValues = GetObjectData(currentColliderGo, (float)i + 1, normalizedDistance);
if (ShowGizmos)
{
Color debugRayColor = Color.white;

}
CellActivity[cellIndex] = new Color(debugRayColor.r, debugRayColor.g, debugRayColor.b, .5f);
}

/// <remarks>
/// The observations are "channel based" so each grid is WxHxC where C is the number of channels
/// This typically means that each channel value is normalized between 0 and 1
/// If channelDepth is 1, the value is assumed normalized, else the value is normalized by the channelDepth
/// The channels are then stored consecutively in PerceptionBuffer.
/// NOTE: This is the only grid type that uses floating point values
/// For example, if a cell contains the 3rd type of 5 possible on the 2nd team of 3 possible teams:
/// channelValues = {2, 1}
/// ObservationPerCell = channelValues.Length
/// channelValues = {2f/5f, 1f/3f} = {.4, .33..}
/// Array.Copy(channelValues, 0, PerceptionBuffer, cell_id*ObservationPerCell, ObservationPerCell);
/// </remarks>
for (int j = 0; j < channelValues.Length; j++)
channelValues[j] /= ChannelDepth[j];
// The observations are "channel based" so each grid is WxHxC where C is the number of channels
// This typically means that each channel value is normalized between 0 and 1
// If channelDepth is 1, the value is assumed normalized, else the value is normalized by the channelDepth
// The channels are then stored consecutively in PerceptionBuffer.
// NOTE: This is the only grid type that uses floating point values
// For example, if a cell contains the 3rd type of 5 possible on the 2nd team of 3 possible teams:
// channelValues = {2, 1}
// ObservationPerCell = channelValues.Length
// channelValues = {2f/5f, 1f/3f} = {.4, .33..}
// Array.Copy(channelValues, 0, PerceptionBuffer, cell_id*ObservationPerCell, ObservationPerCell);
for (int j = 0; j < channelValues.Length; j++)
{
channelValues[j] /= ChannelDepth[j];
}
Array.Copy(channelValues, 0, m_PerceptionBuffer, cellIndex * ObservationPerCell, ObservationPerCell);
break;
Array.Copy(channelValues, 0, m_PerceptionBuffer, cellIndex * ObservationPerCell, ObservationPerCell);
break;
/// <remarks>
/// The observations are "channel hot" so each grid is WxHxD where D is the sum of all of the channel depths
/// The opposite of the "channel based" case, the channel values are represented as one hot vector per channel and then concatenated together
/// Thus channelDepth is assumed to be greater than 1.
/// For example, if a cell contains the 3rd type of 5 possible on the 2nd team of 3 possible teams,
/// channelValues = {2, 1}
/// channelOffsets = {5, 3}
/// ObservationPerCell = 5 + 3 = 8
/// channelHotVals = {0, 0, 1, 0, 0, 0, 1, 0}
/// Array.Copy(channelHotVals, 0, PerceptionBuffer, cell_id*ObservationPerCell, ObservationPerCell);
/// </remarks>
float[] channelHotVals = new float[ObservationPerCell];
for (int j = 0; j < channelValues.Length; j++)
if (ChannelDepth[j] > 1)
{
channelHotVals[(int)channelValues[j] + ChannelOffsets[j]] = 1f;
}
else
// The observations are "channel hot" so each grid is WxHxD where D is the sum of all of the channel depths
// The opposite of the "channel based" case, the channel values are represented as one hot vector per channel and then concatenated together
// Thus channelDepth is assumed to be greater than 1.
// For example, if a cell contains the 3rd type of 5 possible on the 2nd team of 3 possible teams,
// channelValues = {2, 1}
// channelOffsets = {5, 3}
// ObservationPerCell = 5 + 3 = 8
// channelHotVals = {0, 0, 1, 0, 0, 0, 1, 0}
// Array.Copy(channelHotVals, 0, PerceptionBuffer, cell_id*ObservationPerCell, ObservationPerCell);
for (int j = 0; j < channelValues.Length; j++)
channelHotVals[ChannelOffsets[j]] = channelValues[j];
if (ChannelDepth[j] > 1)
{
m_PerceptionBuffer[channelHotVals.Offset + (int)channelValues[j] + ChannelOffsets[j]] = 1f;
}
else
{
m_PerceptionBuffer[channelHotVals.Offset + ChannelOffsets[j]] = channelValues[j];
}
break;
}
Array.Copy(channelHotVals, 0, m_PerceptionBuffer, cellIndex * ObservationPerCell, ObservationPerCell);
break;
}
Profiler.EndSample();
}
/// <summary>Converts the index of the cell to the 3D point (y is zero)</summary>

CellActivity[toCellID] = CellActivity[fromCellID];
}
/// <summary>Creates a copy of a float array</summary>
/// <returns>float[] of the original data</returns>
/// <param name="array">The array to copy from</parma>
private static float[] CreateCopy(float[] array)
{
float[] b = new float[array.Length];
System.Buffer.BlockCopy(array, 0, b, 0, array.Length * sizeof(float));
return b;
}
/// <summary>Utility method to find the index of a tag</summary>
/// <returns>Index of the tag in DetectableObjects, if it is in there</returns>
/// <param name="tag">The tag to search for</param>
public int IndexOfTag(string tag)
{
return Array.IndexOf(DetectableObjects, tag);
}
void OnDrawGizmos()
{
if (ShowGizmos)

Perceive();
Vector3 scale = new Vector3(CellScaleX, 1, CellScaleZ);
Vector3 offset = new Vector3(0, GizmoYOffset, 0);
Matrix4x4 oldGizmoMatrix = Gizmos.matrix;
Matrix4x4 cubeTransform = Gizmos.matrix;
for (int i = 0; i < NumCells; i++)
var scale = new Vector3(CellScaleX, 1, CellScaleZ);
var offset = new Vector3(0, GizmoYOffset, 0);
var oldGizmoMatrix = Gizmos.matrix;
for (var i = 0; i < NumCells; i++)
Matrix4x4 cubeTransform;
if (RotateToAgent)
{
cubeTransform = Matrix4x4.TRS(CellToPoint(i) + offset, transform.rotation, scale);

}
/// <inheritdoc/>
void ISensor.Update() { }
void ISensor.Update()
{
using (TimerStack.Instance.Scoped("GridSensor.Update"))
{
Perceive();
}
}
/// <summary>Gets the observation shape</summary>
/// <returns>int[] of the observation shape</returns>

{
using (TimerStack.Instance.Scoped("GridSensor.WriteToTensor"))
{
Perceive();
int index = 0;
for (var h = GridNumSideZ - 1; h >= 0; h--) // height
{

4
com.unity.ml-agents.extensions/Runtime/Unity.ML-Agents.Extensions.asmdef


"Unity.Barracuda",
"Unity.ML-Agents",
"Unity.ML-Agents.Extensions.Input"
],
"includePlatforms": [],
"excludePlatforms": []
]
}

2
com.unity.ml-agents.extensions/Tests/Runtime/Input/Unity.ML-Agents.Extensions.Input.Tests.Runtime.asmdef


"versionDefines": [
{
"name": "com.unity.inputsystem",
"expression": "1.1.0-preview",
"expression": "1.1.0",
"define": "MLA_INPUT_TESTS"
}
],

2
com.unity.ml-agents.extensions/package.json


{
"name": "com.unity.ml-agents.extensions",
"displayName": "ML Agents Extensions",
"version": "0.0.1-preview",
"version": "0.1.0-preview",
"unity": "2018.4",
"description": "A source-only package for new features based on ML-Agents",
"dependencies": {

15
com.unity.ml-agents/CHANGELOG.md


## [Unreleased]
### Major Changes
#### com.unity.ml-agents (C#)
- The `BufferSensor` and `BufferSensorComponent` have been added. They allow the Agent to observe a variable number of entities. (#4909)
#### ml-agents / ml-agents-envs / gym-unity (Python)
### Minor Changes

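As a rough illustration of the `BufferSensor` entry above, the sketch below shows one way an Agent could feed a variable number of entities into a `BufferSensorComponent` each step. The member names used here (`ObservableSize`, `MaxNumObservables`, `AppendObservation`) are assumed from this commit's diffs and may differ from the final API.

```csharp
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;

// Sketch only: append one fixed-size observation per tracked entity.
// Assumes the BufferSensorComponent is configured in the Inspector with
// ObservableSize = 3 and MaxNumObservables = 20, and exposes AppendObservation(float[]).
public class VariableEntityAgent : Agent
{
    [SerializeField] BufferSensorComponent m_BufferSensor;
    [SerializeField] Transform[] m_Entities;

    public override void CollectObservations(VectorSensor sensor)
    {
        foreach (var entity in m_Entities)
        {
            // Observe each entity's position relative to the agent.
            var p = transform.InverseTransformPoint(entity.position);
            m_BufferSensor.AppendObservation(new[] { p.x, p.y, p.z });
        }
    }
}
```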
- The `cattrs` version dependency was updated to allow `>=1.1.0` on Python 3.8 or higher. (#4821)
### Bug Fixes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)

- Added a `--torch-device` commandline option to `mlagents-learn`, which sets the default
[`torch.device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device) used for training. (#4888)
- The `--cpu` commandline option had no effect and was removed. Use `--torch-device=cpu` to force CPU training. (#4888)
- The `mlagents_env` API has changed, `BehaviorSpec` now has a `observation_specs` property containing a list of `ObservationSpec`. For more information on `ObservationSpec` see [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Python-API.md#behaviorspec). (#4763, #4825)
- The `mlagents_env` API has changed, `BehaviorSpec` now has a `observation_specs` property containing a list of `ObservationSpec`. For more information on `ObservationSpec` see [here](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Python-API.md#behaviorspec). (#4763, #4825)
### Bug Fixes
#### com.unity.ml-agents (C#)

#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- PyTorch trainers are now the default. See the
[installation docs](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) for
[installation docs](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Installation.md) for
more information on installing PyTorch. For the time being, TensorFlow is still available;
you can use the TensorFlow backend by adding `--tensorflow` to the CLI, or
adding `framework: tensorflow` in the configuration YAML. (#4517)

- The Barracuda dependency was upgraded to 1.1.2 (#4571)
- Utilities were added to `com.unity.ml-agents.extensions` to make it easier to
integrate with match-3 games. See the [readme](https://github.com/Unity-Technologies/ml-agents/blob/master/com.unity.ml-agents.extensions/Documentation~/Match3.md)
integrate with match-3 games. See the [readme](https://github.com/Unity-Technologies/ml-agents/blob/main/com.unity.ml-agents.extensions/Documentation~/Match3.md)
for more details. (#4515)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- The `action_probs` node is no longer listed as an output in TensorFlow models (#4613).

#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added the Random Network Distillation (RND) intrinsic reward signal to the Pytorch
trainers. To use RND, add a `rnd` section to the `reward_signals` section of your
yaml configuration file. [More information here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-Configuration-File.md#rnd-intrinsic-reward) (#4473)
yaml configuration file. [More information here](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Training-Configuration-File.md#rnd-intrinsic-reward) (#4473)
### Minor Changes
#### com.unity.ml-agents (C#)
- Stacking for compressed observations is now supported. An additional setting

### Major Changes
#### ml-agents / ml-agents-envs / gym-unity (Python)
- The Parameter Randomization feature has been refactored to enable sampling of new parameters per episode to improve robustness. The
`resampling-interval` parameter has been removed and the config structure updated. More information [here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-ML-Agents.md). (#4065)
`resampling-interval` parameter has been removed and the config structure updated. More information [here](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Training-ML-Agents.md). (#4065)
[here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-ML-Agents.md).(#4160)
[here](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Training-ML-Agents.md).(#4160)
### Minor Changes
#### com.unity.ml-agents (C#)

4
com.unity.ml-agents/CONTRIBUTING.md


## Communication
First, please read through our
[code of conduct](https://github.com/Unity-Technologies/ml-agents/blob/master/CODE_OF_CONDUCT.md),
[code of conduct](https://github.com/Unity-Technologies/ml-agents/blob/main/CODE_OF_CONDUCT.md),
as we expect all our contributors to follow it.
Second, before starting on a project that you intend to contribute to the

## Git Branches
The master branch corresponds to the most recent version of the project. Note
The main branch corresponds to the most recent version of the project. Note
that this may be newer than the
[latest release](https://github.com/Unity-Technologies/ml-agents/releases/tag/latest_release).

4
com.unity.ml-agents/Documentation~/com.unity.ml-agents.md


[unity ML-Agents Toolkit]: https://github.com/Unity-Technologies/ml-agents
[unity inference engine]: https://docs.unity3d.com/Packages/com.unity.barracuda@latest/index.html
[package manager documentation]: https://docs.unity3d.com/Manual/upm-ui-install.html
[installation instructions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Installation.md
[installation instructions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Installation.md
[ML-Agents GitHub repo]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/com.unity.ml-agents.extensions
[ML-Agents GitHub repo]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/com.unity.ml-agents.extensions

4
com.unity.ml-agents/Runtime/Academy.cs


* API. For more information on each of these entities, in addition to how to
* set-up a learning environment and train the behavior of characters in a
* Unity scene, please browse our documentation pages on GitHub:
* https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/docs/
* https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/docs/
*/
namespace Unity.MLAgents

/// fall back to inference or heuristic decisions. (You can also set agents to always use
/// inference or heuristics.)
/// </remarks>
[HelpURL("https://github.com/Unity-Technologies/ml-agents/tree/release_12_docs/" +
[HelpURL("https://github.com/Unity-Technologies/ml-agents/tree/release_13_docs/" +
"docs/Learning-Environment-Design.md")]
public class Academy : IDisposable
{

1
com.unity.ml-agents/Runtime/Actuators/ActionSpec.cs


/// <param name="numContinuousActions">The number of continuous actions available.</param>
/// <param name="discreteBranchSizes">The array of branch sizes for the discrete actions. Each index
/// contains the number of actions available for that branch.</param>
/// <returns>An ActionSpec initialized with the specified action sizes.</returns>
public ActionSpec(int numContinuousActions = 0, int[] discreteBranchSizes = null)
{
m_NumContinuousActions = numContinuousActions;

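For reference, the constructor documented above can be called directly; a minimal usage sketch follows (the sizes are illustrative, not taken from this commit).

```csharp
using Unity.MLAgents.Actuators;

static class ActionSpecExample
{
    // Illustrative only: 2 continuous actions plus two discrete branches
    // with 3 and 2 available actions respectively.
    public static readonly ActionSpec Spec =
        new ActionSpec(numContinuousActions: 2, discreteBranchSizes: new[] { 3, 2 });
}
```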
2
com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <seealso cref="IActionReceiver.OnActionReceived"/>
void WriteDiscreteActionMask(IDiscreteActionMask actionMask);

2
com.unity.ml-agents/Runtime/Actuators/IDiscreteActionMask.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="branch">The branch for which the actions will be masked.</param>
/// <param name="actionIndices">The indices of the masked actions.</param>

8
com.unity.ml-agents/Runtime/Actuators/VectorActuator.cs


/// <summary>
/// IActuator implementation that forwards calls to an <see cref="IActionReceiver"/> and an <see cref="IHeuristicProvider"/>.
/// </summary>
internal class VectorActuator : IActuator, IHeuristicProvider
internal class VectorActuator : IActuator, IHeuristicProvider, IBuiltInActuator
{
IActionReceiver m_ActionReceiver;
IHeuristicProvider m_HeuristicProvider;

/// <inheritdoc />
public string Name { get; }
/// <inheritdoc />
public virtual BuiltInActuatorType GetBuiltInActuatorType()
{
return BuiltInActuatorType.VectorActuator;
}
}
}

47
com.unity.ml-agents/Runtime/Agent.cs


}
/// <summary>
/// Simple wrapper around VectorActuator that overrides GetBuiltInActuatorType
/// so that it can be distinguished from a standard VectorActuator.
/// </summary>
internal class AgentVectorActuator : VectorActuator
{
public AgentVectorActuator(IActionReceiver actionReceiver,
IHeuristicProvider heuristicProvider,
ActionSpec actionSpec,
string name = "VectorActuator"
) : base(actionReceiver, heuristicProvider, actionSpec, name)
{ }
public override BuiltInActuatorType GetBuiltInActuatorType()
{
return BuiltInActuatorType.AgentVectorActuator;
}
}
/// <summary>
/// An agent is an actor that can observe its environment, decide on the
/// best course of action using those observations, and execute those actions
/// within the environment.

/// [OnDisable()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnDisable.html
/// [OnBeforeSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnBeforeSerialize.html
/// [OnAfterSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnAfterSerialize.html
/// [Agents]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md
/// [Reinforcement Learning in Unity]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design.md
/// [Agents]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md
/// [Reinforcement Learning in Unity]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design.md
/// [Unity ML-Agents Toolkit manual]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Readme.md
/// [Unity ML-Agents Toolkit manual]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Readme.md
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/" +
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/" +
"docs/Learning-Environment-Design-Agents.md")]
[Serializable]
[RequireComponent(typeof(BehaviorParameters))]

/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// </remarks>
/// <param name="reward">The new value of the reward.</param>
public void SetReward(float reward)

/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
///</remarks>
/// <param name="increment">Incremental reward value.</param>
public void AddReward(float increment)
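For illustration, a hedged sketch combining `AddReward` for per-step shaping with `SetReward` at the end of an episode (the tag and reward values are assumptions of the sketch):

```csharp
using Unity.MLAgents;
using UnityEngine;

// Illustrative reward shaping: a small per-step penalty via AddReward and a terminal
// reward via SetReward, which overwrites anything accumulated during the current step.
public class GoalSeekerAgent : Agent
{
    void FixedUpdate()
    {
        AddReward(-0.001f); // encourage reaching the goal quickly
    }

    void OnTriggerEnter(Collider other)
    {
        if (other.CompareTag("goal")) // the tag is an assumption of this sketch
        {
            SetReward(1.0f);
            EndEpisode();
        }
    }
}
```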

/// implementing a simple heuristic function can aid in debugging agent actions and interactions
/// with its environment.
///
/// [Demonstration Recorder]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Demonstration Recorder]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
/// <example>
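A minimal heuristic sketch along these lines, mapping keyboard axes to two continuous actions (the agent and axis mapping are illustrative):

```csharp
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
using UnityEngine;

// Keyboard heuristic for an agent with two continuous actions; handy for debugging
// and for recording demonstrations with the Demonstration Recorder.
public class DrivingAgent : Agent
{
    public override void Heuristic(in ActionBuffers actionsOut)
    {
        var continuousActions = actionsOut.ContinuousActions;
        continuousActions[0] = Input.GetAxis("Horizontal"); // steering
        continuousActions[1] = Input.GetAxis("Vertical");   // throttle
    }
}
```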

// Support legacy OnActionReceived
// TODO don't set this up if the sizes are 0?
var param = m_PolicyFactory.BrainParameters;
m_VectorActuator = new VectorActuator(this, this, param.ActionSpec);
m_VectorActuator = new AgentVectorActuator(this, this, param.ActionSpec);
m_ActuatorManager = new ActuatorManager(attachedActuators.Length + 1);
m_LegacyActionCache = new float[m_VectorActuator.TotalNumberOfActions()];
m_LegacyHeuristicCache = new float[m_VectorActuator.TotalNumberOfActions()];

/// For more information about observations, see [Observations and Sensors].
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// [Observations and Sensors]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#observations-and-sensors
/// [Observations and Sensors]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#observations-and-sensors
/// </remarks>
public virtual void CollectObservations(VectorSensor sensor)
{
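For context, a short sketch of a `CollectObservations` override that adds seven floats (the fields and observation layout are illustrative):

```csharp
using Unity.MLAgents;
using Unity.MLAgents.Sensors;
using UnityEngine;

// Vector observations for a simple target-reaching agent; the fields are placeholders.
public class ReacherAgent : Agent
{
    public Transform target;
    public Rigidbody body;

    public override void CollectObservations(VectorSensor sensor)
    {
        // 3 + 3 + 1 = 7 floats; the Space Size in Behavior Parameters must match.
        sensor.AddObservation(transform.localPosition);
        sensor.AddObservation(target.localPosition);
        sensor.AddObservation(body.velocity.magnitude);
    }
}
```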

///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <seealso cref="IActionReceiver.OnActionReceived"/>
public virtual void WriteDiscreteActionMask(IDiscreteActionMask actionMask)

///
/// For more information about implementing agent actions see [Agents - Actions].
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="actions">
/// Struct containing the buffers of actions to be executed at this step.
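For context, a sketch of an `OnActionReceived` override reading both continuous and discrete buffers (the agent and movement scales are illustrative):

```csharp
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
using UnityEngine;

// Applies one continuous action (forward speed) and one discrete branch (turning).
public class RoverAgent : Agent
{
    public float moveScale = 2f;
    public float turnScale = 90f;

    public override void OnActionReceived(ActionBuffers actions)
    {
        var forward = Mathf.Clamp(actions.ContinuousActions[0], -1f, 1f);
        var turn = actions.DiscreteActions[0]; // 0 = none, 1 = left, 2 = right

        transform.Translate(Vector3.forward * forward * moveScale * Time.fixedDeltaTime);
        if (turn == 1) transform.Rotate(Vector3.up, -turnScale * Time.fixedDeltaTime);
        if (turn == 2) transform.Rotate(Vector3.up, turnScale * Time.fixedDeltaTime);
    }
}
```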

31
com.unity.ml-agents/Runtime/Analytics/Events.cs


public int InferenceDevice;
public List<EventObservationSpec> ObservationSpecs;
public EventActionSpec ActionSpec;
public List<EventActuatorInfo> ActuatorInfos;
public int MemorySize;
public long TotalWeightSizeBytes;
public string ModelHash;

NumContinuousActions = actionSpec.NumContinuousActions,
NumDiscreteActions = actionSpec.NumDiscreteActions,
BranchSizes = branchSizes,
};
}
}
/// <summary>
/// Information about an actuator.
/// </summary>
[Serializable]
internal struct EventActuatorInfo
{
public int BuiltInActuatorType;
public int NumContinuousActions;
public int NumDiscreteActions;
public static EventActuatorInfo FromActuator(IActuator actuator)
{
BuiltInActuatorType builtInActuatorType = Actuators.BuiltInActuatorType.Unknown;
if (actuator is IBuiltInActuator builtInActuator)
{
builtInActuatorType = builtInActuator.GetBuiltInActuatorType();
}
var actionSpec = actuator.ActionSpec;
return new EventActuatorInfo
{
BuiltInActuatorType = (int)builtInActuatorType,
NumContinuousActions = actionSpec.NumContinuousActions,
NumDiscreteActions = actionSpec.NumDiscreteActions
};
}
}

public string BehaviorName;
public List<EventObservationSpec> ObservationSpecs;
public EventActionSpec ActionSpec;
public List<EventActuatorInfo> ActuatorInfos;
/// <summary>
/// This will be the same as TrainingEnvironmentInitializedEvent if available, but

18
com.unity.ml-agents/Runtime/Analytics/InferenceAnalytics.cs


/// <param name="inferenceDevice">Whether inference is being performed on the CPU or GPU</param>
/// <param name="sensors">List of ISensors for the Agent. Used to generate information about the observation space.</param>
/// <param name="actionSpec">ActionSpec for the Agent. Used to generate information about the action space.</param>
/// <param name="actuators">List of IActuators for the Agent. Used to generate information about the action space.</param>
/// <returns></returns>
public static void InferenceModelSet(
NNModel nnModel,

ActionSpec actionSpec
ActionSpec actionSpec,
IList<IActuator> actuators
)
{
// The event shouldn't be able to report if this is disabled but if we know we're not going to report

return;
}
var data = GetEventForModel(nnModel, behaviorName, inferenceDevice, sensors, actionSpec);
var data = GetEventForModel(nnModel, behaviorName, inferenceDevice, sensors, actionSpec, actuators);
//Debug.Log(JsonUtility.ToJson(data, true));
// Debug.Log(JsonUtility.ToJson(data, true));
#if UNITY_EDITOR
if (AnalyticsUtils.s_SendEditorAnalytics)
{

/// <param name="inferenceDevice"></param>
/// <param name="sensors"></param>
/// <param name="actionSpec"></param>
/// <param name="actuators"></param>
/// <returns></returns>
internal static InferenceEvent GetEventForModel(
NNModel nnModel,

ActionSpec actionSpec
ActionSpec actionSpec,
IList<IActuator> actuators
)
{
var barracudaModel = ModelLoader.Load(nnModel);

foreach (var sensor in sensors)
{
inferenceEvent.ObservationSpecs.Add(EventObservationSpec.FromSensor(sensor));
}
inferenceEvent.ActuatorInfos = new List<EventActuatorInfo>(actuators.Count);
foreach (var actuator in actuators)
{
inferenceEvent.ActuatorInfos.Add(EventActuatorInfo.FromActuator(actuator));
}
inferenceEvent.TotalWeightSizeBytes = GetModelWeightSize(barracudaModel);

17
com.unity.ml-agents/Runtime/Analytics/TrainingAnalytics.cs


public static void RemotePolicyInitialized(
string fullyQualifiedBehaviorName,
IList<ISensor> sensors,
ActionSpec actionSpec
ActionSpec actionSpec,
IList<IActuator> actuators
)
{
if (!IsAnalyticsEnabled())

return;
}
var data = GetEventForRemotePolicy(behaviorName, sensors, actionSpec);
var data = GetEventForRemotePolicy(behaviorName, sensors, actionSpec, actuators);
// Note - to debug, use JsonUtility.ToJson on the event.
// Debug.Log(
// $"Would send event {k_RemotePolicyInitializedEventName} with body {JsonUtility.ToJson(data, true)}"

#endif
}
static RemotePolicyInitializedEvent GetEventForRemotePolicy(
internal static RemotePolicyInitializedEvent GetEventForRemotePolicy(
ActionSpec actionSpec)
ActionSpec actionSpec,
IList<IActuator> actuators
)
{
var remotePolicyEvent = new RemotePolicyInitializedEvent();

foreach (var sensor in sensors)
{
remotePolicyEvent.ObservationSpecs.Add(EventObservationSpec.FromSensor(sensor));
}
remotePolicyEvent.ActuatorInfos = new List<EventActuatorInfo>(actuators.Count);
foreach (var actuator in actuators)
{
remotePolicyEvent.ActuatorInfos.Add(EventActuatorInfo.FromActuator(actuator));
}
remotePolicyEvent.MLAgentsEnvsVersion = s_TrainerPackageVersion;

9
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


#if UNITY_EDITOR || UNITY_STANDALONE_WIN || UNITY_STANDALONE_OSX || UNITY_STANDALONE_LINUX
#define MLA_SUPPORTED_TRAINING_PLATFORM
#endif
# if MLA_SUPPORTED_TRAINING_PLATFORM
using Grpc.Core;
#if UNITY_EDITOR
using UnityEditor;

/// <param name="initParametersOut">The External Initialization Parameters received.</param>
public bool Initialize(CommunicatorInitParameters initParameters, out UnityRLInitParameters initParametersOut)
{
#if MLA_SUPPORTED_TRAINING_PLATFORM
var academyParameters = new UnityRLInitializationOutputProto
{
Name = initParameters.name,

UpdateEnvironmentWithInput(input.RlInput);
initParametersOut = initializationInput.RlInitializationInput.ToUnityRLInitParameters();
return true;
#else
initParametersOut = new UnityRLInitParameters();
return false;
#endif
}
/// <summary>

3
com.unity.ml-agents/Runtime/Constants.cs


internal enum MenuGroup
{
Default = 0,
Sensors = 50
Sensors = 50,
Actuators = 100
}
}

2
com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs


/// See [Imitation Learning - Recording Demonstrations] for more information.
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// [Imitation Learning - Recording Demonstrations]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs//Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Imitation Learning - Recording Demonstrations]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs//Learning-Environment-Design-Agents.md#recording-demonstrations
/// </remarks>
[RequireComponent(typeof(Agent))]
[AddComponentMenu("ML Agents/Demonstration Recorder", (int)MenuGroup.Default)]
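For illustration, a hedged sketch that configures the recorder from code instead of the Inspector, assuming the component's public fields (`Record`, `DemonstrationName`, `DemonstrationDirectory`, `NumStepsToRecord`) in this package version:

```csharp
using Unity.MLAgents.Demonstrations;
using UnityEngine;

// Configures a DemonstrationRecorder at runtime. Assumes this GameObject already
// carries an Agent (the recorder requires one); the values are illustrative.
public class DemoRecordingSetup : MonoBehaviour
{
    void Awake()
    {
        var recorder = gameObject.AddComponent<DemonstrationRecorder>();
        recorder.DemonstrationName = "ExpertPlay";        // base name of the .demo file
        recorder.DemonstrationDirectory = "Assets/Demos"; // where the .demo file is written
        recorder.NumStepsToRecord = 10000;                // stop automatically after this many steps
        recorder.Record = true;
    }
}
```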

2
com.unity.ml-agents/Runtime/DiscreteActionMasker.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="branch">The branch for which the actions will be masked.</param>
/// <param name="actionIndices">The indices of the masked actions.</param>

8
com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs


if (sensor.GetObservationShape().Length == 3)
{
if (!tensorsNames.Contains(
TensorNames.VisualObservationPlaceholderPrefix + visObsIndex))
TensorNames.GetVisualObservationName(visObsIndex)))
{
failedModelChecks.Add(
"The model does not contain a Visual Observation Placeholder Input " +

if (sensor.GetObservationShape().Length == 2)
{
if (!tensorsNames.Contains(
TensorNames.ObservationPlaceholderPrefix + sensorIndex))
TensorNames.GetObservationName(sensorIndex)))
{
failedModelChecks.Add(
"The model does not contain an Observation Placeholder Input " +

if (sens.GetObservationShape().Length == 3)
{
tensorTester[TensorNames.VisualObservationPlaceholderPrefix + visObsIndex] =
tensorTester[TensorNames.GetVisualObservationName(visObsIndex)] =
tensorTester[TensorNames.ObservationPlaceholderPrefix + sensorIndex] =
tensorTester[TensorNames.GetObservationName(sensorIndex)] =
(bp, tensor, scs, i) => CheckRankTwoObsShape(tensor, sens);
}
}

4
com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs


// If the tensor is of rank 2, we use the index of the sensor
// to create the name
obsGen = new ObservationGenerator(allocator);
obsGenName = TensorNames.ObservationPlaceholderPrefix + sensorIndex;
obsGenName = TensorNames.GetObservationName(sensorIndex);
obsGenName = TensorNames.VisualObservationPlaceholderPrefix + visIndex;
obsGenName = TensorNames.GetVisualObservationName(visIndex);
visIndex++;
break;
default:

20
com.unity.ml-agents/Runtime/Inference/TensorNames.cs


public const string SequenceLengthPlaceholder = "sequence_length";
public const string VectorObservationPlaceholder = "vector_observation";
public const string RecurrentInPlaceholder = "recurrent_in";
public const string recurrentInPlaceholderH = "recurrent_in_h";
public const string recurrentInPlaceholderC = "recurrent_in_c";
public const string VisualObservationPlaceholderPrefix = "visual_observation_";
public const string ObservationPlaceholderPrefix = "obs_";
public const string PreviousActionPlaceholder = "prev_action";

public const string ValueEstimateOutput = "value_estimate";
public const string RecurrentOutput = "recurrent_out";
public const string recurrentOutputH = "recurrent_out_h";
public const string recurrentOutputC = "recurrent_out_c";
public const string MemorySize = "memory_size";
public const string VersionNumber = "version_number";
public const string ContinuousActionOutputShape = "continuous_action_output_shape";

public const string IsContinuousControlDeprecated = "is_continuous_control";
public const string ActionOutputDeprecated = "action";
public const string ActionOutputShapeDeprecated = "action_output_shape";
/// <summary>
/// Returns the name of the visual observation with a given index
/// </summary>
public static string GetVisualObservationName(int index)
{
return VisualObservationPlaceholderPrefix + index;
}
/// <summary>
/// Returns the name of the observation with a given index
/// </summary>
public static string GetObservationName(int index)
{
return ObservationPlaceholderPrefix + index;
}
}
}

10
com.unity.ml-agents/Runtime/Policies/BarracudaPolicy.cs


private string m_BehaviorName;
/// <summary>
/// List of actuators, only used for analytics
/// </summary>
private IList<IActuator> m_Actuators;
/// <summary>
/// Whether or not we've tried to send analytics for this model. We only ever try to send once per policy,
/// and do additional deduplication in the analytics code.
/// </summary>

public BarracudaPolicy(
ActionSpec actionSpec,
IList<IActuator> actuators,
NNModel model,
InferenceDevice inferenceDevice,
string behaviorName

m_ModelRunner = modelRunner;
m_BehaviorName = behaviorName;
m_ActionSpec = actionSpec;
m_Actuators = actuators;
}
/// <inheritdoc />

m_BehaviorName,
m_ModelRunner.InferenceDevice,
sensors,
m_ActionSpec
m_ActionSpec,
m_Actuators
);
}
m_AgentId = info.episodeId;

6
com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs


"Either assign a model, or change to a different Behavior Type."
);
}
return new BarracudaPolicy(actionSpec, m_Model, m_InferenceDevice, m_BehaviorName);
return new BarracudaPolicy(actionSpec, actuatorManager, m_Model, m_InferenceDevice, m_BehaviorName);
return new RemotePolicy(actionSpec, FullyQualifiedBehaviorName);
return new RemotePolicy(actionSpec, actuatorManager, FullyQualifiedBehaviorName);
return new BarracudaPolicy(actionSpec, m_Model, m_InferenceDevice, m_BehaviorName);
return new BarracudaPolicy(actionSpec, actuatorManager, m_Model, m_InferenceDevice, m_BehaviorName);
}
else
{

10
com.unity.ml-agents/Runtime/Policies/RemotePolicy.cs


internal ICommunicator m_Communicator;
/// <summary>
/// List of actuators, only used for analytics
/// </summary>
private IList<IActuator> m_Actuators;
IList<IActuator> actuators,
string fullyQualifiedBehaviorName)
{
m_FullyQualifiedBehaviorName = fullyQualifiedBehaviorName;

m_Actuators = actuators;
}
/// <inheritdoc />

TrainingAnalytics.RemotePolicyInitialized(
m_FullyQualifiedBehaviorName,
sensors,
m_ActionSpec
m_ActionSpec,
m_Actuators
);
}
m_AgentId = info.episodeId;

8
com.unity.ml-agents/Runtime/Sensors/BufferSensor.cs


/// </summary>
public class BufferSensor : ISensor, IDimensionPropertiesSensor, IBuiltInSensor
{
private string m_Name;
private int m_MaxNumObs;
private int m_ObsSize;
float[] m_ObservationBuffer;

DimensionProperty.None
};
public BufferSensor(int maxNumberObs, int obsSize)
public BufferSensor(int maxNumberObs, int obsSize, string name)
m_Name = name;
m_MaxNumObs = maxNumberObs;
m_ObsSize = obsSize;
m_ObservationBuffer = new float[m_ObsSize * m_MaxNumObs];

Array.Clear(m_ObservationBuffer, 0, m_ObservationBuffer.Length);
}
/// <inheritdoc/>
/// <inheritdoc/>
return "BufferSensor";
return m_Name;
}
/// <inheritdoc/>

31
com.unity.ml-agents/Runtime/Sensors/BufferSensorComponent.cs


[AddComponentMenu("ML Agents/Buffer Sensor", (int)MenuGroup.Sensors)]
public class BufferSensorComponent : SensorComponent
{
/// <summary>
/// Name of the generated <see cref="bufferSensor"/> object.
/// Note that changing this at runtime does not affect how the Agent sorts the sensors.
/// </summary>
public string SensorName
{
get { return m_SensorName; }
set { m_SensorName = value; }
}
[HideInInspector, SerializeField]
private string m_SensorName = "BufferSensor";
public int ObservableSize;
public int ObservableSize
{
get { return m_ObservableSize; }
set { m_ObservableSize = value; }
}
[HideInInspector, SerializeField]
private int m_ObservableSize;
public int MaxNumObservables;
public int MaxNumObservables
{
get { return m_MaxNumObservables; }
set { m_MaxNumObservables = value; }
}
[HideInInspector, SerializeField]
private int m_MaxNumObservables;
private BufferSensor m_Sensor;

m_Sensor = new BufferSensor(MaxNumObservables, ObservableSize);
m_Sensor = new BufferSensor(MaxNumObservables, ObservableSize, m_SensorName);
return m_Sensor;
}
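For context, a hedged usage sketch of the renamed sensor component, assuming `AppendObservation` forwards entries to the underlying `BufferSensor` (names and sizes are illustrative):

```csharp
using Unity.MLAgents.Sensors;
using UnityEngine;

// Configures the buffer sensor before the Agent initializes its sensors and appends
// one variable-length entry per nearby item each step.
public class NearbyItemsObserver : MonoBehaviour
{
    BufferSensorComponent m_Buffer;

    void Awake()
    {
        m_Buffer = GetComponent<BufferSensorComponent>();
        m_Buffer.SensorName = "NearbyItems";
        m_Buffer.ObservableSize = 4;      // x, z, distance, type per entry
        m_Buffer.MaxNumObservables = 20;  // entries beyond this are dropped
    }

    public void ObserveItem(Vector3 localPos, float distance, float type)
    {
        m_Buffer.AppendObservation(new[] { localPos.x, localPos.z, distance, type });
    }
}
```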

38
com.unity.ml-agents/Runtime/Sensors/IBuiltInSensor.cs


/// </summary>
public enum BuiltInSensorType
{
/// <summary>
/// Default Sensor type if it cannot be determined.
/// </summary>
/// <summary>
/// The Vector sensor used by the agent.
/// </summary>
// Note that StackingSensor actually returns the wrapped sensor's type
/// <summary>
/// The Stacking Sensor type. NOTE: StackingSensor actually returns the wrapped sensor's type.
/// </summary>
/// <summary>
/// The RayPerception Sensor types, both 3D and 2D.
/// </summary>
/// <summary>
/// The observable attribute sensor type.
/// </summary>
/// <summary>
/// Sensors that use the Camera for observations.
/// </summary>
/// <summary>
/// Sensors that use RenderTextures for observations.
/// </summary>
/// <summary>
/// Sensors that use buffers or tensors for observations.
/// </summary>
/// <summary>
/// The sensors that observe properties of rigid bodies.
/// </summary>
/// <summary>
/// The sensors that observe Match 3 boards.
/// </summary>
/// <summary>
/// Sensors that break down the world into a grid of colliders to observe an area at a pre-defined granularity.
/// </summary>
GridSensor = 10
}

/// </summary>
public interface IBuiltInSensor
internal interface IBuiltInSensor
{
/// <summary>
/// Return the corresponding BuiltInSensorType for the sensor.

}
}

6
com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs


}
}
/// <summary>
/// Write the list of floats.
/// </summary>
/// <param name="data">The actual list of floats to write.</param>
/// <param name="writeOffset">Optional write offset to start writing from.</param>
public void AddList(IList<float> data, int writeOffset = 0)
{
if (m_Data != null)

var val = data[index];
m_Data[index + m_Offset + writeOffset] = val;
}
}
else
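For illustration, a minimal custom `ISensor` whose `Write` relies on `AddList` (the sensor name, shape and values are placeholders; the interface members follow this package version):

```csharp
using System.Collections.Generic;
using Unity.MLAgents.Sensors;

// Minimal custom sensor that writes a fixed-size list of floats with AddList.
public class HealthSensor : ISensor
{
    readonly List<float> m_Values = new List<float> { 1f, 0.5f, 0.25f };

    public int[] GetObservationShape() => new[] { 3 };

    public int Write(ObservationWriter writer)
    {
        writer.AddList(m_Values); // copies the whole list starting at write offset 0
        return m_Values.Count;    // number of floats written
    }

    public byte[] GetCompressedObservation() => null;
    public void Update() { }
    public void Reset() { }
    public SensorCompressionType GetCompressionType() => SensorCompressionType.None;
    public string GetName() => "HealthSensor";
}
```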

5
com.unity.ml-agents/Runtime/SideChannels/TrainingAnalyticsSideChannel.cs


namespace Unity.MLAgents.SideChannels
{
public class TrainingAnalyticsSideChannel : SideChannel
/// <summary>
/// Side Channel implementation for recording which training features are being used.
/// </summary>
internal class TrainingAnalyticsSideChannel : SideChannel
{
const string k_TrainingAnalyticsConfigId = "b664a4a9-d86f-5a5f-95cb-e8353a7e8356";

18
com.unity.ml-agents/Tests/Editor/Analytics/InferenceAnalyticsTests.cs


using System;
using System.Collections.Generic;
using NUnit.Framework;
using Unity.MLAgents.Sensors;

{
var sensors = new List<ISensor> { sensor_21_20_3.Sensor, sensor_20_22_3.Sensor };
var behaviorName = "continuousModel";
var actionSpec = GetContinuous2vis8vec2actionActionSpec();
var vectorActuator = new VectorActuator(null, actionSpec, "test'");
var actuators = new IActuator[] { vectorActuator };
InferenceDevice.CPU, sensors, GetContinuous2vis8vec2actionActionSpec()
InferenceDevice.CPU, sensors, actionSpec,
actuators
);
// The behavior name should be hashed, not pass-through.

Assert.AreEqual((int)DimensionProperty.None, continuousEvent.ObservationSpecs[0].DimensionInfos[2].Flags);
Assert.AreEqual("None", continuousEvent.ObservationSpecs[0].CompressionType);
Assert.AreEqual(Test3DSensor.k_BuiltInSensorType, continuousEvent.ObservationSpecs[0].BuiltInSensorType);
Assert.AreEqual((int)BuiltInActuatorType.VectorActuator, continuousEvent.ActuatorInfos[0].BuiltInActuatorType);
Assert.AreNotEqual(null, continuousEvent.ModelHash);
// Make sure nested fields get serialized

Assert.IsTrue(jsonString.Contains("NumDiscreteActions"));
Assert.IsTrue(jsonString.Contains("SensorName"));
Assert.IsTrue(jsonString.Contains("Flags"));
Assert.IsTrue(jsonString.Contains("ActuatorInfos"));
}
[Test]

using (new AnalyticsUtils.DisableAnalyticsSending())
{
var sensors = new List<ISensor> { sensor_21_20_3.Sensor, sensor_20_22_3.Sensor };
var policy = new BarracudaPolicy(GetContinuous2vis8vec2actionActionSpec(), continuousONNXModel, InferenceDevice.CPU, "testBehavior");
var policy = new BarracudaPolicy(
GetContinuous2vis8vec2actionActionSpec(),
Array.Empty<IActuator>(),
continuousONNXModel,
InferenceDevice.CPU,
"testBehavior"
);
policy.RequestDecision(new AgentInfo(), sensors);
}
Academy.Instance.Dispose();

38
com.unity.ml-agents/Tests/Editor/Analytics/TrainingAnalyticsTest.cs


using System;
using UnityEngine;
using Unity.Barracuda;
using UnityEditor;
namespace Unity.MLAgents.Tests.Analytics
{

}
[Test]
public void TestRemotePolicyEvent()
{
var behaviorName = "testBehavior";
var sensor1 = new Test3DSensor("SensorA", 21, 20, 3);
var sensor2 = new Test3DSensor("SensorB", 20, 22, 3);
var sensors = new List<ISensor> { sensor1, sensor2 };
var actionSpec = ActionSpec.MakeContinuous(2);
var vectorActuator = new VectorActuator(null, actionSpec, "test'");
var actuators = new IActuator[] { vectorActuator };
var remotePolicyEvent = TrainingAnalytics.GetEventForRemotePolicy(behaviorName, sensors, actionSpec, actuators);
// The behavior name should be hashed, not pass-through.
Assert.AreNotEqual(behaviorName, remotePolicyEvent.BehaviorName);
Assert.AreEqual(2, remotePolicyEvent.ObservationSpecs.Count);
Assert.AreEqual(3, remotePolicyEvent.ObservationSpecs[0].DimensionInfos.Length);
Assert.AreEqual(20, remotePolicyEvent.ObservationSpecs[0].DimensionInfos[0].Size);
Assert.AreEqual("None", remotePolicyEvent.ObservationSpecs[0].CompressionType);
Assert.AreEqual(Test3DSensor.k_BuiltInSensorType, remotePolicyEvent.ObservationSpecs[0].BuiltInSensorType);
Assert.AreEqual(2, remotePolicyEvent.ActionSpec.NumContinuousActions);
Assert.AreEqual(0, remotePolicyEvent.ActionSpec.NumDiscreteActions);
Assert.AreEqual(2, remotePolicyEvent.ActuatorInfos[0].NumContinuousActions);
Assert.AreEqual(0, remotePolicyEvent.ActuatorInfos[0].NumDiscreteActions);
}
[Test]
public void TestRemotePolicy()
{
if (Academy.IsInitialized)

using (new AnalyticsUtils.DisableAnalyticsSending())
{
var actionSpec = ActionSpec.MakeContinuous(3);
var policy = new RemotePolicy(actionSpec, "TestBehavior?team=42");
var policy = new RemotePolicy(actionSpec, Array.Empty<IActuator>(), "TestBehavior?team=42");
policy.RequestDecision(new AgentInfo(), new List<ISensor>());
}

5
com.unity.ml-agents/Tests/Editor/Sensor/BufferSensorTest.cs


public void TestBufferSensor()
{
var bufferSensor = new BufferSensor(20, 4);
var bufferSensor = new BufferSensor(20, 4, "testName");
var shape = bufferSensor.GetObservationShape();
var dimProp = bufferSensor.GetDimensionProperties();
Assert.AreEqual(shape[0], 20);

var bufferComponent = agentGameObj.AddComponent<BufferSensorComponent>();
bufferComponent.MaxNumObservables = 20;
bufferComponent.ObservableSize = 4;
bufferComponent.SensorName = "TestName";
var sensor = bufferComponent.CreateSensor();
var shape = bufferComponent.GetObservationShape();

Assert.AreEqual(shape, obs.Shape);
Assert.AreEqual(obs.DimensionProperties.Count, 2);
Assert.AreEqual(sensor.GetName(), "TestName");
for (int i = 0; i < 8; i++)
{

86
config/ppo/Match3.yaml


default_settings:
trainer_type: ppo
hyperparameters:
batch_size: 16
buffer_size: 120
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: true
hidden_units: 256
num_layers: 4
vis_encode_type: match3
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 10000
threaded: true
Match3VectorObs:
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: match3
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 10000
threaded: true
Match3VisualObs:
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: match3
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 10000
threaded: true
batch_size: 64
buffer_size: 128
batch_size: 16
buffer_size: 120
network_settings:
hidden_units: 4
num_layers: 1

Match3GreedyHeuristic:
Match3SmartHeuristic:
batch_size: 64
buffer_size: 128
batch_size: 16
buffer_size: 120
network_settings:
hidden_units: 4
num_layers: 1

1
config/ppo/PyramidsRND.yaml


strength: 0.01
network_settings:
hidden_units: 64
num_layers: 3
learning_rate: 0.0001
keep_checkpoints: 5
max_steps: 3000000

6
docs/Installation-Anaconda-Windows.md


the ml-agents Conda environment by typing `activate ml-agents`)_:
```sh
git clone --branch release_12 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_13 https://github.com/Unity-Technologies/ml-agents.git
The `--branch release_12` option will switch to the tag of the latest stable
release. Omitting that will get the `master` branch which is potentially
The `--branch release_13` option will switch to the tag of the latest stable
release. Omitting that will get the `main` branch which is potentially
unstable.
If you don't want to use Git, you can find download links on the

8
docs/Installation.md


of our tutorials / guides assume you have access to our example environments).
```sh
git clone --branch release_12 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_13 https://github.com/Unity-Technologies/ml-agents.git
The `--branch release_12` option will switch to the tag of the latest stable
release. Omitting that will get the `master` branch which is potentially
The `--branch release_13` option will switch to the tag of the latest stable
release. Omitting that will get the `main` branch which is potentially
unstable.
#### Advanced: Local Installation for Development

back, make sure to clone the `master` branch (by omitting `--branch release_12`
back, make sure to clone the `main` branch (by omitting `--branch release_13`
from the command above). See our
[Contributions Guidelines](../com.unity.ml-agents/CONTRIBUTING.md) for more
information on contributing to the ML-Agents Toolkit.

4
docs/Learning-Environment-Examples.md


- Observations and actions are defined with a sensor and actuator respectively.
- Float Properties: None
- Benchmark Mean Reward:
- 37.2 for visual observations
- 37.6 for vector observations
- 39.5 for visual observations
- 38.5 for vector observations
- 34.2 for simple heuristic (pick a random valid move)
- 37.0 for greedy heuristic (pick the highest-scoring valid move)

25
docs/ML-Agents-Overview.md


- [Model Types](#model-types)
- [Learning from Vector Observations](#learning-from-vector-observations)
- [Learning from Cameras using Convolutional Neural Networks](#learning-from-cameras-using-convolutional-neural-networks)
- [Learning from Variable Length Observations using Attention](#learning-from-variable-length-observations-using-attention)
- [Memory-enhanced Agents using Recurrent Neural Networks](#memory-enhanced-agents-using-recurrent-neural-networks)
- [Additional Features](#additional-features)
- [Summary and Next Steps](#summary-and-next-steps)

Regardless of the training method deployed, there are a few model types that
users can train using the ML-Agents Toolkit. This is due to the flexibility in
defining agent observations, which can include vector, ray cast and visual
defining agent observations, which include vector, ray cast and visual
observations. You can learn more about how to instrument an agent's observation
in the [Designing Agents](Learning-Environment-Design-Agents.md) guide.

The choice of the architecture depends on the visual complexity of the scene and
the available computational resources.
### Learning from Variable Length Observations using Attention
Using the ML-Agents Toolkit, it is possible to have agents learn from a
varying number of inputs. To do so, each agent can keep a buffer of vector
observations. At each step, the agent goes through all of the elements in the
buffer and extracts information from them, and the contents of the buffer can
change at every step.
This is useful in scenarios in which the agent must keep track of a varying
number of elements throughout the episode, for example a game in which the
agent must learn to avoid projectiles whose number changes over time.
![Variable Length Observations Illustrated](images/variable-length-observation-illustrated.png)
You can learn more about variable length observations
[here](Learning-Environment-Design-Agents.md#variable-length-observations).
When variable length observations are utilized, the ML-Agents Toolkit
leverages attention networks to learn from a varying number of entities.
Agents using attention will ignore entities that are deemed not relevant
and pay special attention to entities relevant to the current situation
based on context.
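A hedged sketch of the projectile example, feeding one entry per projectile into a `BufferSensorComponent` each step (the four-float entry layout and helper names are assumptions):

```csharp
using Unity.MLAgents.Sensors;
using UnityEngine;

// Each step, append one entry per projectile currently tracked.
public class ProjectileObserver : MonoBehaviour
{
    public BufferSensorComponent bufferSensor; // its Observable Size must be 4 here

    public void ObserveProjectiles(Rigidbody[] projectiles)
    {
        foreach (var projectile in projectiles)
        {
            var relative = projectile.position - transform.position;
            bufferSensor.AppendObservation(new[]
            {
                relative.x, relative.z,
                projectile.velocity.x, projectile.velocity.z
            });
        }
    }
}
```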
### Memory-enhanced Agents using Recurrent Neural Networks

2
docs/Migrating.md


- The Parameter Randomization feature has been merged with the Curriculum feature. It is now possible to specify a sampler
in the lesson of a Curriculum. Curriculum has been refactored and is now specified at the level of the parameter, not the
behavior. More information
[here](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-ML-Agents.md).(#4160)
[here](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Training-ML-Agents.md).(#4160)
### Steps to Migrate
- The configuration format for curriculum and parameter randomization has changed. To upgrade your configuration files,

2
docs/Training-on-Amazon-Web-Service.md


2. Clone the ML-Agents repo and install the required Python packages
```sh
git clone --branch release_12 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_13 https://github.com/Unity-Technologies/ml-agents.git
cd ml-agents/ml-agents/
pip3 install -e .
```

4
docs/Unity-Inference-Engine.md


loading expects certain conventions for constants and tensor names. While it is
possible to construct a model that follows these conventions, we don't provide
any additional help for this. More details can be found in
[TensorNames.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/com.unity.ml-agents/Runtime/Inference/TensorNames.cs)
[TensorNames.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/com.unity.ml-agents/Runtime/Inference/TensorNames.cs)
[BarracudaModelParamLoader.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_12_docs/com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs).
[BarracudaModelParamLoader.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_13_docs/com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs).
If you wish to run inference on an externally trained model, you should use
Barracuda directly, instead of trying to run it through ML-Agents.
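For context, a hedged sketch of driving Barracuda directly for an externally trained model (the input size and tensor layout are assumptions about the model):

```csharp
using Unity.Barracuda;
using UnityEngine;

// Runs an externally trained model with Barracuda, bypassing ML-Agents.
public class ExternalModelRunner : MonoBehaviour
{
    public NNModel modelAsset; // the imported .onnx asset, assigned in the Inspector
    IWorker m_Worker;

    void Start()
    {
        var model = ModelLoader.Load(modelAsset);
        m_Worker = WorkerFactory.CreateWorker(WorkerFactory.Type.Auto, model);
    }

    public float[] Run(float[] inputValues)
    {
        using (var input = new Tensor(1, inputValues.Length, inputValues))
        {
            m_Worker.Execute(input);
            // PeekOutput returns a tensor owned by the worker; don't dispose it here.
            return m_Worker.PeekOutput().ToReadOnlyArray();
        }
    }

    void OnDestroy()
    {
        m_Worker?.Dispose();
    }
}
```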

2
docs/localized/KR/docs/Installation-Anaconda-Windows.md


git clone https://github.com/Unity-Technologies/ml-agents.git
```
If you don't want to use Git, you can download all the files directly from this [link](https://github.com/Unity-Technologies/ml-agents/archive/master.zip).
If you don't want to use Git, you can download all the files directly from this [link](https://github.com/Unity-Technologies/ml-agents/archive/main.zip).
The `UnitySDK` subdirectory contains the Unity Assets to add to your projects. It also contains many [example environments](Learning-Environment-Examples.md) to help you get started.

6
docs/localized/RU/README.md


<img src="https://github.com/Unity-Technologies/ml-agents/blob/master/docs/images/image-banner.png" align="middle" width="3000"/>
<img src="https://github.com/Unity-Technologies/ml-agents/blob/main/docs/images/image-banner.png" align="middle" width="3000"/>
# Unity ML-Agents Toolkit Version Release 7

See [here](https://github.com/Unity-Technologies/ml-agents/tree/release_7_docs/docs/Readme.md)
to get started with the most recent version of ML-Agents.**
The table below lists all of our releases, including the master branch, which we actively work on
The table below lists all of our releases, including the main branch, which we actively work on
and which may be unstable. Some helpful information:
[Versioning](docs/Versioning.md) - a description of how we manage GitHub.

| **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
|:-------:|:------:|:-------------:|:-------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) |
| **main (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/main) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/main/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/main.zip) |
| **Release 7** | **September 16, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/release_7)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/release_7_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/release_7.zip)** |
| **Release 6** | August 12, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_6) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_6_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_6.zip) |
| **Release 5** | July 31, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_5) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_5_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_5.zip) |

30
docs/localized/RU/docs/Начало работы.md


# Getting Started
In this article we will walk step by step through one of [our examples](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md),
In this article we will walk step by step through one of [our examples](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md),
[Background: Unity](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Background-Unity.md).
[Background: Unity](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Background-Unity.md).
[Background: Machine Learning](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Background-Machine-Learning.md).
[Background: Machine Learning](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Background-Machine-Learning.md).
![Heads with balls](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/images/balance.png)
![Heads with balls](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/images/balance.png)
Our scene will be **3D Balance Ball**. Its agents are blue platform cubes, each with
a ball on its head. They are all copies of one another. Each cube agent tries

## Installation
If you have not installed ML-Agents yet, follow this
installation guide(https://github.com/Unity-Technologies/ml-agents/tree/master/docs/localized/RU/docs/Установка.md).
installation guide(https://github.com/Unity-Technologies/ml-agents/tree/main/docs/localized/RU/docs/Установка.md).
1. Next, open the Unity Project that contains the examples:
1. Launch Unity Hub

with the _environment_, collecting data about it. In Unity, an environment is a scene that contains one
or more Agent objects and, of course, the other objects an agent interacts with.
![Unity-Editor](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/master/docs/images/mlagents-3DBallHierarchy.png)
![Unity-Editor](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/main/docs/images/mlagents-3DBallHierarchy.png)
Note: in Unity, every object in a scene is an instance of the GameObject class. A GameObject is
essentially a container for everything about the object: its physics, graphics, behaviors and so on,

## Running a pre-trained model
Our examples include pre-trained models (files with the `.nn` extension)
and use the [Unity Inference Engine](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Unity-Inference-Engine.md),
and use the [Unity Inference Engine](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Unity-Inference-Engine.md),
to run them inside Unity. In this section we will use one
of these models for 3D Ball.

So if you ever need to change something on every platform,
you can simply edit the prefab itself instead.
![Platform Prefab](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/master/docs/images/platform_prefab.png)
![Platform Prefab](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/main/docs/images/platform_prefab.png)
![3dball learning brain](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/master/docs/images/3dball_learning_brain.png)
![3dball learning brain](https://raw.githubusercontent.com/Unity-Technologies/ml-agents/main/docs/images/3dball_learning_brain.png)
1. Now every `Agent` on every `3DBall` platform in the **Hierarchy** window
should have the **3DBall** model set in its `Behavior Parameters`.

training phase to another. This is a sign that training is going well.
**Note:** You can train the agent using standalone executables instead of the Editor.
See the [Using an Executable](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Executable.md) guide.
See the [Using an Executable](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Executable.md) guide.
### Monitoring the training process

(environment/cumulative reward per episode), which should increase during training,
approaching 100, the maximum value the agent can reach.
![Example TensorBoard Run](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/images/mlagents-TensorBoard.png)
![Example TensorBoard Run](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/images/mlagents-TensorBoard.png)
## Embedding the model in the Unity environment

### Next steps
- For more information about the ML-Agents Toolkit,
see the [ML-Agents Toolkit Overview](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/ML-Agents-Overview.md).
- [Creating your own scenes](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Create-New.md)
see the [ML-Agents Toolkit Overview](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/ML-Agents-Overview.md).
- [Creating your own scenes](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Create-New.md)
example in [ML-Agents - Example Environments](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md)
- Information about the various training options - [Training ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-ML-Agents.md)
example in [ML-Agents - Example Environments](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md)
- Information about the various training options - [Training ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Training-ML-Agents.md)

32
docs/localized/RU/docs/Установка.md


The ML-Agents Toolkit consists of several components:
- A dedicated Unity package ([`com.unity.ml-agents`](https://github.com/Unity-Technologies/ml-agents/tree/master/com.unity.ml-agents))
- A dedicated Unity package ([`com.unity.ml-agents`](https://github.com/Unity-Technologies/ml-agents/tree/main/com.unity.ml-agents))
- ([`mlagents`](https://github.com/Unity-Technologies/ml-agents/tree/master/ml-agents)) - a package with the machine learning algorithms,
- ([`mlagents`](https://github.com/Unity-Technologies/ml-agents/tree/main/ml-agents)) - a package with the machine learning algorithms,
- ([`mlagents_envs`](https://github.com/Unity-Technologies/ml-agents/tree/master/ml-agents-envs) contains Python
- ([`mlagents_envs`](https://github.com/Unity-Technologies/ml-agents/tree/main/ml-agents-envs) contains Python
- ([`gym_unity`](https://github.com/Unity-Technologies/ml-agents/tree/master/gym-unity)) - lets you wrap your scene
- ([`gym_unity`](https://github.com/Unity-Technologies/ml-agents/tree/main/gym-unity)) - lets you wrap your scene
- A Unity [Project](https://github.com/Unity-Technologies/ml-agents/tree/master/Project),
containing [example scenes](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Examples.md),
- A Unity [Project](https://github.com/Unity-Technologies/ml-agents/tree/main/Project),
containing [example scenes](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md),
that showcase the various ML-Agents features for illustration.
In short, to install and use ML-Agents you need to:

You will need to clone the repository if you plan to make changes
or add something to ML-Agents for your own purposes. If you plan to make
these changes publicly available, please clone the master branch
these changes publicly available, please clone the main branch
Then see the [guide on contributing changes](https://github.com/Unity-Technologies/ml-agents/blob/master/com.unity.ml-agents/CONTRIBUTING.md).
Then see the [guide on contributing changes](https://github.com/Unity-Technologies/ml-agents/blob/main/com.unity.ml-agents/CONTRIBUTING.md).
### Installing the `com.unity.ml-agents` Unity package
The Unity ML-Agents C# SDK is a Unity package. You can install it directly from the Package Manager,

In Unity 2019.3 it is at the top left.
<p align="center">
<img src="https://raw.githubusercontent.com/Unity-Technologies/ml-agents/master/docs/images/unity_package_manager_window.png"
<img src="https://raw.githubusercontent.com/Unity-Technologies/ml-agents/main/docs/images/unity_package_manager_window.png"
<img src="https://raw.githubusercontent.com/Unity-Technologies/ml-agents/master/docs/images/unity_package_json.png"
<img src="https://raw.githubusercontent.com/Unity-Technologies/ml-agents/main/docs/images/unity_package_json.png"
alt="package.json"
height="300"
border="10" />

Virtual Environments let you manage previously installed versions so
that one project gets one set of dependencies and another project a different one, on each
of the operating systems - Mac / Windows / Linux.
See the [Virtual Environments](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Using-Virtual-Environment.md) guide.
See the [Virtual Environments](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Using-Virtual-Environment.md) guide.
To install the `mlagents` Python package, activate your virtual environment
and run the following from the command line:

the `mlagents-learn --help` command, after
running which you will see the set of available `mlagents-learn` options.
By installing mlagents you also install everything it is built on -
see the [setup.py file](https://github.com/Unity-Technologies/ml-agents/blob/master/ml-agents/setup.py).
Including [TensorFlow](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Background-TensorFlow.md)
see the [setup.py file](https://github.com/Unity-Technologies/ml-agents/blob/main/ml-agents/setup.py).
Including [TensorFlow](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Background-TensorFlow.md)
(requires CPU support w/ AVX).
#### Advanced: local installation for development

### Next steps
The ["Getting Started"](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/localized/RU/docs/Начало-работы.md) guide
The ["Getting Started"](https://github.com/Unity-Technologies/ml-agents/tree/main/docs/localized/RU/docs/Начало-работы.md) guide
contains a series of short tutorials on setting up ML-Agents inside Unity
and running a pre-trained model, as well as lessons on creating and extending scenes
where your agent will be trained.

If you run into difficulties with ML-Agents, please consult the [FAQ](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/FAQ.md)
and the [Limitations](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Limitations.md) page.
If you run into difficulties with ML-Agents, please consult the [FAQ](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/FAQ.md)
and the [Limitations](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Limitations.md) page.
If that does not solve your problem, open an issue [here](https://github.com/Unity-Technologies/ml-agents/issues),
providing your operating system, Python version, and the error message (if any).

11
ml-agents-envs/mlagents_envs/base_env.py


from mlagents_envs.exception import UnityActionException
AgentId = int
GroupId = int
BehaviorName = str

obs: List[np.ndarray]
reward: float
group_reward: float
agent_id: AgentId
action_mask: Optional[List[np.ndarray]]
group_id: int

def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward):
self.obs: List[np.ndarray] = obs
self.reward: np.ndarray = reward
self.group_reward: np.ndarray = group_reward
self.agent_id: np.ndarray = agent_id
self.action_mask: Optional[List[np.ndarray]] = action_mask
self.group_id: np.ndarray = group_id

return DecisionStep(
obs=agent_obs,
reward=self.reward[agent_index],
group_reward=self.group_reward[agent_index],
agent_id=agent_id,
action_mask=agent_mask,
group_id=group_id,

return DecisionSteps(
obs=obs,
reward=np.zeros(0, dtype=np.float32),
group_reward=np.zeros(0, dtype=np.float32),
agent_id=np.zeros(0, dtype=np.int32),
action_mask=None,
group_id=np.zeros(0, dtype=np.int32),

obs: List[np.ndarray]
reward: float
group_reward: float
group_id: int
group_id: GroupId
group_reward: float

def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward):
self.obs: List[np.ndarray] = obs
self.reward: np.ndarray = reward
self.group_reward: np.ndarray = group_reward
self.interrupted: np.ndarray = interrupted
self.agent_id: np.ndarray = agent_id
self.group_id: np.ndarray = group_id

return TerminalStep(
obs=agent_obs,
reward=self.reward[agent_index],
group_reward=self.group_reward[agent_index],
interrupted=self.interrupted[agent_index],
agent_id=agent_id,
group_id=group_id,

return TerminalSteps(
obs=obs,
reward=np.zeros(0, dtype=np.float32),
group_reward=np.zeros(0, dtype=np.float32),
interrupted=np.zeros(0, dtype=np.bool),
agent_id=np.zeros(0, dtype=np.int32),
group_id=np.zeros(0, dtype=np.int32),

1
ml-agents-envs/mlagents_envs/rpc_utils.py


decision_rewards = np.array(
[agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32
)
terminal_rewards = np.array(
[agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32
)

2
ml-agents-envs/mlagents_envs/tests/test_envs.py


)
env = UnityEnvironment(file_name=file_name, worker_id=0, base_port=base_port)
assert expected == env._port
env.close()
@mock.patch("mlagents_envs.env_utils.launch_executable")

args = env._executable_args()
log_file_index = args.index("-logFile")
assert args[log_file_index + 1] == "./some-log-folder-path/Player-0.log"
env.close()
@mock.patch("mlagents_envs.env_utils.launch_executable")

11
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


import io
import numpy as np
import pytest
from typing import List, Tuple
from typing import List, Tuple, Any
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents_envs.communicator_objects.observation_pb2 import (

reward = decision_steps.reward[agent_id_index]
done = False
max_step_reached = False
agent_mask = None
agent_mask: Any = None
agent_mask = [] # type: ignore
agent_mask = []
agent_mask = agent_mask.astype(np.bool).tolist()
observations: List[ObservationProto] = []
for all_observations_of_type in decision_steps.obs:
observation = all_observations_of_type[agent_id_index]

reward=reward,
done=done,
id=agent_id,
max_step_reached=max_step_reached,
max_step_reached=bool(max_step_reached),
action_mask=agent_mask,
observations=observations,
)

reward=reward,
done=done,
id=agent_id,
max_step_reached=max_step_reached,
max_step_reached=bool(max_step_reached),
action_mask=None,
observations=final_observations,
)

12
ml-agents/mlagents/trainers/action_info.py


class ActionInfo(NamedTuple):
"""
A NamedTuple containing actions and related quantities to the policy forward
pass. Additionally contains the agent ids in the corresponding DecisionStep
:param action: The action output of the policy
:param env_action: The possibly clipped action to be executed in the environment
:param outputs: Dict of all quantities associated with the policy forward pass
:param agent_ids: List of int agent ids in DecisionStep
"""
value: Any
return ActionInfo([], [], [], {}, [])
return ActionInfo([], [], {}, [])

259
ml-agents/mlagents/trainers/agent_processor.py


StatsAggregationMethod,
EnvironmentStats,
)
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.trajectory import AgentStatus, Trajectory, AgentExperience
from mlagents.trainers.behavior_id_utils import get_global_agent_id, get_global_group_id
from mlagents.trainers.behavior_id_utils import (
get_global_agent_id,
get_global_group_id,
GlobalAgentId,
GlobalGroupId,
)
T = TypeVar("T")

:param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
:param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
"""
self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_step_result: Dict[str, Tuple[DecisionStep, int]] = {}
# current_group_obs is used to collect the last seen obs of all the agents in the same group,
# and assemble the next_collab_obs.
self.current_group_obs: Dict[str, Dict[str, List[np.ndarray]]] = defaultdict(
lambda: defaultdict(list)
)
# last_group_obs is used to collect the last seen obs of all the agents in the same group,
# and assemble the collab_obs.
self.last_group_obs: Dict[str, Dict[str, List[np.ndarray]]] = defaultdict(
lambda: defaultdict(list)
)
self._experience_buffers: Dict[
GlobalAgentId, List[AgentExperience]
] = defaultdict(list)
self._last_step_result: Dict[GlobalAgentId, Tuple[DecisionStep, int]] = {}
# current_group_obs is used to collect the current (i.e. the most recently seen)
# obs of all the agents in the same group, and assemble the group obs.
# It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to observation.
self._current_group_obs: Dict[
GlobalGroupId, Dict[GlobalAgentId, List[np.ndarray]]
] = defaultdict(lambda: defaultdict(list))
# group_status is used to collect the current, most recently seen
# group status of all the agents in the same group, and assemble the group's status.
# It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to AgentStatus.
self._group_status: Dict[
GlobalGroupId, Dict[GlobalAgentId, AgentStatus]
] = defaultdict(lambda: defaultdict(None))
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
self._last_take_action_outputs: Dict[GlobalAgentId, ActionInfoOutputs] = {}
self._episode_steps: Counter = Counter()
self._episode_rewards: Dict[GlobalAgentId, float] = defaultdict(float)
self._stats_reporter = stats_reporter
self._max_trajectory_length = max_trajectory_length
self._trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self._behavior_id = behavior_id
self.episode_steps: Counter = Counter()
self.episode_rewards: Dict[str, float] = defaultdict(float)
self.stats_reporter = stats_reporter
self.max_trajectory_length = max_trajectory_length
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.behavior_id = behavior_id
def add_experiences(
self,

take_action_outputs = previous_action.outputs
if take_action_outputs:
for _entropy in take_action_outputs["entropy"]:
self.stats_reporter.add_stat("Policy/Entropy", _entropy)
self._stats_reporter.add_stat("Policy/Entropy", _entropy)
# Make unique agent_ids that are global across workers
action_global_agent_ids = [

if global_id in self.last_step_result: # Don't store if agent just reset
self.last_take_action_outputs[global_id] = take_action_outputs
if global_id in self._last_step_result: # Don't store if agent just reset
self._last_take_action_outputs[global_id] = take_action_outputs
# Iterate over all the terminal steps, first gather all the teammate obs
# and then create the AgentExperiences/Trajectories
# Iterate over all the terminal steps, first gather all the group obs
# and then create the AgentExperiences/Trajectories. _add_group_status_and_obs
# stores group statuses in a common data structure, self._group_status
local_id = terminal_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._gather_teammate_obs(terminal_step, global_id, worker_id)
self._add_group_status_and_obs(terminal_step, worker_id)
terminal_step,
global_id,
worker_id,
terminal_steps.agent_id_to_index[local_id],
terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id]
self._clear_teammate_obs(global_id)
self._clear_group_status_and_obs(global_id)
# Clean the last experience dictionary for terminal steps
for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
# Iterate over all the decision steps, first gather all the teammate obs
# and then create the trajectories
# Iterate over all the decision steps, first gather all the group obs
# and then create the trajectories. _add_group_status_and_obs
# stores group statuses in a common data structure, self._group_status
local_id = ongoing_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._gather_teammate_obs(ongoing_step, global_id, worker_id)
self._add_group_status_and_obs(ongoing_step, worker_id)
global_id = get_global_agent_id(worker_id, local_id)
ongoing_step,
global_id,
worker_id,
decision_steps.agent_id_to_index[local_id],
ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id]
if _gid in self.last_step_result:
if _gid in self._last_step_result:
def _gather_teammate_obs(
self, step: Union[TerminalStep, DecisionStep], global_id: str, worker_id: int
def _add_group_status_and_obs(
self, step: Union[TerminalStep, DecisionStep], worker_id: int
stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
if stored_decision_step is not None:
"""
Takes a TerminalStep or DecisionStep and adds the information in it
to self.group_status. This information can then be retrieved
when constructing trajectories to get the status of group mates. Also stores the current
observation into current_group_obs, to be used to get the next group observations
for bootstrapping.
:param step: TerminalStep or DecisionStep
:param worker_id: Worker ID of this particular environment. Used to generate a
global group id.
"""
global_agent_id = get_global_agent_id(worker_id, step.agent_id)
stored_decision_step, idx = self._last_step_result.get(
global_agent_id, (None, None)
)
stored_take_action_outputs = self._last_take_action_outputs.get(
global_agent_id, None
)
if stored_decision_step is not None and stored_take_action_outputs is not None:
# 0, the default group_id, means that the agent doesn't belong to an agent group.
# If 0, don't add any groupmate information.
self.last_group_obs[global_group_id][
global_id
] = stored_decision_step.obs
self.current_group_obs[global_group_id][global_id] = step.obs
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
group_status = AgentStatus(
obs=stored_decision_step.obs,
reward=step.reward,
action=action_tuple,
done=isinstance(step, TerminalStep),
)
self._group_status[global_group_id][global_agent_id] = group_status
self._current_group_obs[global_group_id][global_agent_id] = step.obs
def _clear_group_status_and_obs(self, global_id: GlobalAgentId) -> None:
"""
Clears an agent from self._group_status and self._current_group_obs.
"""
self._delete_in_nested_dict(self._current_group_obs, global_id)
self._delete_in_nested_dict(self._group_status, global_id)
def _clear_teammate_obs(self, global_id: str) -> None:
for _group_id in list(self.current_group_obs.keys()):
_team_group = self.current_group_obs[_group_id]
self._safe_delete(_team_group, global_id)
if not _team_group: # if dict is empty
self._safe_delete(self.current_group_obs, _group_id)
for _group_id in list(self.last_group_obs.keys()):
_team_group = self.last_group_obs[_group_id]
self._safe_delete(_team_group, global_id)
def _delete_in_nested_dict(self, nested_dict: Dict[str, Any], key: str) -> None:
for _manager_id in list(nested_dict.keys()):
_team_group = nested_dict[_manager_id]
self._safe_delete(_team_group, key)
self._safe_delete(self.last_group_obs, _group_id)
self._safe_delete(nested_dict, _manager_id)
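
The cleanup above boils down to deleting one agent's key from every inner dictionary and dropping any inner dictionary that becomes empty. A stand-alone sketch of that pattern (function and variable names here are illustrative, not the trainer's API):

from collections import defaultdict
from typing import Any, Dict

def delete_in_nested_dict(nested: Dict[str, Dict[str, Any]], key: str) -> None:
    # Remove `key` from every inner dict; drop inner dicts that end up empty.
    for outer_key in list(nested.keys()):
        inner = nested[outer_key]
        inner.pop(key, None)
        if not inner:
            nested.pop(outer_key, None)

group_obs: Dict[str, Dict[str, Any]] = defaultdict(dict)
group_obs["group_0-1"]["agent_0-1"] = [1.0]
group_obs["group_0-1"]["agent_0-2"] = [2.0]
delete_in_nested_dict(group_obs, "agent_0-1")
assert "agent_0-1" not in group_obs["group_0-1"]
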
self,
step: Union[TerminalStep, DecisionStep],
global_id: str,
worker_id: int,
index: int,
self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
global_agent_id = get_global_agent_id(worker_id, step.agent_id)
global_group_id = get_global_group_id(worker_id, step.group_id)
stored_decision_step, idx = self._last_step_result.get(
global_agent_id, (None, None)
)
stored_take_action_outputs = self._last_take_action_outputs.get(
global_agent_id, None
)
self.last_step_result[global_id] = (step, index)
self._last_step_result[global_agent_id] = (step, index)
memory = self.policy.retrieve_memories([global_id])[0, :]
memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
else:
memory = None
done = terminated # Since this is an ongoing step

discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]
collab_obs = []
global_group_id = get_global_group_id(worker_id, step.group_id)
for _id, _obs in self.last_group_obs[global_group_id].items():
if _id != global_id:
collab_obs.append(_obs)
group_statuses = []
for _id, _mate_status in self._group_status[global_group_id].items():
if _id != global_agent_id:
group_statuses.append(_mate_status)
collab_obs=collab_obs,
reward=step.reward,
done=done,
action=action_tuple,

interrupted=interrupted,
memory=memory,
group_status=group_statuses,
group_reward=step.group_reward,
self.experience_buffers[global_id].append(experience)
self.episode_rewards[global_id] += step.reward
self._experience_buffers[global_agent_id].append(experience)
self._episode_rewards[global_agent_id] += step.reward
self.episode_steps[global_id] += 1
self._episode_steps[global_agent_id] += 1
len(self.experience_buffers[global_id]) >= self.max_trajectory_length
len(self._experience_buffers[global_agent_id])
>= self._max_trajectory_length
next_collab_obs = []
global_group_id = get_global_group_id(worker_id, step.group_id)
for _id, _exp in self.current_group_obs[global_group_id].items():
if _id != global_id:
next_collab_obs.append(_exp)
next_group_obs = []
for _id, _obs in self._current_group_obs[global_group_id].items():
if _id != global_agent_id:
next_group_obs.append(_obs)
steps=self.experience_buffers[global_id],
agent_id=global_id,
steps=self._experience_buffers[global_agent_id],
agent_id=global_agent_id,
next_collab_obs=next_collab_obs,
behavior_id=self.behavior_id,
next_group_obs=next_group_obs,
behavior_id=self._behavior_id,
for traj_queue in self.trajectory_queues:
for traj_queue in self._trajectory_queues:
self.experience_buffers[global_id] = []
self._experience_buffers[global_agent_id] = []
self.stats_reporter.add_stat(
"Environment/Episode Length", self.episode_steps.get(global_id, 0)
self._stats_reporter.add_stat(
"Environment/Episode Length",
self._episode_steps.get(global_agent_id, 0),
self._clean_agent_data(global_id)
self._clean_agent_data(global_agent_id)
def _clean_agent_data(self, global_id: str) -> None:
def _clean_agent_data(self, global_id: GlobalAgentId) -> None:
self._safe_delete(self.experience_buffers, global_id)
self._safe_delete(self.last_take_action_outputs, global_id)
self._safe_delete(self.last_step_result, global_id)
self._safe_delete(self.episode_steps, global_id)
self._safe_delete(self.episode_rewards, global_id)
self._safe_delete(self._experience_buffers, global_id)
self._safe_delete(self._last_take_action_outputs, global_id)
self._safe_delete(self._last_step_result, global_id)
self._safe_delete(self._episode_steps, global_id)
self._safe_delete(self._episode_rewards, global_id)
self.policy.remove_previous_action([global_id])
self.policy.remove_memories([global_id])

assembles a Trajectory
:param trajectory_queue: Trajectory queue to publish to.
"""
self.trajectory_queues.append(trajectory_queue)
self._trajectory_queues.append(trajectory_queue)
def end_episode(self) -> None:
"""

all_gids = list(self.experience_buffers.keys()) # Need to make copy
all_gids = list(self._experience_buffers.keys()) # Need to make copy
for _gid in all_gids:
self._clean_agent_data(_gid)

super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length)
trajectory_queue_len = 20 if threaded else 0
self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
self.behavior_id, maxlen=trajectory_queue_len
self._behavior_id, maxlen=trajectory_queue_len
self.behavior_id, maxlen=0
self._behavior_id, maxlen=0
)
self.publish_trajectory_queue(self.trajectory_queue)

for stat_name, value_list in env_stats.items():
for val, agg_type in value_list:
if agg_type == StatsAggregationMethod.AVERAGE:
self.stats_reporter.add_stat(stat_name, val, agg_type)
self._stats_reporter.add_stat(stat_name, val, agg_type)
self.stats_reporter.add_stat(stat_name, val, agg_type)
self._stats_reporter.add_stat(stat_name, val, agg_type)
self.stats_reporter.set_stat(stat_name, val)
self._stats_reporter.set_stat(stat_name, val)

49
ml-agents/mlagents/trainers/behavior_id_utils.py


from typing import NamedTuple, Optional
from typing import NamedTuple
from mlagents_envs.base_env import AgentId, GroupId
GlobalGroupId = str
GlobalAgentId = str
class BehaviorIdentifiers(NamedTuple):

behavior_id: str
brain_name: str
team_id: int
group_id: int
Parses a name_behavior_id of the form name?team=0&group=0
Parses a name_behavior_id of the form name?team=0
group_id: int = 0
if "group" in ids:
group_id = int(ids["group"][0])
behavior_id=name_behavior_id,
brain_name=name,
team_id=team_id,
group_id=group_id,
behavior_id=name_behavior_id, brain_name=name, team_id=team_id
def create_name_behavior_id(
name: str, team_id: Optional[int] = None, group_id: Optional[int] = None
) -> str:
def create_name_behavior_id(name: str, team_id: int) -> str:
Reconstructs fully qualified behavior name from name, team_id, and group_id
:param name: brain name
:param team_id: team ID
:param group_id: group ID
:return: name_behavior_id
"""
final_name = name
if team_id is not None:
final_name += f"?team={team_id}"
if group_id is not None:
final_name += f"&group={group_id}"
return final_name
Reconstructs fully qualified behavior name from name and team_id
:param name: brain name
:param team_id: team ID
:return: name_behavior_id
"""
return name + "?team=" + str(team_id)
def get_global_agent_id(worker_id: int, agent_id: int) -> str:
def get_global_agent_id(worker_id: int, agent_id: AgentId) -> GlobalAgentId:
return f"${worker_id}-{agent_id}"
return f"agent_{worker_id}-{agent_id}"
def get_global_group_id(worker_id: int, group_id: int) -> str:
def get_global_group_id(worker_id: int, group_id: GroupId) -> GlobalGroupId:
Create an agent id that is unique across environment workers using the worker_id.
Create a group id that is unique across environment workers using the worker_id.
return f"#{worker_id}-{group_id}"
return f"group_{worker_id}-{group_id}"

134
ml-agents/mlagents/trainers/buffer.py


from mlagents_envs.exception import UnityException
# Elements in the buffer can be np.ndarray, or in the case of teammate obs, actions, rewards,
# a List of np.ndarray. This is done so that we don't have duplicated np.ndarrays, only references.
BufferEntry = Union[np.ndarray, List[np.ndarray]]
class BufferException(UnityException):
"""

class BufferKey(enum.Enum):
ACTION_MASK = "action_mask"
CONTINUOUS_ACTION = "continuous_action"
NEXT_CONT_ACTION = "next_continuous_action"
NEXT_DISC_ACTION = "next_discrete_action"
CRITIC_MEMORY = "critic_memory"
GROUP_DONES = "group_dones"
GROUPMATE_REWARDS = "groupmate_reward"
GROUP_REWARD = "group_reward"
GROUP_CONTINUOUS_ACTION = "group_continuous_action"
GROUP_DISCRETE_ACTION = "group_discrete_action"
GROUP_NEXT_CONT_ACTION = "group_next_cont_action"
GROUP_NEXT_DISC_ACTION = "group_next_disc_action"
GROUP_OBSERVATION = "group_obs"
NEXT_GROUP_OBSERVATION = "next_group_obs"
class RewardSignalKeyPrefix(enum.Enum):

class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
AgentBufferField is a list of numpy arrays, or List[np.ndarray] for group entries.
When an agent collects a field, you can add it to its AgentBufferField with the append method.
def __init__(self):
def __init__(self, *args, **kwargs):
super().__init__()
super().__init__(*args, **kwargs)
def __str__(self):
return str(np.array(self).shape)
def __str__(self) -> str:
return f"AgentBufferField: {super().__str__()}"
def __getitem__(self, index):
return_data = super().__getitem__(index)
if isinstance(return_data, list):
return AgentBufferField(return_data)
else:
return return_data
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
"""

super().append(element)
self.padding_value = padding_value
def extend(self, data: np.ndarray) -> None:
def set(self, data: List[BufferEntry]) -> None:
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
Sets the list of BufferEntry to the input data
:param data: The BufferEntry list to be set.
self += list(np.array(data, dtype=np.float32))
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = list(np.array(data, dtype=dtype))
self[:] = data
def get_batch(
self,

) -> np.ndarray:
) -> List[BufferEntry]:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array

)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:], dtype=np.float32
)
return [padding] * (training_length - leftover) + self[:]
return np.array(
self[len(self) - batch_size * training_length :], dtype=np.float32
)
return self[len(self) - batch_size * training_length :]
else:
# The sequences will have overlapping elements
if batch_size is None:

tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
return tmp_list
def reset_field(self) -> None:
"""

def padded_to_batch(
self, pad_value: np.float = 0, dtype: np.dtype = np.float32
) -> Union[np.ndarray, List[np.ndarray]]:
"""
Converts this AgentBufferField (which is a List[BufferEntry]) into a numpy array
with first dimension equal to the length of this AgentBufferField. If this AgentBufferField
contains a List[List[BufferEntry]] (i.e., in the case of group observations), return a List
containing numpy arrays or tensors, of length equal to the maximum length of an entry.
For entries with less than that length, the array will be padded with pad_value.
:param pad_value: Value to pad List AgentBufferFields, when there are less than the maximum
number of agents present.
:param dtype: Dtype of output numpy array.
:return: Numpy array or List of numpy arrays representing this AgentBufferField, where the first
dimension is equal to the length of the AgentBufferField.
"""
if len(self) > 0 and not isinstance(self[0], list):
return np.asanyarray(self, dtype=dtype)
shape = None
for _entry in self:
# _entry could be an empty list if there are no group agents in this
# step. Find the first non-empty list and use that shape.
if _entry:
shape = _entry[0].shape
break
# If there were no groupmate agents in the entire batch, return an empty List.
if shape is None:
return []
# Convert to numpy array while padding with 0's
new_list = list(
map(
lambda x: np.asanyarray(x, dtype=dtype),
itertools.zip_longest(*self, fillvalue=np.full(shape, pad_value)),
)
)
return new_list
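
The padding above relies on itertools.zip_longest to line up ragged per-step group entries. A minimal stand-alone sketch of the same idea, with a hypothetical helper name and toy data:

import itertools
import numpy as np

def padded_to_batch(entries, pad_value=0.0, dtype=np.float32):
    # entries: one list per step, each containing per-agent arrays; lists may be ragged.
    shape = next((e[0].shape for e in entries if e), None)
    if shape is None:
        return []  # no groupmates anywhere in the batch
    return [
        np.asanyarray(col, dtype=dtype)
        for col in itertools.zip_longest(*entries, fillvalue=np.full(shape, pad_value))
    ]

steps = [[np.array([1.0]), np.array([2.0])], [np.array([1.0])]]  # second step lost an agent
padded = padded_to_batch(steps, pad_value=3.0)
assert np.array_equal(padded[0], np.array([[1.0], [1.0]]))
assert np.array_equal(padded[1], np.array([[2.0], [3.0]]))  # missing agent padded with 3.0
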
class AgentBuffer(MutableMapping):

return len(next(iter(self.values())))
else:
return 0
@staticmethod
def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
"""
Converts a List of obs (an obs itself consisting of a List of np.ndarray) to
a List of np.ndarray, with the observations batched along the first dimension.
"""
# Transpose and convert List of Lists
new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
return new_list
@staticmethod
def obs_list_list_to_obs_batch(
obs_list_list: List[List[List[np.ndarray]]]
) -> List[List[np.ndarray]]:
"""
Convert a List of Lists of obs, where one of the dimensions is time and the other is the number of agents
(e.g. in the case of a variable number of critic observations), to a List of obs, where time is in the
batch dimension of the obs and the List is over the variable number of agents.
"""
new_list = list(
map(
lambda x: AgentBuffer.obs_list_to_obs_batch(list(x)),
zip(*obs_list_list),
)
)
return new_list
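
Both helpers are essentially a transpose over zip(*...). A small runnable sketch of the single-level case, using toy observations:

import numpy as np

def obs_list_to_obs_batch(obs_list):
    # Transpose: from per-step lists of sensor arrays to per-sensor batched arrays.
    return [np.asanyarray(list(per_sensor)) for per_sensor in zip(*obs_list)]

# Two steps, each with two "sensors" (a 3-vector and a 2-vector).
step_0 = [np.zeros(3, dtype=np.float32), np.zeros(2, dtype=np.float32)]
step_1 = [np.ones(3, dtype=np.float32), np.ones(2, dtype=np.float32)]
batched = obs_list_to_obs_batch([step_0, step_1])
assert batched[0].shape == (2, 3) and batched[1].shape == (2, 2)
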

14
ml-agents/mlagents/trainers/ghost/trainer.py


:param parsed_behavior_id: Behavior ID that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
name_behavior_id = create_name_behavior_id(
parsed_behavior_id.brain_name, team_id=parsed_behavior_id.team_id
)
# Add policy only based on the team id, not the group id
self._name_to_parsed_behavior_id[
parsed_behavior_id.behavior_id
] = parsed_behavior_id
name_behavior_id = parsed_behavior_id.behavior_id
self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
self.policies[name_behavior_id] = policy
def get_policy(self, name_behavior_id: str) -> Policy:

:return: Policy associated with name_behavior_id
"""
# Get policy based on team id, but not group id
parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
name_behavior_id = create_name_behavior_id(
parsed_behavior_id.brain_name, team_id=parsed_behavior_id.team_id
)
return self.policies[name_behavior_id]
def _save_snapshot(self) -> None:

156
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


from typing import Dict, Optional, Tuple, List
from mlagents.torch_utils import torch
import numpy as np
import math
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.buffer import AgentBuffer, AgentBufferField
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

self.global_step = torch.tensor(0)
self.bc_module: Optional[BCModule] = None
self.create_reward_signals(trainer_settings.reward_signals)
self.critic_memory_dict: Dict[str, torch.Tensor] = {}
if trainer_settings.behavioral_cloning is not None:
self.bc_module = BCModule(
self.policy,

default_num_epoch=3,
)
@property
def critic(self):
raise NotImplementedError
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass

reward_signal, self.policy.behavior_spec, settings
)
def _evaluate_by_sequence(
self, tensor_obs: List[torch.Tensor], initial_memory: np.ndarray
) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]:
"""
Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
intermediate memories for the critic.
:param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
observations for this trajectory.
:param initial_memory: The memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
what is returned as the output of a MemoryModule.
:return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial
memories to be used during value function update, and the final memory at the end of the trajectory.
"""
num_experiences = tensor_obs[0].shape[0]
all_next_memories = AgentBufferField()
# In the buffer, the 1st sequence is the one that is padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length
# Compute values for the potentially truncated initial sequence
seq_obs = []
first_seq_len = self.policy.sequence_length
for _obs in tensor_obs:
if leftover > 0:
first_seq_len = leftover
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze()))
init_values, _mem = self.critic.critic_pass(
seq_obs, initial_memory, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}
# Evaluate the remaining sequences, carrying over _mem after each
# sequence
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
):
seq_obs = []
for _ in range(self.policy.sequence_length):
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
for _obs in tensor_obs:
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
seq_obs.append(_obs[start:end])
values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=self.policy.sequence_length
)
for signal_name, _val in values.items():
all_values[signal_name].append(_val)
# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
for signal_name, value_list in all_values.items()
}
next_mem = _mem
return all_value_tensors, all_next_memories, next_mem
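
The index arithmetic above (a short leftover sequence first, then full-length sequences offset by seq_len - leftover) is easy to verify on its own. A sketch for the leftover > 0 case, with an illustrative helper name:

import math

def sequence_slices(num_experiences: int, seq_len: int):
    # Reproduces the slicing used in _evaluate_by_sequence for the padded-first-sequence
    # case (leftover > 0): a short initial sequence, then full-length sequences.
    leftover = num_experiences % seq_len
    slices = [(0, leftover if leftover > 0 else seq_len)]
    for seq_num in range(1, math.ceil(num_experiences / seq_len)):
        start = seq_num * seq_len - (seq_len - leftover)
        slices.append((start, start + seq_len))
    return slices

# A 10-step trajectory with sequence_length 3 splits as [0:1], [1:4], [4:7], [7:10].
assert sequence_slices(10, 3) == [(0, 1), (1, 4), (4, 7), (7, 10)]
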
next_critic_obs: List[List[np.ndarray]],
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
agent_id: str = "",
) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
"""
Get value estimates and memories for a trajectory, in batch form.
:param batch: An AgentBuffer that consists of a trajectory.
:param next_obs: the next observation (after the trajectory). Used for bootstrapping
if this is not a terminal trajectory.
:param done: Set true if this is a terminal trajectory.
:param agent_id: Agent ID of the agent that this trajectory belongs to.
:returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
the final value estimate as a Dict of [name, float], and optionally (if using memories)
an AgentBufferField of initial critic memories to be used during update.
"""
current_obs = ObsUtil.from_buffer(batch, n_obs)
if agent_id in self.critic_memory_dict:
memory = self.critic_memory_dict[agent_id]
else:
memory = (
torch.zeros((1, 1, self.critic.memory_size))
if self.policy.use_recurrent
else None
)
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
current_obs = [
ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
]
memory = torch.zeros([1, 1, self.policy.m_size])
critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
critic_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in critic_obs
]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
]
# Expand dimensions of next critic obs
next_critic_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
]
# If we're using LSTM, we want to get all the intermediate memories.
all_next_memories: Optional[AgentBufferField] = None
if self.policy.use_recurrent:
(
value_estimates,
all_next_memories,
next_memory,
) = self._evaluate_by_sequence(current_obs, memory)
else:
value_estimates, next_memory = self.critic.critic_pass(
current_obs, memory, sequence_length=batch.num_experiences
)
memory = torch.zeros([1, 1, self.policy.m_size])
# Store the memory for the next trajectory
self.critic_memory_dict[agent_id] = next_memory
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
current_obs,
memory,
sequence_length=batch.num_experiences,
critic_obs=critic_obs,
)
next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_obs, next_memory, sequence_length=1, critic_obs=next_critic_obs
next_value_estimate, _ = self.critic.critic_pass(
next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():

for k in next_value_estimate:
if not self.reward_signals[k].ignore_done:
next_value_estimate[k] = 0.0
return value_estimates, next_value_estimate
if agent_id in self.critic_memory_dict:
self.critic_memory_dict.pop(agent_id)
return value_estimates, next_value_estimate, all_next_memories
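
A stand-alone sketch of the critic-memory caching idea used here: reuse the memory left over from the agent's previous trajectory, store the final memory for the next one, and drop the entry when the trajectory is terminal. Class and method names are illustrative only:

import numpy as np

class CriticMemoryCacheSketch:
    # Illustrative version of critic_memory_dict handling, not the optimizer itself.
    def __init__(self, mem_size: int):
        self.mem_size = mem_size
        self.cache = {}

    def get(self, agent_id: str) -> np.ndarray:
        # Zeros for an agent we have not seen before, cached memory otherwise.
        return self.cache.get(agent_id, np.zeros((1, 1, self.mem_size), dtype=np.float32))

    def update(self, agent_id: str, next_memory: np.ndarray, done: bool) -> None:
        # Keep the final memory for the next trajectory; forget it on terminal trajectories.
        self.cache[agent_id] = next_memory
        if done:
            self.cache.pop(agent_id, None)

cache = CriticMemoryCacheSketch(mem_size=4)
mem = cache.get("agent_0-1")                      # zeros for a fresh agent
cache.update("agent_0-1", mem + 1.0, done=False)  # carried over to the next trajectory
assert cache.get("agent_0-1").sum() == 4.0
cache.update("agent_0-1", mem, done=True)         # terminal: memory is dropped
assert "agent_0-1" not in cache.cache
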

28
ml-agents/mlagents/trainers/policy/policy.py


from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.behavior_id_utils import GlobalAgentId
class UnityPolicyException(UnityException):

self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
self.previous_action_dict: Dict[str, np.ndarray] = {}
self.previous_memory_dict: Dict[str, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
self, agent_ids: List[GlobalAgentId], memory_matrix: Optional[np.ndarray]
# Pass old memories into previous_memory_dict
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.previous_memory_dict[agent_id] = self.memory_dict[agent_id]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
def retrieve_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:

def remove_memories(self, agent_ids):
def retrieve_previous_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_memory_dict:
memory_matrix[index, :] = self.previous_memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids: List[GlobalAgentId]) -> None:
if agent_id in self.previous_memory_dict:
self.previous_memory_dict.pop(agent_id)
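
The previous-memory bookkeeping follows a simple pattern: saving a new memory shifts the current one into previous_memory_dict, and retrieval builds a zero-filled matrix for agents without an entry. A minimal sketch (the class name is illustrative):

import numpy as np

class MemoryStoreSketch:
    # Illustrative stand-in for the policy's memory bookkeeping.
    def __init__(self, m_size: int):
        self.m_size = m_size
        self.memory_dict = {}
        self.previous_memory_dict = {}

    def save_memories(self, agent_ids, memory_matrix):
        # Saving new memories pushes the current ones into previous_memory_dict.
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.memory_dict:
                self.previous_memory_dict[agent_id] = self.memory_dict[agent_id]
            self.memory_dict[agent_id] = memory_matrix[index, :]

    def retrieve_previous_memories(self, agent_ids):
        # Unknown agents get zero rows.
        matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.previous_memory_dict:
                matrix[index, :] = self.previous_memory_dict[agent_id]
        return matrix

store = MemoryStoreSketch(m_size=2)
store.save_memories(["agent_0-1"], np.array([[1.0, 1.0]], dtype=np.float32))
store.save_memories(["agent_0-1"], np.array([[2.0, 2.0]], dtype=np.float32))
assert np.array_equal(store.retrieve_previous_memories(["agent_0-1"]), np.array([[1.0, 1.0]], dtype=np.float32))
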
def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
"""

)
def save_previous_action(
self, agent_ids: List[str], action_tuple: ActionTuple
self, agent_ids: List[GlobalAgentId], action_tuple: ActionTuple
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
def retrieve_previous_action(self, agent_ids: List[GlobalAgentId]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:

def remove_previous_action(self, agent_ids):
def remove_previous_action(self, agent_ids: List[GlobalAgentId]) -> None:
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)

67
ml-agents/mlagents/trainers/policy/torch_policy.py


from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.networks import SimpleActor, SharedActorCritic, GlobalSteps
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.buffer import AgentBuffer

) # could be much simpler if TorchPolicy is nn.Module
self.grads = None
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
ac_class = SeparateActorCritic
self.actor = SimpleActor(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
self.shared_critic = False
ac_class = SharedActorCritic
self.actor_critic = ac_class(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [
key.value for key, _ in reward_signal_configs.items()
]
self.actor = SharedActorCritic(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
self.shared_critic = True
self.m_size = self.actor_critic.memory_size
self.m_size = self.actor.memory_size
self.actor_critic.to(default_device())
self.actor.to(default_device())
self._clip_action = not tanh_squash
@property

"""
if self.normalize:
self.actor_critic.update_normalization(buffer)
self.actor.update_normalization(buffer)
@timed
def sample_actions(

memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param obs: List of observations.

:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
"""
actions, log_probs, entropies, memories = self.actor_critic.get_action_stats(
actions, log_probs, entropies, memories = self.actor.get_action_and_stats(
obs, masks, memories, seq_len
)
return (actions, log_probs, entropies, memories)

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
obs, actions, masks, memories, seq_len, critic_obs
) -> Tuple[ActionLogProbs, torch.Tensor]:
log_probs, entropies = self.actor.get_stats(
obs, actions, masks, memories, seq_len
return log_probs, entropies, value_heads
return log_probs, entropies
@timed
def evaluate(

memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
run_out = {}
with torch.no_grad():
action, log_probs, entropy, memories = self.sample_actions(

return ActionInfo(
action=run_out.get("action"),
env_action=run_out.get("env_action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
)

return self.get_current_step()
def load_weights(self, values: List[np.ndarray]) -> None:
self.actor_critic.load_state_dict(values)
self.actor.load_state_dict(values)
return copy.deepcopy(self.actor_critic.state_dict())
return copy.deepcopy(self.actor.state_dict())
return {"Policy": self.actor_critic, "global_step": self.global_step}
return {"Policy": self.actor, "global_step": self.global_step}

42
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from typing import Dict, cast
from mlagents.torch_utils import torch
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil

from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.networks import ValueNetwork
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

# Create the graph here to give more granular control of the TF graph to the Optimizer.
super().__init__(policy, trainer_settings)
params = list(self.policy.actor_critic.parameters())
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
if policy.shared_critic:
self._critic = policy.actor
else:
self._critic = ValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
)
self._critic.to(default_device())
params = list(self.policy.actor.parameters()) + list(self._critic.parameters())
self.hyperparameters: PPOSettings = cast(
PPOSettings, trainer_settings.hyperparameters
)

self.stream_names = list(self.reward_signals.keys())
@property
def critic(self):
return self._critic
def ppo_value_loss(
self,
values: Dict[str, torch.Tensor],

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = self.policy.evaluate_actions(
# Get value memories
value_memories = [
ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
for i in range(
0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
)
]
if len(value_memories) > 0:
value_memories = torch.stack(value_memories).unsqueeze(0)
log_probs, entropy = self.policy.evaluate_actions(
critic_obs=critic_obs,
)
values, _ = self.critic.critic_pass(
current_obs,
memories=value_memories,
sequence_length=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
log_probs = log_probs.flatten()

7
ml-agents/mlagents/trainers/ppo/trainer.py


self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
value_estimates, value_next, value_memories = self.optimizer.get_trajectory_value_estimates(
trajectory.next_collab_obs,
if value_memories is not None:
agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
for name, v in value_estimates.items():
agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(

int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer[BufferKey.ADVANTAGES].get_batch()
advantages = np.array(self.update_buffer[BufferKey.ADVANTAGES].get_batch())
self.update_buffer[BufferKey.ADVANTAGES].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
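
The advantage normalization above is standard zero-mean, unit-variance rescaling with a small epsilon for numerical stability; a stand-alone numpy sketch of the same operation:

import numpy as np

def normalize_advantages(advantages: np.ndarray) -> np.ndarray:
    # Matches the treatment of update_buffer[BufferKey.ADVANTAGES] above.
    return (advantages - advantages.mean()) / (advantages.std() + 1e-10)

adv = np.array([1.0, 2.0, 3.0], dtype=np.float32)
norm = normalize_advantages(adv)
assert abs(norm.mean()) < 1e-6  # centered around zero
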

96
ml-agents/mlagents/trainers/sac/optimizer_torch.py


def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
super().__init__(policy, trainer_params)
reward_signal_configs = trainer_params.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
if policy.shared_critic:
raise UnityTrainerException("SAC does not support SharedActorCritic")
self._critic = ValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
policy.network_settings,
)
hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
self.tau = hyperparameters.tau
self.init_entcoef = hyperparameters.init_entcoef

}
self._action_spec = self.policy.behavior_spec.action_spec
self.value_network = TorchSACOptimizer.PolicyValueNetwork(
self.q_network = TorchSACOptimizer.PolicyValueNetwork(
self.stream_names,
self.policy.behavior_spec.observation_specs,
policy_network_settings,

self.policy.behavior_spec.observation_specs,
policy_network_settings,
)
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.target_network, 1.0
)
ModelUtils.soft_update(self._critic, self.target_network, 1.0)
# We create one entropy coefficient per action, whether discrete or continuous.
_disc_log_ent_coef = torch.nn.Parameter(

self.target_entropy = TorchSACOptimizer.TargetEntropy(
continuous=_cont_target, discrete=_disc_target
)
policy_params = list(self.policy.actor_critic.network_body.parameters()) + list(
self.policy.actor_critic.action_model.parameters()
)
value_params = list(self.value_network.parameters()) + list(
self.policy.actor_critic.critic.parameters()
policy_params = list(self.policy.actor.parameters())
value_params = list(self.q_network.parameters()) + list(
self._critic.parameters()
)
logger.debug("value_vars")

)
self._move_to_device(default_device())
@property
def critic(self):
return self._critic
self.value_network.to(device)
self._critic.to(device)
self.q_network.to(device)
def sac_q_loss(
self,

for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
]
# LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
value_memories_list = [
ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
for i in range(
0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
)
]
next_memories_list = [
next_value_memories_list = [
batch[BufferKey.MEMORY][i][self.policy.m_size // 2 :]
batch[BufferKey.CRITIC_MEMORY][i]
offset, len(batch[BufferKey.MEMORY]), self.policy.sequence_length
offset, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
next_memories = torch.stack(next_memories_list).unsqueeze(0)
value_memories = torch.stack(value_memories_list).unsqueeze(0)
next_value_memories = torch.stack(next_value_memories_list).unsqueeze(0)
next_memories = None
# Q network memories are 0'ed out, since we don't have them during inference.
value_memories = None
next_value_memories = None
# Q and V network memories are 0'ed out, since we don't have them during inference.
torch.zeros_like(next_memories) if next_memories is not None else None
torch.zeros_like(next_value_memories)
if next_value_memories is not None
else None
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body
self.q_network.q1_network.network_body.copy_normalization(
self.policy.actor.network_body
self.value_network.q2_network.network_body.copy_normalization(
self.policy.actor_critic.network_body
self.q_network.q2_network.network_body.copy_normalization(
self.policy.actor.network_body
self.policy.actor_critic.network_body
self.policy.actor.network_body
(
sampled_actions,
log_probs,
_,
value_estimates,
_,
) = self.policy.actor_critic.get_action_stats_and_value(
self._critic.network_body.copy_normalization(self.policy.actor.network_body)
sampled_actions, log_probs, _, _, = self.policy.actor.get_action_and_stats(
value_estimates, _ = self._critic.critic_pass(
current_obs, value_memories, sequence_length=self.policy.sequence_length
)
q1p_out, q2p_out = self.value_network(
q1p_out, q2p_out = self.q_network(
current_obs,
cont_sampled_actions,
memories=q_memories,

q1_out, q2_out = self.value_network(
q1_out, q2_out = self.q_network(
current_obs,
cont_actions,
memories=q_memories,

with torch.no_grad():
target_values, _ = self.target_network(
next_obs,
memories=next_memories,
memories=next_value_memories,
sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)

policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
entropy_loss = self.sac_entropy_loss(log_probs, masks)
total_value_loss = q1_loss + q2_loss + value_loss
total_value_loss = q1_loss + q2_loss
if self.policy.shared_critic:
policy_loss += value_loss
else:
total_value_loss += value_loss
decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)

self.entropy_optimizer.step()
# Update target network
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.target_network, self.tau
)
ModelUtils.soft_update(self._critic, self.target_network, self.tau)
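
The soft update above blends the critic into the target network (and with tau = 1.0 at construction time it is a hard copy). A generic Polyak-averaging sketch of that operation, written here for illustration and not taken from the library's ModelUtils:

import torch

def soft_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.mul_(1.0 - tau).add_(tau * src_param.data)

critic = torch.nn.Linear(4, 1)
target = torch.nn.Linear(4, 1)
soft_update(critic, target, tau=1.0)  # tau = 1.0 is a hard copy
assert torch.allclose(critic.weight, target.weight)
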
update_stats = {
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),

def get_modules(self):
modules = {
"Optimizer:value_network": self.value_network,
"Optimizer:value_network": self.q_network,
"Optimizer:target_network": self.target_network,
"Optimizer:policy_optimizer": self.policy_optimizer,
"Optimizer:value_optimizer": self.value_optimizer,

9
ml-agents/mlagents/trainers/sac/trainer.py


self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
# Get all value estimates for reporting purposes
value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
(
value_estimates,
_,
value_memories,
) = self.optimizer.get_trajectory_value_estimates(
if value_memories is not None:
agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
for name, v in value_estimates.items():
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",

20
ml-agents/mlagents/trainers/tests/mock_brain.py


from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.trajectory import AgentStatus, Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,

observation_specs: List[ObservationSpec],
action_spec: ActionSpec,
done: bool = False,
grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
"""
Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.

reward = np.array(num_agents * [1.0], dtype=np.float32)
interrupted = np.array(num_agents * [False], dtype=np.bool)
agent_id = np.arange(num_agents, dtype=np.int32)
group_id = np.array(num_agents * [0], dtype=np.int32)
_gid = 1 if grouped else 0
group_id = np.array(num_agents * [_gid], dtype=np.int32)
group_reward = np.array(num_agents * [0.0], dtype=np.float32)
behavior_spec = BehaviorSpec(observation_specs, action_spec)
if done:

action_spec: ActionSpec,
max_step_complete: bool = False,
memory_size: int = 10,
num_other_agents_in_group: int = 0,
) -> Trajectory:
"""
Makes a fake trajectory of length length. If max_step_complete,

memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"
behavior_id = "test_brain"
group_status = []
for _ in range(num_other_agents_in_group):
group_status.append(AgentStatus(obs, reward, action, done))
experience = AgentExperience(
obs=obs,
reward=reward,

prev_action=prev_action,
interrupted=max_step,
memory=memory,
group_status=group_status,
group_reward=0,
)
steps_list.append(experience)
obs = []

prev_action=prev_action,
interrupted=max_step_complete,
memory=memory,
group_status=group_status,
group_reward=0,
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
steps=steps_list,
agent_id=agent_id,
behavior_id=behavior_id,
next_obs=obs,
next_group_obs=[obs] * num_other_agents_in_group,
)

157
ml-agents/mlagents/trainers/tests/test_agent_processor.py


from unittest import mock
import pytest
from typing import List
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
from mlagents.trainers.agent_processor import (

def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_memories.return_value = np.zeros(
(1, 1), dtype=np.float32
)
def _create_action_info(num_agents: int, agent_ids: List[str]) -> ActionInfo:
fake_action_outputs = {
"action": ActionTuple(
continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
),
"entropy": np.array([1.0], dtype=np.float32),
"learning_rate": 1.0,
"log_probs": LogProbsTuple(
continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
),
}
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1]] * num_agents, dtype=np.float32)),
env_action=ActionTuple(
continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
),
outputs=fake_action_outputs,
agent_ids=agent_ids,
)
return fake_action_info
@pytest.mark.parametrize("num_vis_obs", [0, 1, 2], ids=["vec", "1 viz", "2 viz"])
def test_agentprocessor(num_vis_obs):
policy = create_mock_policy()

stats_reporter=StatsReporter("testcat"),
)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
"entropy": np.array([1.0], dtype=np.float32),
"learning_rate": 1.0,
"log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,
observation_specs=create_observation_specs_with_shapes(

)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,
)
fake_action_info = _create_action_info(2, mock_decision_steps.agent_id)
processor.publish_trajectory_queue(tqueue)
# This is like the initial state after the env reset
processor.add_experiences(

# Assert that the trajectory is of length 5
trajectory = tqueue.put.call_args_list[0][0][0]
assert len(trajectory.steps) == 5
# Make sure ungrouped agents don't have team obs
for step in trajectory.steps:
assert len(step.group_status) == 0
assert len(processor.experience_buffers[0]) == 0
assert len(processor._experience_buffers[0]) == 0
# Test empty steps
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(

mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
)
# Assert that the AgentProcessor is still empty
assert len(processor.experience_buffers[0]) == 0
assert len(processor._experience_buffers[0]) == 0
def test_group_statuses():
policy = create_mock_policy()
tqueue = mock.Mock()
name_behavior_id = "test_brain_name"
processor = AgentProcessor(
policy,
name_behavior_id,
max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=4,
observation_specs=create_observation_specs_with_shapes([(8,)]),
action_spec=ActionSpec.create_continuous(2),
grouped=True,
)
fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
processor.publish_trajectory_queue(tqueue)
# This is like the initial state after the env reset
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
)
for _ in range(2):
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, fake_action_info
)
# Make terminal steps for some dead agents
mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
num_agents=2,
observation_specs=create_observation_specs_with_shapes([(8,)]),
action_spec=ActionSpec.create_continuous(2),
done=True,
grouped=True,
)
processor.add_experiences(
mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
)
fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
for _ in range(3):
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, fake_action_info
)
# Assert that four trajectories have been added to the Trainer
assert len(tqueue.put.call_args_list) == 4
# Last trajectory should be the longest
trajectory = tqueue.put.call_args_list[0][0][-1]
# Make sure trajectory has the right Groupmate Experiences
for step in trajectory.steps[0:3]:
assert len(step.group_status) == 3
# After 2 agents have died
for step in trajectory.steps[3:]:
assert len(step.group_status) == 1
def test_agent_deletion():

stats_reporter=StatsReporter("testcat"),
)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1]])),
"action": ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]], dtype=np.float32)),
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(

done=True,
)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
env_action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,
)

policy.save_previous_action.assert_has_calls(add_calls)
policy.remove_previous_action.assert_has_calls(remove_calls)
# Check that there are no experiences left
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
assert len(processor._experience_buffers.keys()) == 0
assert len(processor._last_take_action_outputs.keys()) == 0
assert len(processor._episode_steps.keys()) == 0
assert len(processor._episode_rewards.keys()) == 0
assert len(processor._last_step_result.keys()) == 0
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
assert len(processor._experience_buffers.keys()) == 0
assert len(processor._last_take_action_outputs.keys()) == 0
assert len(processor._episode_steps.keys()) == 0
assert len(processor._episode_rewards.keys()) == 0
assert len(processor._last_step_result.keys()) == 0
def test_end_episode():

stats_reporter=StatsReporter("testcat"),
)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1]])),
"action": ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]], dtype=np.float32)),
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(

)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
env_action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,
)

# Check that we removed every agent
policy.remove_previous_action.assert_has_calls(remove_calls)
# Check that there are no experiences left
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor._experience_buffers.keys()) == 0
assert len(processor._last_take_action_outputs.keys()) == 0
assert len(processor._episode_steps.keys()) == 0
assert len(processor._episode_rewards.keys()) == 0
def test_agent_manager():

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
assert len(manager.trajectory_queues) == 1
assert isinstance(manager.trajectory_queues[0], AgentManagerQueue)
assert len(manager._trajectory_queues) == 1
assert isinstance(manager._trajectory_queues[0], AgentManagerQueue)
def test_agent_manager_queue():

77
ml-agents/mlagents/trainers/tests/test_buffer.py


b = AgentBuffer()
for step in range(9):
b[ObsUtil.get_name_at(0)].append(
np.array(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
],
dtype=np.float32,
)
)
b[BufferKey.CONTINUOUS_ACTION].append(
np.array(
[
100 * fake_agent_id + 10 * step + 4,
100 * fake_agent_id + 10 * step + 5,
],
dtype=np.float32,
)
)
b[BufferKey.GROUP_CONTINUOUS_ACTION].append(
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
np.array(
[
100 * fake_agent_id + 10 * step + 4,
100 * fake_agent_id + 10 * step + 5,
],
dtype=np.float32,
)
)
b[BufferKey.CONTINUOUS_ACTION].append(
[100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]
* 3
)
return b

agent_2_buffer = construct_fake_buffer(2)
agent_3_buffer = construct_fake_buffer(3)
# Test get_batch
assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))
assert_array(
np.array(a), np.array([[171, 172, 173], [181, 182, 183]], dtype=np.float32)
)
# Test get_batch
a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
batch_size=2, training_length=3, sequential=True
)

]
),
)
# Test group entries return Lists of Lists
a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch(
batch_size=2, training_length=1, sequential=True
)
for _group_entry in a:
assert len(_group_entry) == 3
agent_1_buffer.reset_agent()
assert agent_1_buffer.num_experiences == 0
update_buffer = AgentBuffer()

c = update_buffer.make_mini_batch(start=0, end=1)
assert c.keys() == update_buffer.keys()
# Make sure the values of c are AgentBufferField
for val in c.values():
assert isinstance(val, AgentBufferField)
def test_agentbufferfield():
# Test constructor
a = AgentBufferField([0, 1, 2])
for i, num in enumerate(a):
assert num == i
# Test indexing
assert a[i] == num
# Test slicing
b = a[1:3]
assert b == [1, 2]
assert isinstance(b, AgentBufferField)
# Test padding
c = AgentBufferField()
for _ in range(2):
c.append([np.array(1), np.array(2)])
for _ in range(2):
c.append([np.array(1)])
padded = c.padded_to_batch(pad_value=3)
assert np.array_equal(padded[0], np.array([1, 1, 1, 1]))
assert np.array_equal(padded[1], np.array([2, 2, 3, 3]))
def fakerandint(values):

54
ml-agents/mlagents/trainers/tests/test_trajectory.py


import numpy as np
from mlagents.trainers.trajectory import GroupObsUtil
from mlagents.trainers.buffer import BufferKey, ObservationKeyPrefix
from mlagents.trainers.buffer import AgentBuffer, BufferKey, ObservationKeyPrefix
VEC_OBS_SIZE = 6
ACTION_SIZE = 4

length = 15
# These keys should be of type np.ndarray
wanted_keys = [
(ObservationKeyPrefix.OBSERVATION, 0),
(ObservationKeyPrefix.OBSERVATION, 1),

BufferKey.ACTION_MASK,
BufferKey.PREV_ACTION,
BufferKey.ENVIRONMENT_REWARDS,
BufferKey.GROUP_REWARD,
wanted_keys = set(wanted_keys)
# These keys should be of type List
wanted_group_keys = [
BufferKey.GROUPMATE_REWARDS,
BufferKey.GROUP_CONTINUOUS_ACTION,
BufferKey.GROUP_DISCRETE_ACTION,
BufferKey.GROUP_DONES,
BufferKey.GROUP_NEXT_CONT_ACTION,
BufferKey.GROUP_NEXT_DISC_ACTION,
]
wanted_keys = set(wanted_keys + wanted_group_keys)
trajectory = make_fake_trajectory(
length=length,
observation_specs=create_observation_specs_with_shapes(

num_other_agents_in_group=4,
)
agentbuffer = trajectory.to_agentbuffer()
seen_keys = set()

assert seen_keys == wanted_keys
assert seen_keys.issuperset(wanted_keys)
for _key in wanted_group_keys:
for step in agentbuffer[_key]:
assert len(step) == 4
def test_obsutil_group_from_buffer():
buff = AgentBuffer()
# Create some obs
for _ in range(3):
buff[GroupObsUtil.get_name_at(0)].append(3 * [np.ones((5,), dtype=np.float32)])
# Some agents have died
for _ in range(2):
buff[GroupObsUtil.get_name_at(0)].append(1 * [np.ones((5,), dtype=np.float32)])
# Get the group obs, which will be a List of Lists of np.ndarray, where each element is the same
# length as the AgentBuffer but contains only one agent's obs. Dead agents are padded by
# NaNs.
gobs = GroupObsUtil.from_buffer(buff, 1)
# Agent 0 is full
agent_0_obs = gobs[0]
for obs in agent_0_obs:
assert obs.shape == (buff.num_experiences, 5)
assert not np.isnan(obs).any()
agent_1_obs = gobs[1]
for obs in agent_1_obs:
assert obs.shape == (buff.num_experiences, 5)
for i, _exp_obs in enumerate(obs):
if i >= 3:
assert np.isnan(_exp_obs).all()
else:
assert not np.isnan(_exp_obs).any()

4
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py


"""
Make sure two policies have the same output for the same input.
"""
policy1.actor_critic = policy1.actor_critic.to(default_device())
policy2.actor_critic = policy2.actor_critic.to(default_device())
policy1.actor = policy1.actor.to(default_device())
policy2.actor = policy2.actor.to(default_device())
decision_step, _ = mb.create_steps_from_behavior_spec(
policy1.behavior_spec, num_agents=1

23
ml-agents/mlagents/trainers/tests/torch/test_networks.py


from mlagents.trainers.torch.networks import (
NetworkBody,
ValueNetwork,
SimpleActor,
SeparateActorCritic,
)
from mlagents.trainers.settings import NetworkSettings
from mlagents_envs.base_env import ActionSpec

assert _out[0] == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("ac_type", [SharedActorCritic, SeparateActorCritic])
@pytest.mark.parametrize("shared", [True, False])
def test_actor_critic(ac_type, lstm):
def test_actor_critic(lstm, shared):
obs_size = 4
network_settings = NetworkSettings(
memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True

stream_names = [f"stream_name{n}" for n in range(4)]
# action_spec = ActionSpec.create_continuous(act_size[0])
action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
if shared:
actor = critic = SharedActorCritic(
obs_spec, network_settings, action_spec, stream_names, network_settings
)
else:
actor = SimpleActor(obs_spec, network_settings, action_spec)
critic = ValueNetwork(stream_names, obs_spec, network_settings)
if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
memories = torch.ones(

# memories isn't always set to None, the network should be able to
# deal with that.
# Test critic pass
value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
value_out, memories_out = critic.critic_pass([sample_obs], memories=memories)
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)

# Test get action and stats
action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
action, log_probs, entropies, mem_out = actor.get_action_and_stats(
[sample_obs], memories=memories, masks=mask
)
if lstm:

if mem_out is not None:
assert mem_out.shape == memories.shape
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)
else:
assert value_out[stream].shape == (1,)

4
ml-agents/mlagents/trainers/tests/torch/test_policy.py


if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = policy.evaluate_actions(
log_probs, entropy = policy.evaluate_actions(
tensor_obs,
masks=act_masks,
actions=agent_action,

assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

Some files were not shown because too many files changed in this diff.
