
Merge branch 'develop-hybrid-actions-singleton' into develop-hybrid-actions-csharp

/MLA-1734-demo-provider
Ruo-Ping Dong, 4 years ago
Current commit
a7d04be6
105 files changed, with 6632 insertions and 293 deletions
1. README.md (10 changes)
2. com.unity.ml-agents.extensions/Documentation~/com.unity.ml-agents.extensions.md (5 changes)
3. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (7 changes)
4. com.unity.ml-agents/Documentation~/com.unity.ml-agents.md (2 changes)
5. com.unity.ml-agents/Runtime/Academy.cs (4 changes)
6. com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs (2 changes)
7. com.unity.ml-agents/Runtime/Actuators/IDiscreteActionMask.cs (2 changes)
8. com.unity.ml-agents/Runtime/Agent.cs (26 changes)
9. com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs (2 changes)
10. com.unity.ml-agents/Runtime/DiscreteActionMasker.cs (2 changes)
11. com.unity.ml-agents/Runtime/SensorHelper.cs (2 changes)
12. docs/Installation-Anaconda-Windows.md (4 changes)
13. docs/Installation.md (6 changes)
14. docs/Learning-Environment-Examples.md (21 changes)
15. docs/Training-on-Amazon-Web-Service.md (2 changes)
16. docs/Unity-Inference-Engine.md (4 changes)
17. ml-agents-envs/mlagents_envs/base_env.py (78 changes)
18. ml-agents/mlagents/trainers/agent_processor.py (26 changes)
19. ml-agents/mlagents/trainers/env_manager.py (12 changes)
20. ml-agents/mlagents/trainers/policy/policy.py (12 changes)
21. ml-agents/mlagents/trainers/policy/tf_policy.py (22 changes)
22. ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)
23. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6 changes)
24. ml-agents/mlagents/trainers/simple_env_manager.py (3 changes)
25. ml-agents/mlagents/trainers/subprocess_env_manager.py (7 changes)
26. ml-agents/mlagents/trainers/tests/mock_brain.py (20 changes)
27. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (4 changes)
28. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2 changes)
29. ml-agents/mlagents/trainers/tests/test_agent_processor.py (27 changes)
30. ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)
31. ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2 changes)
32. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (82 changes)
33. ml-agents/mlagents/trainers/tests/torch/test_policy.py (10 changes)
34. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (11 changes)
35. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (4 changes)
36. ml-agents/mlagents/trainers/tests/torch/test_utils.py (4 changes)
37. ml-agents/mlagents/trainers/torch/action_flattener.py (14 changes)
38. ml-agents/mlagents/trainers/torch/action_log_probs.py (44 changes)
39. ml-agents/mlagents/trainers/torch/action_model.py (84 changes)
40. ml-agents/mlagents/trainers/torch/agent_action.py (21 changes)
41. ml-agents/mlagents/trainers/torch/components/bc/module.py (12 changes)
42. ml-agents/mlagents/trainers/torch/distributions.py (2 changes)
43. ml-agents/mlagents/trainers/trajectory.py (25 changes)
44. utils/make_readme_table.py (1 change)
45. Project/Assets/ML-Agents/Examples/Match3.meta (8 changes)
46. com.unity.ml-agents.extensions/Documentation~/Match3.md (67 changes)
47. com.unity.ml-agents.extensions/Runtime/Match3.meta (3 changes)
48. com.unity.ml-agents.extensions/Tests/Editor/Match3.meta (3 changes)
49. config/ppo/Match3.yaml (75 changes)
50. docs/images/match3.png (77 changes)
51. Project/Assets/ML-Agents/Examples/Match3/Prefabs.meta (8 changes)
52. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3Heuristic.prefab (174 changes)
53. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3Heuristic.prefab.meta (7 changes)
54. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VectorObs.prefab (170 changes)
55. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VectorObs.prefab.meta (7 changes)
56. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VisualObs.prefab (170 changes)
57. Project/Assets/ML-Agents/Examples/Match3/Prefabs/Match3VisualObs.prefab.meta (7 changes)
58. Project/Assets/ML-Agents/Examples/Match3/Scenes.meta (8 changes)
59. Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity (1001 changes)
60. Project/Assets/ML-Agents/Examples/Match3/Scenes/Match3.unity.meta (7 changes)
61. Project/Assets/ML-Agents/Examples/Match3/Scripts.meta (8 changes)
62. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Agent.cs (373 changes)
63. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Agent.cs.meta (3 changes)
64. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Board.cs (272 changes)
65. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Board.cs.meta (11 changes)
66. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Drawer.cs (102 changes)
67. Project/Assets/ML-Agents/Examples/Match3/Scripts/Match3Drawer.cs.meta (3 changes)
68. Project/Assets/ML-Agents/Examples/Match3/TFModels.meta (8 changes)
69. Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VectorObs.onnx (1001 changes)
70. Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VectorObs.onnx.meta (14 changes)
71. Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VisualObs.nn (1001 changes)
72. Project/Assets/ML-Agents/Examples/Match3/TFModels/Match3VisualObs.nn.meta (11 changes)
73. com.unity.ml-agents.extensions/Runtime/Match3/AbstractBoard.cs (233 changes)
74. com.unity.ml-agents.extensions/Runtime/Match3/AbstractBoard.cs.meta (3 changes)
75. com.unity.ml-agents.extensions/Runtime/Match3/Match3Actuator.cs (120 changes)
76. com.unity.ml-agents.extensions/Runtime/Match3/Match3Actuator.cs.meta (3 changes)
77. com.unity.ml-agents.extensions/Runtime/Match3/Match3ActuatorComponent.cs (49 changes)
78. com.unity.ml-agents.extensions/Runtime/Match3/Match3ActuatorComponent.cs.meta (3 changes)
79. com.unity.ml-agents.extensions/Runtime/Match3/Match3Sensor.cs (297 changes)
80. com.unity.ml-agents.extensions/Runtime/Match3/Match3Sensor.cs.meta (3 changes)
81. com.unity.ml-agents.extensions/Runtime/Match3/Match3SensorComponent.cs (43 changes)
82. com.unity.ml-agents.extensions/Runtime/Match3/Match3SensorComponent.cs.meta (3 changes)
83. com.unity.ml-agents.extensions/Runtime/Match3/Move.cs (260 changes)
84. com.unity.ml-agents.extensions/Runtime/Match3/Move.cs.meta (3 changes)
85. com.unity.ml-agents.extensions/Tests/Editor/Match3/AbstractBoardTests.cs (152 changes)
86. com.unity.ml-agents.extensions/Tests/Editor/Match3/AbstractBoardTests.cs.meta (3 changes)
87. com.unity.ml-agents.extensions/Tests/Editor/Match3/Match3ActuatorTests.cs (115 changes)
88. com.unity.ml-agents.extensions/Tests/Editor/Match3/Match3ActuatorTests.cs.meta (3 changes)
89. com.unity.ml-agents.extensions/Tests/Editor/Match3/Match3SensorTests.cs (314 changes)
90. com.unity.ml-agents.extensions/Tests/Editor/Match3/Match3SensorTests.cs.meta (3 changes)
91. com.unity.ml-agents.extensions/Tests/Editor/Match3/MoveTests.cs (60 changes)
92. com.unity.ml-agents.extensions/Tests/Editor/Match3/MoveTests.cs.meta (3 changes)
93. com.unity.ml-agents.extensions/Tests/Editor/Match3/match3obs0.png (3 changes)

10
README.md


# Unity ML-Agents Toolkit
[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/docs/)
[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](https://github.com/Unity-Technologies/ml-agents/tree/release_9_docs/docs/)
[![license badge](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE)

## Releases & Documentation
**Our latest, stable release is `Release 8`. Click
[here](https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/docs/Readme.md)
**Our latest, stable release is `Release 9`. Click
[here](https://github.com/Unity-Technologies/ml-agents/tree/release_9_docs/docs/Readme.md)
to get started with the latest release of ML-Agents.**
The table below lists all our releases, including our `master` branch which is

| **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
|:-------:|:------:|:-------------:|:-------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) |
| **Release 8** | **October 14, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/release_8)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/release_8.zip)** |
| **Release 9** | **November 4, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/release_9)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/release_9_docs/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/release_9.zip)** |
| **Release 8** | October 14, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_8) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_8.zip) |
| **Release 2** | May 20, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/release_2) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/release_2_docs/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/release_2.zip) |
## Citation

5
com.unity.ml-agents.extensions/Documentation~/com.unity.ml-agents.extensions.md


| _Runtime_ | Contains core C# APIs for integrating ML-Agents into your Unity scene. |
| _Tests_ | Contains the unit tests for the package. |
The Runtime directory currently contains three features:
* [Match-3 sensor and actuator](Match3.md)
* [Grid-based sensor](Grid-Sensor.md)
* Physics-based sensors
## Installation
The ML-Agents Extensions package is not currently available in the Package Manager. There are two
recommended ways to install the package:

7
com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs


bool isOK = SensorHelper.CompareObservation(sensor, expected, out errorMessage);
Assert.IsTrue(isOK, errorMessage);
}
public static void CompareObservation(ISensor sensor, float[,,] expected)
{
string errorMessage;
bool isOK = SensorHelper.CompareObservation(sensor, expected, out errorMessage);
Assert.IsTrue(isOK, errorMessage);
}
}
public class RigidBodySensorTests

2
com.unity.ml-agents/Documentation~/com.unity.ml-agents.md


[unity ML-Agents Toolkit]: https://github.com/Unity-Technologies/ml-agents
[unity inference engine]: https://docs.unity3d.com/Packages/com.unity.barracuda@latest/index.html
[package manager documentation]: https://docs.unity3d.com/Manual/upm-ui-install.html
[installation instructions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Installation.md
[installation instructions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Installation.md
[github repository]: https://github.com/Unity-Technologies/ml-agents
[python package]: https://github.com/Unity-Technologies/ml-agents
[execution order of event functions]: https://docs.unity3d.com/Manual/ExecutionOrder.html

4
com.unity.ml-agents/Runtime/Academy.cs


* API. For more information on each of these entities, in addition to how to
* set-up a learning environment and train the behavior of characters in a
* Unity scene, please browse our documentation pages on GitHub:
* https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/docs/
* https://github.com/Unity-Technologies/ml-agents/tree/release_9_docs/docs/
*/
namespace Unity.MLAgents

/// fall back to inference or heuristic decisions. (You can also set agents to always use
/// inference or heuristics.)
/// </remarks>
[HelpURL("https://github.com/Unity-Technologies/ml-agents/tree/release_8_docs/" +
[HelpURL("https://github.com/Unity-Technologies/ml-agents/tree/release_9_docs/" +
"docs/Learning-Environment-Design.md")]
public class Academy : IDisposable
{

2
com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <seealso cref="IActionReceiver.OnActionReceived"/>
void WriteDiscreteActionMask(IDiscreteActionMask actionMask);

2
com.unity.ml-agents/Runtime/Actuators/IDiscreteActionMask.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="branch">The branch for which the actions will be masked.</param>
/// <param name="actionIndices">The indices of the masked actions.</param>

26
com.unity.ml-agents/Runtime/Agent.cs


/// [OnDisable()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnDisable.html]
/// [OnBeforeSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnBeforeSerialize.html
/// [OnAfterSerialize()]: https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnAfterSerialize.html
/// [Agents]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md
/// [Reinforcement Learning in Unity]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design.md
/// [Agents]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md
/// [Reinforcement Learning in Unity]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design.md
/// [Unity ML-Agents Toolkit manual]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Readme.md
/// [Unity ML-Agents Toolkit manual]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Readme.md
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/" +
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/" +
"docs/Learning-Environment-Design-Agents.md")]
[Serializable]
[RequireComponent(typeof(BehaviorParameters))]

/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// </remarks>
/// <param name="reward">The new value of the reward.</param>
public void SetReward(float reward)

/// for information about mixing reward signals from curiosity and Generative Adversarial
/// Imitation Learning (GAIL) with rewards supplied through this method.
///
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
/// [Agents - Rewards]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#rewards
/// [Reward Signals]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/ML-Agents-Overview.md#a-quick-note-on-reward-signals
///</remarks>
/// <param name="increment">Incremental reward value.</param>
public void AddReward(float increment)

/// implementing a simple heuristic function can aid in debugging agent actions and interactions
/// with its environment.
///
/// [Demonstration Recorder]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Demonstration Recorder]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// </remarks>
/// <example>

/// For more information about observations, see [Observations and Sensors].
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// [Observations and Sensors]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#observations-and-sensors
/// [Observations and Sensors]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#observations-and-sensors
/// </remarks>
public virtual void CollectObservations(VectorSensor sensor)
{

///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <seealso cref="IActionReceiver.OnActionReceived"/>
public virtual void WriteDiscreteActionMask(IDiscreteActionMask actionMask)

///
/// For more information about implementing agent actions see [Agents - Actions].
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="actions">
/// Struct containing the buffers of actions to be executed at this step.

2
com.unity.ml-agents/Runtime/Demonstrations/DemonstrationRecorder.cs


/// See [Imitation Learning - Recording Demonstrations] for more information.
///
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html
/// [Imitation Learning - Recording Demonstrations]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs//Learning-Environment-Design-Agents.md#recording-demonstrations
/// [Imitation Learning - Recording Demonstrations]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs//Learning-Environment-Design-Agents.md#recording-demonstrations
/// </remarks>
[RequireComponent(typeof(Agent))]
[AddComponentMenu("ML Agents/Demonstration Recorder", (int)MenuGroup.Default)]

2
com.unity.ml-agents/Runtime/DiscreteActionMasker.cs


///
/// See [Agents - Actions] for more information on masking actions.
///
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/docs/Learning-Environment-Design-Agents.md#actions
/// [Agents - Actions]: https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/docs/Learning-Environment-Design-Agents.md#actions
/// </remarks>
/// <param name="branch">The branch for which the actions will be masked.</param>
/// <param name="actionIndices">The indices of the masked actions.</param>

2
com.unity.ml-agents/Runtime/SensorHelper.cs


if (expected[h, w, c] != output[tensorShape.Index(0, h, w, c)])
{
errorMessage = $"Expected and actual differed in position [{h}, {w}, {c}]. " +
"Expected: {expected[h, w, c]} Actual: {output[tensorShape.Index(0, h, w, c)]} ";
$"Expected: {expected[h, w, c]} Actual: {output[tensorShape.Index(0, h, w, c)]} ";
return false;
}
}

4
docs/Installation-Anaconda-Windows.md


the ml-agents Conda environment by typing `activate ml-agents`)_:
```sh
git clone --branch release_8 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_9 https://github.com/Unity-Technologies/ml-agents.git
The `--branch release_8` option will switch to the tag of the latest stable
The `--branch release_9` option will switch to the tag of the latest stable
release. Omitting that will get the `master` branch which is potentially
unstable.

6
docs/Installation.md


of our tutorials / guides assume you have access to our example environments).
```sh
git clone --branch release_8 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_9 https://github.com/Unity-Technologies/ml-agents.git
The `--branch release_8` option will switch to the tag of the latest stable
The `--branch release_9` option will switch to the tag of the latest stable
release. Omitting that will get the `master` branch which is potentially
unstable.

ML-Agents Toolkit for your purposes. If you plan to contribute those changes
back, make sure to clone the `master` branch (by omitting `--branch release_8`
back, make sure to clone the `master` branch (by omitting `--branch release_9`
from the command above). See our
[Contributions Guidelines](../com.unity.ml-agents/CONTRIBUTING.md) for more
information on contributing to the ML-Agents Toolkit.

21
docs/Learning-Environment-Examples.md


does not train with the provided default training parameters.**
- Float Properties: None
- Benchmark Mean Reward: 1.75
## Match 3
![Match 3](images/match3.png)
- Set-up: Simple match-3 game. Matched pieces are removed, and remaining pieces
drop down. New pieces are spawned randomly at the top, with a chance of being
"special".
- Goal: Maximize score from matching pieces.
- Agents: The environment contains several independent Agents.
- Agent Reward Function (independent):
- .01 for each normal piece cleared. Special pieces are worth 2x or 3x.
- Behavior Parameters:
- None
- Observations and actions are defined with a sensor and actuator respectively.
- Float Properties: None
- Benchmark Mean Reward:
- 37.2 for visual observations
- 37.6 for vector observations
- 34.2 for simple heuristic (pick a random valid move)
- 37.0 for greedy heuristic (pick the highest-scoring valid move)

2
docs/Training-on-Amazon-Web-Service.md


2. Clone the ML-Agents repo and install the required Python packages
```sh
git clone --branch release_8 https://github.com/Unity-Technologies/ml-agents.git
git clone --branch release_9 https://github.com/Unity-Technologies/ml-agents.git
cd ml-agents/ml-agents/
pip3 install -e .
```

4
docs/Unity-Inference-Engine.md


loading expects certain conventions for constants and tensor names. While it is
possible to construct a model that follows these conventions, we don't provide
any additional help for this. More details can be found in
[TensorNames.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/com.unity.ml-agents/Runtime/Inference/TensorNames.cs)
[TensorNames.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/com.unity.ml-agents/Runtime/Inference/TensorNames.cs)
[BarracudaModelParamLoader.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_8_docs/com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs).
[BarracudaModelParamLoader.cs](https://github.com/Unity-Technologies/ml-agents/blob/release_9_docs/com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs).
If you wish to run inference on an externally trained model, you should use
Barracuda directly, instead of trying to run it through ML-Agents.

78
ml-agents-envs/mlagents_envs/base_env.py


)
class ActionTuple:
class _ActionTupleBase(ABC):
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
An object whose fields correspond to action data of continuous and discrete
spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def __init__(

):
if continuous is not None and continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
self._continuous = continuous
if discrete is not None and discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
self._discrete = discrete
self._continuous: Optional[np.ndarray] = None
self._discrete: Optional[np.ndarray] = None
if continuous is not None:
self.add_continuous(continuous)
if discrete is not None:
self.add_discrete(discrete)
@property
def continuous(self) -> np.ndarray:

def discrete(self) -> np.ndarray:
return self._discrete
def add_continuous(self, continuous: np.ndarray) -> None:
if continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
if self._discrete is None:
_discrete_dtype = self.get_discrete_dtype()
self._discrete = np.zeros((continuous.shape[0], 0), dtype=_discrete_dtype)
self._continuous = continuous
def add_discrete(self, discrete: np.ndarray) -> None:
_discrete_dtype = self.get_discrete_dtype()
if discrete.dtype != _discrete_dtype:
discrete = discrete.astype(np.int32, copy=False)
if self._continuous is None:
self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
self._discrete = discrete
@abstractmethod
def get_discrete_dtype(self) -> np.dtype:
pass
class ActionTuple(_ActionTupleBase):
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def get_discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete action.
"""
return np.int32
class ActionSpec(NamedTuple):
"""

for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
_continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def random_action(self, n_agents: int) -> ActionTuple:
"""

"""
continuous = np.random.uniform(
_continuous = np.random.uniform(
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
discrete = np.column_stack(
_discrete = np.column_stack(
[
np.random.randint(
0,

for i in range(self.discrete_size)
]
)
return ActionTuple(continuous, discrete)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def _validate_action(
self, actions: ActionTuple, n_agents: int, name: str

for the correct number of agents and ensures the type.
"""
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
if actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
if actions.discrete.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a discrete input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

26
ml-agents/mlagents/trainers/agent_processor.py


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
ActionTuple,
DecisionSteps,
DecisionStep,
TerminalSteps,

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.behavior_id_utils import get_global_agent_id

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
stored_action_probs = stored_take_action_outputs["log_probs"]
log_probs_tuple = LogProbsTuple(
continuous=stored_action_probs.continuous[idx],
discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

action=action,
action_probs=action_probs,
action=action_tuple,
action_probs=log_probs_tuple,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,

12
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (

BehaviorName,
ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
continuous: np.ndarray = None
discrete: np.ndarray = None
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
return ActionTuple(continuous, discrete)
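The dict-of-arrays form (with `"continuous_action"` / `"discrete_action"` keys) and the `ActionTuple` form appear side by side throughout this change. As a reference point, here is a standalone sketch of the mapping between the two; the helper name is chosen here for illustration only and is not part of the API:

```python
from typing import Dict, Optional

import numpy as np
from mlagents_envs.base_env import ActionTuple


def dict_to_action_tuple(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
    # Illustrative only: pull each half out of the legacy dict layout and wrap
    # it in an ActionTuple; a missing key simply leaves that half unset.
    continuous: Optional[np.ndarray] = action_dict.get("continuous_action")
    discrete: Optional[np.ndarray] = action_dict.get("discrete_action")
    return ActionTuple(continuous=continuous, discrete=discrete)
```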

12
ml-agents/mlagents/trainers/policy/policy.py


from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings

condition_sigma_on_obs: bool = True,
):
self.behavior_spec = behavior_spec
self.action_spec = behavior_spec.action_spec
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed

)
def save_previous_action(
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
self, agent_ids: List[str], action_tuple: ActionTuple
if action_dict is None or "discrete_action" not in action_dict:
return
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))

22
ml-agents/mlagents/trainers/policy/tf_policy.py


from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import DecisionSteps, ActionTuple, BehaviorSpec
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers import __version__

reparameterize,
condition_sigma_on_obs,
)
if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0:
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException(
"TensorFlow does not support mixed action spaces. Please run with the Torch framework."
)

self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
log_probs_tuple = LogProbsTuple()
if self.behavior_spec.action_spec.is_continuous():
log_probs_tuple.add_continuous(run_out["log_probs"])
else:
log_probs_tuple.add_discrete(run_out["log_probs"])
run_out["log_probs"] = log_probs_tuple
action_tuple = ActionTuple()
run_out["action"] = {"continuous_action": run_out["action"]}
action_tuple.add_continuous(run_out["action"])
run_out["action"] = {"discrete_action": run_out["action"]}
action_tuple.add_discrete(run_out["action"])
run_out["action"] = action_tuple
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
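Since a TensorFlow policy only ever produces one action space, the excerpt above wraps its single numpy output into the matching half of the hybrid containers. A minimal sketch of that wrapping, assuming the `ActionTuple` and `LogProbsTuple` classes introduced in this change (array shapes are hypothetical):

```python
import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

# Hypothetical single-space outputs from a continuous-control TF policy.
raw_actions = np.zeros((4, 2), dtype=np.float32)
raw_log_probs = np.zeros((4, 2), dtype=np.float32)

# Place each output in the continuous half; the discrete half stays (4, 0).
action_tuple = ActionTuple()
action_tuple.add_continuous(raw_actions)

log_probs_tuple = LogProbsTuple()
log_probs_tuple.add_continuous(raw_log_probs)
```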

10
ml-agents/mlagents/trainers/policy/torch_policy.py


) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if self.action_spec.discrete_size > 0:
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(

action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple
action_dict["continuous_action"] if self.use_continuous_act else None
action_tuple.continuous if self.use_continuous_act else None
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["log_probs"] = log_probs.to_log_probs_tuple()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:

6
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
else:
feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]

3
ml-agents/mlagents/trainers/simple_env_manager.py


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.set_actions(brain_name, action_info.action)
self.env.step()
all_step_result = self._generate_all_results()

7
ml-agents/mlagents/trainers/subprocess_env_manager.py


if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
if len(action_info.agent_ids) > 0:
env.set_actions(brain_name, action_info.action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

20
ml-agents/mlagents/trainers/tests/mock_brain.py


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)

steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
else:
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action = ActionTuple(
continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
)
action_probs = LogProbsTuple(
continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
)
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

4
ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py


@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment(
[BRAIN_NAME], action_sizes=action_sizes, action_size=2, step_size=0.8
)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(
SAC_TF_CONFIG,

2
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py


behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
policy_eval_out = {
"action": {"continuous_action": np.array([1.0], dtype=np.float32)},
"action": np.array([[1.0]], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

27
ml-agents/mlagents/trainers/tests/test_agent_processor.py


AgentManagerQueue,
)
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import ActionSpec
from mlagents_envs.base_env import ActionSpec, ActionTuple
def create_mock_policy():

)
fake_action_outputs = {
"action": {"continuous_action": [0.1, 0.1]},
"action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
"log_probs": {"continuous_log_probs": [0.1, 0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

fake_action_info = ActionInfo(
action={"continuous_action": [0.1, 0.1]},
action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": {"continuous_action": [0.1]},
"action": ActionTuple(continuous=np.array([[0.1]])),
"log_probs": {"continuous_log_probs": [0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

done=True,
)
fake_action_info = ActionInfo(
action={"continuous_action": [0.1]},
action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], {"continuous_action": [0.1]})
mock.call([get_global_agent_id(_ep, 0)], fake_action_outputs["action"])
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": {"continuous_action": [0.1]},
"action": ActionTuple(continuous=np.array([[0.1]])),
"log_probs": {"continuous_log_probs": [0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

action={"continuous_action": [0.1]},
action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

4
ml-agents/mlagents/trainers/tests/test_trajectory.py


"done",
"actions_pre",
"continuous_action",
"action_probs",
"discrete_action",
"continuous_log_probs",
"discrete_log_probs",
"action_mask",
"prev_action",
"environment_rewards",

2
ml-agents/mlagents/trainers/tests/torch/test_distributions.py


optimizer = torch.optim.Adam(gauss_dist.parameters(), lr=3e-3)
for _ in range(50):
dist_inst = gauss_dist(sample_embedding)[0]
dist_inst = gauss_dist(sample_embedding)
if tanh_squash:
assert isinstance(dist_inst, TanhGaussianDistInstance)
else:

82
ml-agents/mlagents/trainers/tests/torch/test_hybrid.py


from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,
MemoryEnvironment,
RecordEnvironment,
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.settings import (
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
RewardSignalType,
EncoderType,
FrameworkType,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.check_env_trains import (
check_environment_trains,
default_reward_processor,
)
from mlagents.trainers.tests.check_env_trains import check_environment_trains
BRAIN_NAME = "1D"

def test_hybrid_ppo():
env = SimpleEnvironment(
[BRAIN_NAME], continuous_action_size=1, discrete_action_size=1
)
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=1.0)
def test_conthybrid_ppo():
env = SimpleEnvironment(
[BRAIN_NAME], continuous_action_size=1, discrete_action_size=0
)
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=1.0)
def test_dischybrid_ppo():
env = SimpleEnvironment(
[BRAIN_NAME], continuous_action_size=0, discrete_action_size=1
)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=(1, 1))
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=1.0)

env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=num_visual,
num_vector=0,
continuous_action_size=1,
discrete_action_size=1,
[BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4

def test_recurrent_ppo():
env = MemoryEnvironment(
[BRAIN_NAME], continuous_action_size=1, discrete_action_size=1
)
env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1))
new_network_settings = attr.evolve(
PPO_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=100000,
max_steps=10000,
# def test_3cdhybrid_ppo():
# env = SimpleEnvironment(
# [BRAIN_NAME], continuous_action_size=2, discrete_action_size=1, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, batch_size=128, buffer_size=1280, beta=0.01
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=1.0)
#
#
# def test_3ddhybrid_ppo():
# env = SimpleEnvironment(
# [BRAIN_NAME], continuous_action_size=1, discrete_action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, batch_size=128, buffer_size=1280, beta=0.01
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=1.0)

10
ml-agents/mlagents/trainers/tests/torch/test_policy.py


run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"]["discrete_action"].shape == (
NUM_AGENTS,
len(DISCRETE_ACTION_SPACE),
)
run_out["action"].discrete.shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
assert run_out["action"]["continuous_action"].shape == (
NUM_AGENTS,
VECTOR_ACTION_SPACE,
)
assert run_out["action"].continuous.shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

11
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

4
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py


def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
action_sizes=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
action_sizes=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

4
ml-agents/mlagents/trainers/tests/torch/test_utils.py


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.torch.distributions import (
CategoricalDistInstance,
GaussianDistInstance,
)
def test_min_visual_size():

14
ml-agents/mlagents/trainers/torch/action_flattener.py


class ActionFlattener:
def __init__(self, action_spec: ActionSpec):
"""
A torch module that creates the flattened form of an AgentAction object.
The flattened form is the continuous action concatenated with the
concatenated one hot encodings of the discrete actions.
:param action_spec: An ActionSpec that describes the action space dimensions
"""
"""
The flattened size is the continuous size plus the sum of the branch sizes
since discrete actions are encoded as one hots.
"""
"""
Returns a tensor corresponding the flattened action
:param action: An AgentAction object
"""
action_list: List[torch.Tensor] = []
if self._specs.continuous_size > 0:
action_list.append(action.continuous_tensor)
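The docstring above describes the flattened form as the continuous action concatenated with one-hot encodings of each discrete branch. A self-contained sketch of that layout, deliberately bypassing `ActionFlattener` / `AgentAction` (branch sizes and values are hypothetical):

```python
import torch
import torch.nn.functional as F

continuous = torch.tensor([[0.5, -0.2]])   # (batch, continuous_size=2)
discrete = torch.tensor([[1, 0]])          # (batch, num_branches=2)
branch_sizes = [3, 2]                      # size of each discrete branch

# One-hot encode each discrete branch, then concatenate with the continuous part.
one_hots = [
    F.one_hot(discrete[:, i], num_classes=size).float()
    for i, size in enumerate(branch_sizes)
]
flattened = torch.cat([continuous] + one_hots, dim=1)
print(flattened.shape)  # torch.Size([1, 7]): 2 continuous + 3 + 2 one-hot slots
```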

44
ml-agents/mlagents/trainers/torch/action_log_probs.py


import numpy as np
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import _ActionTupleBase
class LogProbsTuple(_ActionTupleBase):
"""
An object whose fields correspond to the log probs of actions of different types.
Continuous and discrete are numpy arrays
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def get_discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete log probability.
"""
return np.float32
class ActionLogProbs(NamedTuple):

"""
return torch.cat(self.all_discrete_list, dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
def to_log_probs_tuple(self) -> LogProbsTuple:
Returns a Dict of np arrays with an entry correspinding to the continuous log probs
and an entry corresponding to the discrete log probs. "continuous_log_probs" and
"discrete_log_probs" are added to the agents buffer individually to maintain a flat buffer.
Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise,
LogProbsTuple uses a default.
array_dict: Dict[str, np.ndarray] = {}
log_probs_tuple = LogProbsTuple()
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
continuous = ModelUtils.to_numpy(self.continuous_tensor)
log_probs_tuple.add_continuous(continuous)
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
return array_dict
discrete = ModelUtils.to_numpy(self.discrete_tensor)
log_probs_tuple.add_discrete(discrete)
return log_probs_tuple
def _to_tensor_list(self) -> List[torch.Tensor]:
"""

continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
# This will keep discrete_list = None which enables flatten()
if discrete_tensor.shape[1] > 0:
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
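`LogProbsTuple` reuses the padding behaviour of `_ActionTupleBase`, but overrides the discrete dtype to `float32` because it holds log probabilities rather than action indices. A small sketch, assuming the class as added in this diff:

```python
import numpy as np
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

lp = LogProbsTuple()
lp.add_continuous(np.zeros((5, 3), dtype=np.float32))

# The missing discrete half is padded to a zero-width float32 array,
# so buffer code can store both halves without special-casing.
assert lp.discrete.shape == (5, 0)
assert lp.discrete.dtype == np.float32
```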

84
ml-agents/mlagents/trainers/torch/action_model.py


class DistInstances(NamedTuple):
continuous: DistInstance
discrete: List[DiscreteDistInstance]
"""
A NamedTuple with fields corresponding to the DistInstance objects
output by continuous and discrete distributions, respectively. Discrete distributions
output a list of DistInstance objects whereas continuous distributions output a single
DistInstance object.
"""
continuous: Optional[DistInstance]
discrete: Optional[List[DiscreteDistInstance]]
class ActionModel(nn.Module):

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
"""
A torch module that represents the action space of a policy. The ActionModel may contain
a continuous distribution, a discrete distribution or both where construction depends on
the action_spec. The ActionModel uses the encoded input of the network body to parameterize
these distributions. The forward method of this module outputs the action, log probs,
and entropies given the encoding from the network body.
:params hidden_size: Size of the input to the ActionModel.
:params action_spec: The ActionSpec defining the action space dimensions and distributions.
:params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state.
:params tanh_squash: Whether to squash the output of a Gaussian with the tanh function.
"""
super().__init__()
self.encoding_size = hidden_size
self.action_spec = action_spec

def _sample_action(self, dists: DistInstances) -> AgentAction:
"""
Samples actions from a DistInstances tuple
:params dists: The DistInstances tuple
:return: An AgentAction corresponding to the actions sampled from the DistInstances
if self.action_spec.continuous_size > 0:
# This checks None because mypy complains otherwise
if dists.continuous is not None:
if self.action_spec.discrete_size > 0:
if dists.discrete is not None:
discrete_action = []
for discrete_dist in dists.discrete:
discrete_action.append(discrete_dist.sample())

"""
Creates a DistInstances tuple using the continuous and discrete distributions
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:return: A DistInstances tuple
"""
if self.action_spec.continuous_size > 0:
continuous_dist = self._continuous_distribution(inputs, masks)
if self.action_spec.discrete_size > 0:
# This checks None because mypy complains otherwise
if self._continuous_distribution is not None:
continuous_dist = self._continuous_distribution(inputs)
if self._discrete_distribution is not None:
discrete_dist = self._discrete_distribution(inputs, masks)
return DistInstances(continuous_dist, discrete_dist)

"""
Computes the log probabilities of the actions given distributions and entropies of
the given distributions.
:params actions: The AgentAction
:params dists: The DistInstances tuple
:return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
"""
if self.action_spec.continuous_size > 0:
# This checks None because mypy complains otherwise
if dists.continuous is not None:
if self.action_spec.discrete_size > 0:
if dists.discrete is not None:
actions.discrete_list, dists.discrete
actions.discrete_list, dists.discrete # type: ignore
):
discrete_log_prob = discrete_dist.log_prob(discrete_action)
entropies_list.append(discrete_dist.entropy())

def evaluate(
self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction
) -> Tuple[ActionLogProbs, torch.Tensor]:
"""
Given actions and encoding from the network body, gets the distributions and
computes the log probabilities and entropies.
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:params actions: The AgentAction
:return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
"""
dists = self._get_dists(inputs, masks)
log_probs, entropies = self._get_probs_and_entropy(actions, dists)
# Use the sum of entropy across actions, not the mean

def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
"""
Gets the tensors corresponding to the output of the policy network to be used for
inference. Called by the Actor's forward call.
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:return: A tuple of torch tensors corresponding to the inference output
"""
if self.action_spec.continuous_size > 0:
if self.action_spec.continuous_size > 0 and dists.continuous is not None:
if self.action_spec.discrete_size > 0:
if self.action_spec.discrete_size > 0 and dists.discrete is not None:
discrete_out = [
discrete_dist.exported_model_output()
for discrete_dist in dists.discrete

def forward(
self, inputs: torch.Tensor, masks: torch.Tensor
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]:
"""
The forward method of this module. Outputs the action, log probs,
and entropies given the encoding from the network body.