
Merge branch 'develop-action-buffer' into develop-hybrid-actions-singleton

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
498b1ee6
113 files changed, with 1577 insertions and 932 deletions
  1. .github/ISSUE_TEMPLATE/bug_report.md (2 changes)
  2. .github/workflows/pytest.yml (4 changes)
  3. .yamato/test_versions.metafile (11 changes)
  4. .yamato/training-int-tests.yml (2 changes)
  5. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs (22 changes)
  6. README.md (8 changes)
  7. com.unity.ml-agents/CHANGELOG.md (11 changes)
  8. com.unity.ml-agents/Runtime/Academy.cs (27 changes)
  9. com.unity.ml-agents/Runtime/Agent.cs (24 changes)
  10. com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (56 changes)
  11. com.unity.ml-agents/package.json (2 changes)
  12. docs/Background-Machine-Learning.md (2 changes)
  13. docs/Getting-Started.md (10 changes)
  14. docs/Installation.md (24 changes)
  15. docs/Learning-Environment-Executable.md (4 changes)
  16. docs/ML-Agents-Overview.md (8 changes)
  17. docs/Readme.md (2 changes)
  18. docs/Training-Configuration-File.md (2 changes)
  19. docs/Training-ML-Agents.md (35 changes)
  20. docs/Training-on-Amazon-Web-Service.md (2 changes)
  21. docs/Unity-Inference-Engine.md (5 changes)
  22. gym-unity/gym_unity/envs/__init__.py (21 changes)
  23. gym-unity/gym_unity/tests/test_gym.py (8 changes)
  24. ml-agents-envs/mlagents_envs/base_env.py (192 changes)
  25. ml-agents-envs/mlagents_envs/environment.py (51 changes)
  26. ml-agents-envs/mlagents_envs/rpc_utils.py (18 changes)
  27. ml-agents-envs/mlagents_envs/tests/test_envs.py (18 changes)
  28. ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (30 changes)
  29. ml-agents-envs/mlagents_envs/tests/test_steps.py (69 changes)
  30. ml-agents/mlagents/tf_utils/__init__.py (1 change)
  31. ml-agents/mlagents/tf_utils/tf.py (63 changes)
  32. ml-agents/mlagents/torch_utils/__init__.py (1 change)
  33. ml-agents/mlagents/torch_utils/torch.py (66 changes)
  34. ml-agents/mlagents/trainers/agent_processor.py (19 changes)
  35. ml-agents/mlagents/trainers/buffer.py (2 changes)
  36. ml-agents/mlagents/trainers/cli_utils.py (11 changes)
  37. ml-agents/mlagents/trainers/demo_loader.py (22 changes)
  38. ml-agents/mlagents/trainers/env_manager.py (15 changes)
  39. ml-agents/mlagents/trainers/ghost/trainer.py (16 changes)
  40. ml-agents/mlagents/trainers/learn.py (6 changes)
  41. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (6 changes)
  42. ml-agents/mlagents/trainers/policy/policy.py (47 changes)
  43. ml-agents/mlagents/trainers/policy/tf_policy.py (19 changes)
  44. ml-agents/mlagents/trainers/policy/torch_policy.py (86 changes)
  45. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (17 changes)
  46. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (14 changes)
  47. ml-agents/mlagents/trainers/ppo/trainer.py (40 changes)
  48. ml-agents/mlagents/trainers/sac/optimizer_tf.py (8 changes)
  49. ml-agents/mlagents/trainers/sac/optimizer_torch.py (71 changes)
  50. ml-agents/mlagents/trainers/sac/trainer.py (61 changes)
  51. ml-agents/mlagents/trainers/settings.py (2 changes)
  52. ml-agents/mlagents/trainers/simple_env_manager.py (3 changes)
  53. ml-agents/mlagents/trainers/stats.py (86 changes)
  54. ml-agents/mlagents/trainers/subprocess_env_manager.py (5 changes)
  55. ml-agents/mlagents/trainers/tests/mock_brain.py (71 changes)
  56. ml-agents/mlagents/trainers/tests/simple_test_envs.py (42 changes)
  57. ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py (46 changes)
  58. ml-agents/mlagents/trainers/tests/tensorflow/test_models.py (5 changes)
  59. ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py (8 changes)
  60. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (12 changes)
  61. ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (9 changes)
  62. ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (4 changes)
  63. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (8 changes)
  64. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (27 changes)
  65. ml-agents/mlagents/trainers/tests/test_agent_processor.py (47 changes)
  66. ml-agents/mlagents/trainers/tests/test_demo_loader.py (10 changes)
  67. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (6 changes)
  68. ml-agents/mlagents/trainers/tests/test_stats.py (20 changes)
  69. ml-agents/mlagents/trainers/tests/test_trajectory.py (8 changes)
  70. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (9 changes)
  71. ml-agents/mlagents/trainers/tests/torch/test_ghost.py (46 changes)
  72. ml-agents/mlagents/trainers/tests/torch/test_networks.py (41 changes)
  73. ml-agents/mlagents/trainers/tests/torch/test_policy.py (40 changes)
  74. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (33 changes)
  75. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (34 changes)
  76. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py (18 changes)
  77. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py (29 changes)
  78. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_rnd.py (25 changes)
  79. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (17 changes)
  80. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (6 changes)
  81. ml-agents/mlagents/trainers/tests/torch/test_utils.py (19 changes)
  82. ml-agents/mlagents/trainers/tf/components/bc/model.py (2 changes)
  83. ml-agents/mlagents/trainers/tf/components/bc/module.py (10 changes)
  84. ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/model.py (2 changes)
  85. ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py (10 changes)
  86. ml-agents/mlagents/trainers/tf/components/reward_signals/gail/model.py (2 changes)
  87. ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py (17 changes)
  88. ml-agents/mlagents/trainers/tf/model_serialization.py (4 changes)
  89. ml-agents/mlagents/trainers/torch/components/bc/module.py (29 changes)
  90. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (29 changes)
  91. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (9 changes)
  92. ml-agents/mlagents/trainers/torch/components/reward_providers/rnd_reward_provider.py (1 change)
  93. ml-agents/mlagents/trainers/torch/distributions.py (6 changes)
  94. ml-agents/mlagents/trainers/torch/model_serialization.py (4 changes)
  95. ml-agents/mlagents/trainers/torch/networks.py (82 changes)
  96. ml-agents/mlagents/trainers/torch/utils.py (231 changes)
  97. ml-agents/mlagents/trainers/trainer/rl_trainer.py (34 changes)
  98. ml-agents/mlagents/trainers/trainer/trainer_factory.py (18 changes)
  99. ml-agents/mlagents/trainers/trainer_controller.py (10 changes)
  100. ml-agents/mlagents/trainers/training_status.py (12 changes)

2
.github/ISSUE_TEMPLATE/bug_report.md


- Unity Version: [e.g. Unity 2020.1f1]
- OS + version: [e.g. Windows 10]
- _ML-Agents version_: (e.g. ML-Agents v0.8, or latest `develop` branch from source)
- _TensorFlow version_: (you can run `pip3 show tensorflow` to get this)
- _Torch version_: (you can run `pip3 show torch` to get this)
- _Environment_: (which example environment you used to reproduce the error)
**NOTE:** We are unable to help reproduce bugs with custom environments. Please attempt to reproduce your issue with one of the example environments, or provide a minimal patch to one of the environments needed to reproduce the issue.

4
.github/workflows/pytest.yml


python -m pip install --progress-bar=off -r test_requirements.txt -c ${{ matrix.pip_constraints }}
python -m pip install --progress-bar=off -e ./gym-unity -c ${{ matrix.pip_constraints }}
- name: Save python dependencies
run: pip freeze > pip_versions-${{ matrix.python-version }}.txt
run: |
pip freeze > pip_versions-${{ matrix.python-version }}.txt
cat pip_versions-${{ matrix.python-version }}.txt
- name: Run pytest
run: pytest --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=junit/test-results-${{ matrix.python-version }}.xml -p no:warnings
- name: Upload pytest test results

11
.yamato/test_versions.metafile


# List of editor versions for standalone-build-test and its dependencies.
# csharp_backcompat_version is used in training-int-tests to determine the
# older package version to run the backwards compat tests against.
csharp_backcompat_version: 1.0.0
csharp_backcompat_version: 1.0.0
# Waiting on a barracuda fix, see https://jira.unity3d.com/browse/MLA-1464
# - version: 2020.2
csharp_backcompat_version: 1.0.0
- version: 2020.2
# 2020.2 moved the AssetImporters namespace
# but we didn't handle this until 1.2.0
csharp_backcompat_version: 1.2.0

2
.yamato/training-int-tests.yml


# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.16.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=1.0.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp={{ editor.csharp_backcompat_version }}
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:

22
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs


var bp = m_Agent.GetComponent<BehaviorParameters>();
var behaviorName = bp.BehaviorName;
var nnModel = GetModelForBehaviorName(behaviorName);
NNModel nnModel = null;
try
{
nnModel = GetModelForBehaviorName(behaviorName);
}
catch (Exception e)
{
overrideError = $"Exception calling GetModelForBehaviorName: {e}";
}
overrideError =
$"Didn't find a model for behaviorName {behaviorName}. Make " +
$"sure the behaviorName is set correctly in the commandline " +
$"and that the model file exists";
if (string.IsNullOrEmpty(overrideError))
{
overrideError =
$"Didn't find a model for behaviorName {behaviorName}. Make " +
"sure the behaviorName is set correctly in the commandline " +
"and that the model file exists";
}
}
else
{

8
README.md


**The Unity Machine Learning Agents Toolkit** (ML-Agents) is an open-source
project that enables games and simulations to serve as environments for
training intelligent agents. Agents can be trained using reinforcement learning,
imitation learning, neuroevolution, or other machine learning methods through a
simple-to-use Python API. We also provide implementations (based on TensorFlow)
training intelligent agents. We provide implementations (based on PyTorch)
train intelligent agents for 2D, 3D and VR/AR games. These trained agents can be
train intelligent agents for 2D, 3D and VR/AR games. Researchers can also use the
provided simple-to-use Python API to train Agents using reinforcement learning,
imitation learning, neuroevolution, or any other methods. These trained agents can be
used for multiple purposes, including controlling NPC behavior (in a variety of
settings such as multi-agent and adversarial), automated testing of game builds
and evaluating different game design decisions pre-release. The ML-Agents

11
com.unity.ml-agents/CHANGELOG.md


### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- PyTorch trainers are now the default. See the
[installation docs](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md) for
more information on installing PyTorch. For the time being, TensorFlow is still available;
you can use the TensorFlow backend by adding `--tensorflow` to the CLI, or
adding `framework: tensorflow` in the configuration YAML. (#4517)
- The Barracuda dependency was upgraded to 1.1.2 (#4571)
- The `action_probs` node is no longer listed as an output in TensorFlow models (#4613).
- `Agent.CollectObservations()` and `Agent.EndEpisode()` will now throw an exception
if they are called recursively (for example, if they call `Agent.EndEpisode()`).
Previously, this would result in an infinite loop and cause the editor to hang. (#4573)
- Fixed an issue where runs could not be resumed when using TensorFlow and Ghost Training. (#4593)
## [1.5.0-preview] - 2020-10-14

27
com.unity.ml-agents/Runtime/Academy.cs


// Flag used to keep track of the first time the Academy is reset.
bool m_HadFirstReset;
// Whether the Academy is in the middle of a step. This is used to detect and Academy
// step called by user code that is also called by the Academy.
bool m_IsStepping;
// Detect an Academy step called by user code that is also called by the Academy.
private RecursionChecker m_StepRecursionChecker = new RecursionChecker("EnvironmentStep");
// Random seed used for inference.
int m_InferenceSeed;

/// </summary>
public void EnvironmentStep()
{
// Check whether we're already in the middle of a step.
// This shouldn't happen generally, but could happen if user code (e.g. CollectObservations)
// that is called by EnvironmentStep() also calls EnvironmentStep(). This would result
// in an infinite loop and/or stack overflow, so stop it before it happens.
if (m_IsStepping)
{
throw new UnityAgentsException(
"Academy.EnvironmentStep() called recursively. " +
"This might happen if you call EnvironmentStep() from custom code such as " +
"CollectObservations() or OnActionReceived()."
);
}
m_IsStepping = true;
try
using (m_StepRecursionChecker.Start())
{
if (!m_HadFirstReset)
{

{
AgentAct?.Invoke();
}
}
finally
{
// Reset m_IsStepping when we're done (or if an exception occurred).
m_IsStepping = false;
}
}

24
com.unity.ml-agents/Runtime/Agent.cs


/// </summary>
internal VectorSensor collectObservationsSensor;
private RecursionChecker m_CollectObservationsChecker = new RecursionChecker("CollectObservations");
private RecursionChecker m_OnEpisodeBeginChecker = new RecursionChecker("OnEpisodeBegin");
/// <summary>
/// List of IActuators that this Agent will delegate actions to if any exist.
/// </summary>

// episode when initializing until after the Academy had its first reset.
if (Academy.Instance.TotalStepCount != 0)
{
OnEpisodeBegin();
using (m_OnEpisodeBeginChecker.Start())
{
OnEpisodeBegin();
}
}
}

{
// Make sure the latest observations are being passed to training.
collectObservationsSensor.Reset();
CollectObservations(collectObservationsSensor);
using (m_CollectObservationsChecker.Start())
{
CollectObservations(collectObservationsSensor);
}
}
// Request the last decision with no callbacks
// We request a decision so Python knows the Agent is done immediately

UpdateSensors();
using (TimerStack.Instance.Scoped("CollectObservations"))
{
CollectObservations(collectObservationsSensor);
using (m_CollectObservationsChecker.Start())
{
CollectObservations(collectObservationsSensor);
}
}
using (TimerStack.Instance.Scoped("CollectDiscreteActionMasks"))
{

{
ResetData();
m_StepCount = 0;
OnEpisodeBegin();
using (m_OnEpisodeBeginChecker.Start())
{
OnEpisodeBegin();
}
}
/// <summary>

56
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


}
}
}
[TestFixture]
public class AgentRecursionTests
{
[SetUp]
public void SetUp()
{
if (Academy.IsInitialized)
{
Academy.Instance.Dispose();
}
}
class CollectObsEndEpisodeAgent : Agent
{
public override void CollectObservations(VectorSensor sensor)
{
// NEVER DO THIS IN REAL CODE!
EndEpisode();
}
}
class OnEpisodeBeginEndEpisodeAgent : Agent
{
public override void OnEpisodeBegin()
{
// NEVER DO THIS IN REAL CODE!
EndEpisode();
}
}
void TestRecursiveThrows<T>() where T : Agent
{
var gameObj = new GameObject();
var agent = gameObj.AddComponent<T>();
agent.LazyInitialize();
agent.RequestDecision();
Assert.Throws<UnityAgentsException>(() =>
{
Academy.Instance.EnvironmentStep();
});
}
[Test]
public void TestRecursiveCollectObsEndEpisodeThrows()
{
TestRecursiveThrows<CollectObsEndEpisodeAgent>();
}
[Test]
public void TestRecursiveOnEpisodeBeginEndEpisodeThrows()
{
TestRecursiveThrows<OnEpisodeBeginEndEpisodeAgent>();
}
}
}

2
com.unity.ml-agents/package.json


"unity": "2018.4",
"description": "Use state-of-the-art machine learning to create intelligent character behaviors in any Unity environment (games, robotics, film, etc.).",
"dependencies": {
"com.unity.barracuda": "1.1.1-preview",
"com.unity.barracuda": "1.1.2-preview",
"com.unity.modules.imageconversion": "1.0.0",
"com.unity.modules.jsonserialize": "1.0.0",
"com.unity.modules.physics": "1.0.0",

2
docs/Background-Machine-Learning.md


one where the number of observations an agent perceives and the number of
actions they can take are large). Many of the algorithms we provide in ML-Agents
use some form of deep learning, built on top of the open-source library,
[TensorFlow](Background-TensorFlow.md).
[PyTorch](Background-PyTorch.md).

10
docs/Getting-Started.md


## Running a pre-trained model
We include pre-trained models for our agents (`.nn` files) and we use the
We include pre-trained models for our agents (`.onnx` files) and we use the
[Unity Inference Engine](Unity-Inference-Engine.md) to run these models inside
Unity. In this section, we will use the pre-trained model for the 3D Ball
example.

## Training a new model with Reinforcement Learning
While we provide pre-trained `.nn` files for the agents in this environment, any
While we provide pre-trained models for the agents in this environment, any
environment you make yourself will require training agents from scratch to
generate a new model file. In this section we will demonstrate how to use the
reinforcement learning algorithms that are part of the ML-Agents Python package

use it with compatible Agents (the Agents that generated the model). **Note:**
Do not just close the Unity Window once the `Saved Model` message appears.
Either wait for the training process to close the window or press `Ctrl+C` at
the command-line prompt. If you close the window manually, the `.nn` file
the command-line prompt. If you close the window manually, the `.onnx` file
containing the trained model is not exported into the ml-agents folder.
If you've quit the training early using `Ctrl+C` and want to resume training,

mlagents-learn config/ppo/3DBall.yaml --run-id=first3DBallRun --resume
```
Your trained model will be at `results/<run-identifier>/<behavior_name>.nn` where
Your trained model will be at `results/<run-identifier>/<behavior_name>.onnx` where
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding
to the model. This file corresponds to your model's latest checkpoint. You can
now embed this trained model into your Agents by following the steps below,

`Project/Assets/ML-Agents/Examples/3DBall/TFModels/`.
1. Open the Unity Editor, and select the **3DBall** scene as described above.
1. Select the **3DBall** prefab Agent object.
1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
1. Drag the `<behavior_name>.onnx` file from the Project window of the Editor to
the **Model** placeholder in the **Ball3DAgent** inspector window.
1. Press the **Play** button at the top of the Editor.

24
docs/Installation.md


[instructions](https://packaging.python.org/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers)
on installing it.
Although we do not provide support for Anaconda installation on Windows, the
previous
[Windows Anaconda Installation (Deprecated) guide](Installation-Anaconda-Windows.md)
is still available.
### Clone the ML-Agents Toolkit Repository (Optional)
Now that you have installed Unity and Python, you can now install the Unity and

dependencies for each project and are supported on Mac / Windows / Linux. We
offer a dedicated [guide on Virtual Environments](Using-Virtual-Environment.md).
#### (Windows) Installing PyTorch
On Windows, you'll have to install the PyTorch package separately prior to
installing ML-Agents. Activate your virtual environment and run from the command line:
```sh
pip3 install torch==1.7.0 -f https://download.pytorch.org/whl/torch_stable.html
```
Note that on Windows, you may also need Microsoft's
[Visual C++ Redistributable](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads)
if you don't have it already. See the [PyTorch installation guide](https://pytorch.org/get-started/locally/)
for more installation options and versions.
#### Installing `mlagents`
To install the `mlagents` Python package, activate your virtual environment and
run from the command line:

By installing the `mlagents` package, the dependencies listed in the
[setup.py file](../ml-agents/setup.py) are also installed. These include
[TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support).
[PyTorch](Background-PyTorch.md) (Requires a CPU w/ AVX support).
#### Advanced: Local Installation for Development

the repository's root directory, run:
```sh
pip3 install torch -f https://download.pytorch.org/whl/torch_stable.html
pip3 install -e ./ml-agents-envs
pip3 install -e ./ml-agents
```

4
docs/Learning-Environment-Executable.md


```
You can press Ctrl+C to stop the training, and your trained model will be at
`results/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
`results/<run-identifier>/<behavior_name>.onnx`, which corresponds to your model's
latest checkpoint. (**Note:** There is a known bug on Windows that causes the
saving of the model to fail when you early terminate the training, it's
recommended to wait until Step has reached the max_steps parameter you set in

`Project/Assets/ML-Agents/Examples/3DBall/TFModels/`.
1. Open the Unity Editor, and select the **3DBall** scene as described above.
1. Select the **3DBall** prefab from the Project window and select **Agent**.
1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
1. Drag the `<behavior_name>.onnx` file from the Project window of the Editor to
the **Model** placeholder in the **Ball3DAgent** inspector window.
1. Press the **Play** button at the top of the Editor.

8
docs/ML-Agents-Overview.md


for training intelligent agents. Agents can be trained using reinforcement
learning, imitation learning, neuroevolution, or other machine learning methods
through a simple-to-use Python API. We also provide implementations (based on
TensorFlow) of state-of-the-art algorithms to enable game developers and
PyTorch) of state-of-the-art algorithms to enable game developers and
hobbyists to easily train intelligent agents for 2D, 3D and VR/AR games. These
trained agents can be used for multiple purposes, including controlling NPC
behavior (in a variety of settings such as multi-agent and adversarial),

that include overviews and helpful resources on the
[Unity Engine](Background-Unity.md),
[machine learning](Background-Machine-Learning.md) and
[TensorFlow](Background-TensorFlow.md). We **strongly** recommend browsing the
[PyTorch](Background-PyTorch.md). We **strongly** recommend browsing the
machine learning concepts or have not previously heard of TensorFlow.
machine learning concepts or have not previously heard of PyTorch.
The remainder of this page contains a deep dive into ML-Agents, its key
components, different training modes and scenarios. By the end of it, you should

### Custom Training and Inference
In the previous mode, the Agents were used for training to generate a TensorFlow
In the previous mode, the Agents were used for training to generate a PyTorch
model that the Agents can later use. However, any user of the ML-Agents Toolkit
can leverage their own algorithms for training. In this case, the behaviors of
all the Agents in the scene will be controlled within Python. You can even turn

2
docs/Readme.md


- [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
- [Background: Unity](Background-Unity.md)
- [Background: Machine Learning](Background-Machine-Learning.md)
- [Background: TensorFlow](Background-TensorFlow.md)
- [Background: PyTorch](Background-PyTorch.md)
- [Example Environments](Learning-Environment-Examples.md)
## Creating Learning Environments

2
docs/Training-Configuration-File.md


| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count. <br><br>Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the checkpoint_interval option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
| `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.nn` (and `.onnx` if applicable) files in `results/` folder.|
| `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` (and `.nn` if using TensorFlow) files in `results/` folder.|
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |

35
docs/Training-ML-Agents.md


- [Curriculum Learning](#curriculum)
- [Training with a Curriculum](#training-with-a-curriculum)
- [Training Using Concurrent Unity Instances](#training-using-concurrent-unity-instances)
- [Using PyTorch (Experimental)](#using-pytorch-experimental)
For a broad overview of reinforcement learning, imitation learning and all the
training scenarios, methods and options within the ML-Agents Toolkit, see

values. See [Using TensorBoard](Using-Tensorboard.md) for more details on how
to visualize the training metrics.
1. Models: these contain the model checkpoints that
are updated throughout training and the final model file (`.nn`). This final
are updated throughout training and the final model file (`.onnx`). This final
model file is generated once either when training completes or is
interrupted.
1. Timers file (under `results/<run-identifier>/run_logs`): this contains aggregated

- **Result Variation Using Concurrent Unity Instances** - If you keep all the
hyperparameters the same, but change `--num-envs=<n>`, the results and model
would likely change.
### Using PyTorch (Experimental)
ML-Agents, by default, uses TensorFlow as its backend, but experimental support
for PyTorch has been added. To use PyTorch, the `torch` Python package must
be installed, and PyTorch must be enabled for your trainer.
#### Installing PyTorch
If you've already installed ML-Agents, follow the
[official PyTorch install instructions](https://pytorch.org/get-started/locally/) for
your platform and configuration. Note that on Windows, you may also need Microsoft's
[Visual C++ Redistributable](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads) if you don't have it already.
If you're installing or upgrading ML-Agents on Linux or Mac, you can also run
`pip3 install mlagents[torch]` instead of `pip3 install mlagents`
during [installation](Installation.md). On Windows, install ML-Agents first and then
separately install PyTorch.
#### Enabling PyTorch
PyTorch can be enabled in one of two ways. First, by adding `--torch` to the
`mlagents-learn` command. This will make all behaviors train with PyTorch.
Second, by changing the `framework` option for your agent behavior in the
configuration YAML as below. This will use PyTorch just for that behavior.
```yaml
behaviors:
YourAgentBehavior:
framework: pytorch
```

2
docs/Training-on-Amazon-Web-Service.md


# Download and install the latest Nvidia driver for ubuntu
# Please refer to http://download.nvidia.com/XFree86/Linux-x86_64/latest.txt
$ wget http://download.nvidia.com/XFree86/Linux-x86_64/390.87/NVIDIA-Linux-x86_64-390.87.run
$ sudo /bin/bash ./NVIDIA-Linux-x86_64-390.67.run --accept-license --no-questions --ui=none
$ sudo /bin/bash ./NVIDIA-Linux-x86_64-390.87.run --accept-license --no-questions --ui=none
# Disable Nouveau as it will clash with the Nvidia driver
$ sudo echo 'blacklist nouveau' | sudo tee -a /etc/modprobe.d/blacklist.conf

5
docs/Unity-Inference-Engine.md


[industry-standard open format](https://onnx.ai/about.html) produced by the
[tf2onnx package](https://github.com/onnx/tensorflow-onnx).
Export to ONNX is currently considered beta. To enable it, make sure
`tf2onnx>=1.5.5` is installed in pip. tf2onnx does not currently support
tensorflow 2.0.0 or later, or earlier than 1.12.0.
Export to ONNX is used if using PyTorch (the default). To enable it
while using TensorFlow, make sure `tf2onnx>=1.6.1` is installed in pip.
## Using the Unity Inference Engine

21
gym-unity/gym_unity/envs/__init__.py


self._previous_decision_step = decision_steps
# Set action spaces
if self.group_spec.is_action_discrete():
branches = self.group_spec.discrete_action_branches
if self.group_spec.action_size == 1:
if self.group_spec.action_spec.is_discrete():
self.action_size = self.group_spec.action_spec.discrete_size
branches = self.group_spec.action_spec.discrete_branches
if self.group_spec.action_spec.discrete_size == 1:
self._action_space = spaces.Discrete(branches[0])
else:
if flatten_branched:

self._action_space = spaces.MultiDiscrete(branches)
else:
elif self.group_spec.action_spec.is_continuous():
high = np.array([1] * self.group_spec.action_shape)
self.action_size = self.group_spec.action_spec.continuous_size
high = np.array([1] * self.group_spec.action_spec.continuous_size)
else:
raise UnityGymException(
"The gym wrapper does not provide explicit support for both discrete "
"and continuous actions."
)
# Set observations space
list_spaces: List[gym.Space] = []

# Translate action into list
action = self._flattener.lookup_action(action)
spec = self.group_spec
action = np.array(action).reshape((1, spec.action_size))
action = np.array(action).reshape((1, self.action_size))
self._env.set_actions(self.name, action)
self._env.step()
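
Since the wrapper's action-space handling changes here, a brief usage sketch may help. It assumes a single-behavior Unity build at a placeholder path and uses the flatten_branched option visible above; treat the exact constructor arguments as an assumption rather than a guaranteed API.

```python
# Hypothetical usage sketch: wrap a Unity build in the gym interface.
# The file_name path is a placeholder; flatten_branched collapses
# multi-discrete branches into a single Discrete space (see the code above).
from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper

unity_env = UnityEnvironment(file_name="./builds/3DBall")  # placeholder path
env = UnityToGymWrapper(unity_env, flatten_branched=True)

obs = env.reset()
for _ in range(10):
    action = env.action_space.sample()          # Discrete, MultiDiscrete, or Box
    obs, reward, done, info = env.step(action)  # gym-style step tuple
    if done:
        obs = env.reset()
env.close()
```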

8
gym-unity/gym_unity/tests/test_gym.py


from gym_unity.envs import UnityToGymWrapper
from mlagents_envs.base_env import (
BehaviorSpec,
ActionType,
ActionSpec,
DecisionSteps,
TerminalSteps,
BehaviorMapping,

Creates a mock BrainParameters object with parameters.
"""
# Avoid using mutable object as default param
act_type = ActionType.DISCRETE
act_type = ActionType.CONTINUOUS
action_spec = ActionSpec.create_continuous(vector_action_space_size)
action_spec = ActionSpec.create_discrete(vector_action_space_size)
return BehaviorSpec(obs_shapes, act_type, vector_action_space_size)
return BehaviorSpec(obs_shapes, action_spec)
def create_mock_vector_steps(specs, num_agents=1, number_visual_observations=0):

192
ml-agents-envs/mlagents_envs/base_env.py


NamedTuple,
Tuple,
Optional,
Union,
Dict,
Iterator,
Any,

from enum import Enum
from mlagents_envs.exception import UnityActionException
AgentId = int
BehaviorName = str

)
class ActionType(Enum):
DISCRETE = 0
CONTINUOUS = 1
HYBRID = 2
class ActionBuffers(NamedTuple):
"""
A NamedTuple whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays.
"""
continuous: np.ndarray # dims (n_agents, cont_size)
discrete: np.ndarray # dims (n_agents, disc_size)
num_continuous_actions: int
discrete_branch_sizes: Tuple[int]
"""
A NamedTuple containing utility functions and information about the action spaces
for a group of Agents under the same behavior.
- num_continuous_actions is an int corresponding to the number of floats which
constitute the action.
- discrete_branch_sizes is a Tuple of int where each int corresponds to
the number of discrete actions available to the agent on an independent action branch.
"""
continuous_size: int
discrete_branches: Tuple[int, ...]
def __eq__(self, other):
return (
self.continuous_size == other.continuous_size
and self.discrete_branches == other.discrete_branches
)
def __str__(self):
return f"Continuous: {self.continuous_size}, Discrete: {self.discrete_branches}"
def is_action_discrete(self) -> bool:
def is_discrete(self) -> bool:
return self.discrete_action_size > 0
return self.discrete_size > 0 and self.continuous_size == 0
def is_action_continuous(self) -> bool:
def is_continuous(self) -> bool:
return self.continuous_action_size > 0
return self.discrete_size == 0 and self.continuous_size > 0
def discrete_action_branches(self) -> Optional[Tuple[int, ...]]:
return self.discrete_branch_sizes # type: ignore
def discrete_size(self) -> int:
"""
Returns an int corresponding to the number of discrete branches.
"""
return len(self.discrete_branches)
@property
def discrete_action_size(self) -> int:
return len(self.discrete_branch_sizes)
def empty_action(self, n_agents: int) -> ActionBuffers:
"""
Generates ActionBuffers corresponding to an empty action (all zeros)
for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous: np.ndarray = None
discrete: np.ndarray = None
if self.continuous_size > 0:
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
@property
def continuous_action_size(self) -> int:
return self.num_continuous_actions
if self.discrete_size > 0:
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionBuffers(continuous, discrete)
@property
def action_size(self) -> int:
return self.discrete_action_size + self.continuous_action_size
def random_action(self, n_agents: int) -> ActionBuffers:
"""
Generates ActionBuffers corresponding to a random action (either discrete
or continuous) for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous: np.ndarray = None
discrete: np.ndarray = None
if self.continuous_size > 0:
continuous = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
).astype(np.float32)
def create_empty_action(self, n_agents: int) -> Tuple[np.ndarray, np.ndarray]:
return ActionBuffer(
np.zeros((n_agents, self.continuous_action_size), dtype=np.float32),
np.zeros((n_agents, self.discrete_action_size), dtype=np.int32),
)
if self.discrete_size > 0:
discrete = np.column_stack(
[
np.random.randint(
0,
self.discrete_branches[i], # type: ignore
size=(n_agents),
dtype=np.int32,
)
for i in range(self.discrete_size)
]
)
return ActionBuffers(continuous, discrete)
def create_random_action(self, n_agents: int) -> np.ndarray:
continuous_action = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_action_size)
).astype(np.float32)
def _validate_action(
self, actions: ActionBuffers, n_agents: int, name: str
) -> ActionBuffers:
"""
Validates that action has the correct action dim
for the correct number of agents and ensures the type.
"""
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {actions.continuous.shape}"
)
if actions.continuous.dtype != np.float32:
actions.continuous = actions.continuous.astype(np.float32)
branch_size = self.discrete_action_branches
discrete_action = np.column_stack(
[
np.random.randint(
0,
branch_size[i], # type: ignore
size=(n_agents),
dtype=np.int32,
)
for i in range(self.discrete_action_size)
]
)
return ActionBuffer(continuous_action, discrete_action)
_expected_shape = (n_agents, self.discrete_size)
if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a discrete input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {actions.discrete.shape}"
)
if actions.discrete.dtype != np.int32:
actions.discrete = actions.discrete.astype(np.int32)
return actions
@staticmethod
def create_continuous(continuous_size: int) -> "ActionSpec":
"""
Creates an ActionSpec that is homogenously continuous
"""
return ActionSpec(continuous_size, ())
@staticmethod
def create_discrete(discrete_branches: Tuple[int]) -> "ActionSpec":
"""
Creates an ActionSpec that is homogenously discrete
"""
return ActionSpec(0, discrete_branches)
"""
A NamedTuple containing information about the observation and action
spaces for a group of Agents under the same behavior.
- observation_shapes is a List of Tuples of int : Each Tuple corresponds
to an observation's dimensions. The shape tuples have the same ordering as
the ordering of the DecisionSteps and TerminalSteps.
- action_spec is an ActionSpec NamedTuple
"""
class BehaviorMapping(Mapping):
def __init__(self, specs: Dict[BehaviorName, BehaviorSpec]):
self._dict = specs

"""
@abstractmethod
def set_actions(
self, behavior_name: BehaviorName, action: Union[ActionBuffer, np.ndarray]
) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionBuffers) -> None:
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionBuffers tuple of continuous and/or discrete action
self,
behavior_name: BehaviorName,
agent_id: AgentId,
action: Union[ActionBuffer, np.ndarray],
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionBuffers
) -> None:
"""
Sets the action for one of the agents in the simulation for the next

:param action: A one dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionBuffers tuple of continuous and/or discrete action
"""
@abstractmethod
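
To make the new ActionSpec/ActionBuffers surface above concrete, here is a small Python sketch that sticks to the names visible in this diff (create_continuous, create_discrete, empty_action, random_action, and the continuous/discrete fields). The sizes are arbitrary, and the snippet assumes the branch state shown here rather than any released API.

```python
# Sketch of the ActionSpec / ActionBuffers API introduced in this diff.
from mlagents_envs.base_env import ActionSpec

# Purely continuous: 3 floats per agent.
cont_spec = ActionSpec.create_continuous(3)
buf = cont_spec.empty_action(n_agents=5)
assert buf.continuous.shape == (5, 3)      # zeros, float32
assert buf.discrete is None                # no discrete branches

# Purely discrete: two branches with 7 and 3 choices.
disc_spec = ActionSpec.create_discrete((7, 3))
buf = disc_spec.random_action(n_agents=5)
assert buf.discrete.shape == (5, 2)        # one column per branch, int32

# Hybrid: 2 continuous dimensions plus one 4-way discrete branch.
hybrid_spec = ActionSpec(2, (4,))
buf = hybrid_spec.random_action(n_agents=1)
print(buf.continuous, buf.discrete)
```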

51
ml-agents-envs/mlagents_envs/environment.py


DecisionSteps,
TerminalSteps,
BehaviorSpec,
ActionBuffers,
BehaviorName,
AgentId,
BehaviorMapping,

self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._env_actions: Dict[str, np.ndarray] = {}
self._env_actions: Dict[str, ActionBuffers] = {}
self._is_first_message = True
self._update_behavior_specs(aca_output)

n_agents = len(self._env_state[group_name][0])
self._env_actions[group_name] = self._env_specs[
group_name
].create_empty_action(n_agents)
].action_spec.empty_action(n_agents)
step_input = self._generate_step_input(self._env_actions)
with hierarchical_timer("communicator.exchange"):
outputs = self._communicator.exchange(step_input)

f"agent group in the environment"
)
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionBuffers) -> None:
spec = self._env_specs[behavior_name]
expected_type = np.float32 if spec.is_action_continuous() else np.int32
expected_shape = (len(self._env_state[behavior_name][0]), spec.action_size)
if action.shape != expected_shape:
raise UnityActionException(
f"The behavior {behavior_name} needs an input of dimension "
f"{expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {action.shape}"
)
if action.dtype != expected_type:
action = action.astype(expected_type)
action_spec = self._env_specs[behavior_name].action_spec
num_agents = len(self._env_state[behavior_name][0])
action = action_spec._validate_action(action, num_agents, behavior_name)
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionBuffers
spec = self._env_specs[behavior_name]
expected_shape = (spec.action_size,)
if action.shape != expected_shape:
raise UnityActionException(
f"The Agent {agent_id} with BehaviorName {behavior_name} needs "
f"an input of dimension {expected_shape} but received input of "
f"dimension {action.shape}"
)
expected_type = np.float32 if spec.is_action_continuous() else np.int32
if action.dtype != expected_type:
action = action.astype(expected_type)
action_spec = self._env_specs[behavior_name].action_spec
num_agents = len(self._env_state[behavior_name][0])
action = action_spec._validate_action(action, num_agents, behavior_name)
self._env_actions[behavior_name] = spec.create_empty_action(
len(self._env_state[behavior_name][0])
)
self._env_actions[behavior_name] = action_spec.empty_action(num_agents)
try:
index = np.where(self._env_state[behavior_name][0].agent_id == agent_id)[0][
0

@timed
def _generate_step_input(
self, vector_action: Dict[str, np.ndarray]
self, vector_action: Dict[str, ActionBuffers]
) -> UnityInputProto:
rl_in = UnityRLInputProto()
for b in vector_action:

for i in range(n_agents):
action = AgentActionProto(vector_actions=vector_action[b][i])
# TODO: extend to AgentBuffers
if vector_action[b].continuous is not None:
_act = vector_action[b].continuous[i]
else:
_act = vector_action[b].discrete[i]
action = AgentActionProto(vector_actions=_act)
rl_in.agent_actions[b].value.extend([action])
rl_in.command = STEP
rl_in.side_channel = bytes(
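
A short sketch of how the reworked set_actions above is meant to be called: actions for a behavior are now ActionBuffers built from that behavior's ActionSpec instead of a raw np.ndarray. The environment setup and behavior selection below are assumptions for illustration, not part of this change.

```python
# Assumed usage of the ActionBuffers-based set_actions API in this diff.
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)  # placeholder: connect to the Editor
env.reset()
behavior_name = list(env.behavior_specs)[0]
action_spec = env.behavior_specs[behavior_name].action_spec

decision_steps, terminal_steps = env.get_steps(behavior_name)
# Random ActionBuffers (continuous and/or discrete) for every waiting agent.
actions = action_spec.random_action(len(decision_steps))
env.set_actions(behavior_name, actions)
env.step()
env.close()
```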

18
ml-agents-envs/mlagents_envs/rpc_utils.py


from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import numpy as np
import io
from typing import cast, List, Tuple, Union, Collection, Optional, Iterable
from typing import cast, List, Tuple, Collection, Optional, Iterable
from PIL import Image

:return: BehaviorSpec object.
"""
observation_shape = [tuple(obs.shape) for obs in agent_info.observations]
action_spec = brain_param_proto.action_spec
action_spec = ActionSpec(action_spec.num_continuous_actions,
tuple(branch for branch in action_spec.discrete_branch_sizes)
)
if brain_param_proto.vector_action_space_type == 1:
action_spec = ActionSpec(brain_param_proto.vector_action_size[0], ())
else:
action_spec = ActionSpec(0, tuple(brain_param_proto.vector_action_size))
return BehaviorSpec(observation_shape, action_spec)

[agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32
)
action_mask = None
if behavior_spec.is_action_discrete():
if behavior_spec.action_spec.discrete_size > 0:
a_size = np.sum(behavior_spec.discrete_action_branches)
a_size = np.sum(behavior_spec.action_spec.discrete_branches)
mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
for agent_index, agent_info in enumerate(decision_agent_info_list):
if agent_info.action_mask is not None:

for k in range(a_size)
]
action_mask = (1 - mask_matrix).astype(np.bool)
indices = _generate_split_indices(behavior_spec.discrete_action_branches)
indices = _generate_split_indices(
behavior_spec.action_spec.discrete_branches
)
action_mask = np.split(action_mask, indices, axis=1)
return (
DecisionSteps(

18
ml-agents-envs/mlagents_envs/tests/test_envs.py


from unittest import mock
import pytest
import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs.exception import UnityEnvironmentException, UnityActionException

env.step()
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions(
"RealFakeBrain", np.zeros((n_agents, spec.action_size), dtype=np.float32)
)
env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents))
env.set_actions(
"RealFakeBrain",
np.zeros((n_agents - 1, spec.action_size), dtype=np.float32),
)
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions(
"RealFakeBrain", -1 * np.ones((n_agents, spec.action_size), dtype=np.float32)
)
env.step()
env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
env.close()
assert isinstance(decision_steps, DecisionSteps)
assert isinstance(terminal_steps, TerminalSteps)

30
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto
from mlagents_envs.base_env import (
BehaviorSpec,
ActionType,
ActionSpec,
DecisionSteps,
TerminalSteps,
)

def test_batched_step_result_from_proto():
n_agents = 10
shapes = [(3,), (4,)]
spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
spec = BehaviorSpec(shapes, ActionSpec.create_continuous(3))
ap_list = generate_list_agent_proto(n_agents, shapes)
decision_steps, terminal_steps = steps_from_proto(ap_list, spec)
for agent_id in range(n_agents):

def test_action_masking_discrete():
n_agents = 10
shapes = [(3,), (4,)]
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (7, 3))
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_discrete((7, 3)))
ap_list = generate_list_agent_proto(n_agents, shapes)
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask

def test_action_masking_discrete_1():
n_agents = 10
shapes = [(3,), (4,)]
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (10,))
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_discrete((10,)))
ap_list = generate_list_agent_proto(n_agents, shapes)
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask

def test_action_masking_discrete_2():
n_agents = 10
shapes = [(3,), (4,)]
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (2, 2, 6))
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_discrete((2, 2, 6)))
ap_list = generate_list_agent_proto(n_agents, shapes)
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask

def test_action_masking_continuous():
n_agents = 10
shapes = [(3,), (4,)]
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 10)
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_continuous(10))
ap_list = generate_list_agent_proto(n_agents, shapes)
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask

bp.vector_action_size.extend([5, 4])
bp.vector_action_space_type = 0
behavior_spec = behavior_spec_from_proto(bp, agent_proto)
assert behavior_spec.is_action_discrete()
assert not behavior_spec.is_action_continuous()
assert behavior_spec.action_spec.is_discrete()
assert not behavior_spec.action_spec.is_continuous()
assert behavior_spec.discrete_action_branches == (5, 4)
assert behavior_spec.action_size == 2
assert behavior_spec.action_spec.discrete_branches == (5, 4)
assert behavior_spec.action_spec.discrete_size == 2
assert not behavior_spec.is_action_discrete()
assert behavior_spec.is_action_continuous()
assert behavior_spec.action_size == 6
assert not behavior_spec.action_spec.is_discrete()
assert behavior_spec.action_spec.is_continuous()
assert behavior_spec.action_spec.continuous_size == 6
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_continuous(3))
ap_list = generate_list_agent_proto(n_agents, shapes, infinite_rewards=True)
with pytest.raises(RuntimeError):
steps_from_proto(ap_list, behavior_spec)

n_agents = 10
shapes = [(3,), (4,)]
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
behavior_spec = BehaviorSpec(shapes, ActionSpec.create_continuous(3))
ap_list = generate_list_agent_proto(n_agents, shapes, nan_observations=True)
with pytest.raises(RuntimeError):
steps_from_proto(ap_list, behavior_spec)

69
ml-agents-envs/mlagents_envs/tests/test_steps.py


from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,
ActionType,
ActionSpec,
BehaviorSpec,
)

def test_empty_decision_steps():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
observation_shapes=[(3, 2), (5,)], action_spec=ActionSpec.create_continuous(3)
)
ds = DecisionSteps.empty(specs)
assert len(ds.obs) == 2

def test_empty_terminal_steps():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
observation_shapes=[(3, 2), (5,)], action_spec=ActionSpec.create_continuous(3)
)
ts = TerminalSteps.empty(specs)
assert len(ts.obs) == 2

def test_specs():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
)
assert specs.discrete_action_branches is None
assert specs.action_size == 3
assert specs.create_empty_action(5).shape == (5, 3)
assert specs.create_empty_action(5).dtype == np.float32
specs = ActionSpec.create_continuous(3)
assert specs.discrete_branches == ()
assert specs.discrete_size == 0
assert specs.continuous_size == 3
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
specs = ActionSpec.create_discrete((3,))
assert specs.discrete_branches == (3,)
assert specs.discrete_size == 1
assert specs.continuous_size == 0
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.DISCRETE,
action_shape=(3,),
)
assert specs.discrete_action_branches == (3,)
assert specs.action_size == 1
assert specs.create_empty_action(5).shape == (5, 1)
assert specs.create_empty_action(5).dtype == np.int32
specs = ActionSpec(3, (3,))
assert specs.continuous_size == 3
assert specs.discrete_branches == (3,)
assert specs.discrete_size == 1
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
specs = BehaviorSpec(
observation_shapes=[(5,)],
action_type=ActionType.CONTINUOUS,
action_shape=action_len,
)
zero_action = specs.create_empty_action(4)
specs = ActionSpec.create_continuous(action_len)
zero_action = specs.empty_action(4).continuous
random_action = specs.create_random_action(4)
random_action = specs.random_action(4).continuous
assert random_action.dtype == np.float32
assert random_action.shape == (4, action_len)
assert np.min(random_action) >= -1

action_shape = (10, 20, 30)
specs = BehaviorSpec(
observation_shapes=[(5,)],
action_type=ActionType.DISCRETE,
action_shape=action_shape,
)
zero_action = specs.create_empty_action(4)
specs = ActionSpec.create_discrete(action_shape)
zero_action = specs.empty_action(4).discrete
random_action = specs.create_random_action(4)
random_action = specs.random_action(4).discrete
assert random_action.dtype == np.int32
assert random_action.shape == (4, len(action_shape))
assert np.min(random_action) >= 0

1
ml-agents/mlagents/tf_utils/__init__.py


from mlagents.tf_utils.tf import tf as tf # noqa
from mlagents.tf_utils.tf import set_warnings_enabled # noqa
from mlagents.tf_utils.tf import generate_session_config # noqa
from mlagents.tf_utils.tf import is_available # noqa

63
ml-agents/mlagents/tf_utils/tf.py


# This should be the only place that we import tensorflow directly.
# Everywhere else is caught by the banned-modules setting for flake8
import tensorflow as tf # noqa I201
try:
import tensorflow as tf # noqa I201
# LooseVersion handles things "1.2.3a" or "4.5.6-rc7" fairly sensibly.
_is_tensorflow2 = LooseVersion(tf.__version__) >= LooseVersion("2.0.0")
# LooseVersion handles things "1.2.3a" or "4.5.6-rc7" fairly sensibly.
_is_tensorflow2 = LooseVersion(tf.__version__) >= LooseVersion("2.0.0")
if _is_tensorflow2:
import tensorflow.compat.v1 as tf
if _is_tensorflow2:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf_logging = tf.logging
else:
try:
# Newer versions of tf 1.x will complain that tf.logging is deprecated
tf_logging = tf.compat.v1.logging
except AttributeError:
# Fall back to the safe import, even if it might generate a warning or two.
tf.disable_v2_behavior()
else:
try:
# Newer versions of tf 1.x will complain that tf.logging is deprecated
tf_logging = tf.compat.v1.logging
except AttributeError:
# Fall back to the safe import, even if it might generate a warning or two.
tf_logging = tf.logging
except ImportError:
tf = None
def is_available():
"""
Returns whether TensorFlow is available in this Python environment
"""
return tf is not None
def set_warnings_enabled(is_enabled: bool) -> None:

"""
level = tf_logging.WARN if is_enabled else tf_logging.ERROR
tf_logging.set_verbosity(level)
if is_available():
level = tf_logging.WARN if is_enabled else tf_logging.ERROR
tf_logging.set_verbosity(level)
def generate_session_config() -> tf.ConfigProto:
def generate_session_config() -> "tf.ConfigProto":
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# For multi-GPU training, set allow_soft_placement to True to allow
# placing the operation into an alternative device automatically
# to prevent exceptions if the device doesn't support the operation
# or the device does not exist
config.allow_soft_placement = True
return config
if is_available():
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# For multi-GPU training, set allow_soft_placement to True to allow
# placing the operation into an alternative device automatically
# to prevent exceptions if the device doesn't support the operation
# or the device does not exist
config.allow_soft_placement = True
return config
else:
return None
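
A minimal sketch of how the new optional-TensorFlow guard above is meant to be consumed, using only the is_available and generate_session_config exports listed in tf_utils/__init__.py; the session body is a placeholder.

```python
# Sketch: TensorFlow is now optional, so callers check availability first.
from mlagents.tf_utils import tf, is_available, generate_session_config

if is_available():
    config = generate_session_config()   # tf.ConfigProto with soft placement
    with tf.Session(config=config) as sess:
        pass  # build and run a TF graph here
else:
    print("TensorFlow is not installed; falling back to the PyTorch trainers.")
```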

1
ml-agents/mlagents/torch_utils/__init__.py


from mlagents.torch_utils.torch import torch as torch # noqa
from mlagents.torch_utils.torch import nn # noqa
from mlagents.torch_utils.torch import is_available # noqa
from mlagents.torch_utils.torch import default_device # noqa

66
ml-agents/mlagents/torch_utils/torch.py


import os
from distutils.version import LooseVersion
import pkg_resources
# Detect availability of torch package here.
# NOTE: this try/except is temporary until torch is required for ML-Agents.
try:
# This should be the only place that we import torch directly.
# Everywhere else is caught by the banned-modules setting for flake8
import torch # noqa I201
torch.set_num_threads(cpu_utils.get_num_threads_to_use())
os.environ["KMP_BLOCKTIME"] = "0"
def assert_torch_installed():
# Check that torch version 1.6.0 or later has been installed. If not, refer
# user to the PyTorch webpage for install instructions.
torch_pkg = None
try:
torch_pkg = pkg_resources.get_distribution("torch")
except pkg_resources.DistributionNotFound:
pass
assert torch_pkg is not None and LooseVersion(torch_pkg.version) >= LooseVersion(
"1.6.0"
), (
"A compatible version of PyTorch was not installed. Please visit the PyTorch homepage "
+ "(https://pytorch.org/get-started/locally/) and follow the instructions to install. "
+ "Version 1.6.0 and later are supported."
)
# Known PyLint compatibility with PyTorch https://github.com/pytorch/pytorch/issues/701
# pylint: disable=E1101
if torch.cuda.is_available():
torch.set_default_tensor_type(torch.cuda.FloatTensor)
device = torch.device("cuda")
else:
torch.set_default_tensor_type(torch.FloatTensor)
device = torch.device("cpu")
nn = torch.nn
# pylint: disable=E1101
except ImportError:
torch = None
nn = None
device = None
assert_torch_installed()
# This should be the only place that we import torch directly.
# Everywhere else is caught by the banned-modules setting for flake8
import torch # noqa I201
torch.set_num_threads(cpu_utils.get_num_threads_to_use())
os.environ["KMP_BLOCKTIME"] = "0"
# Known PyLint compatibility with PyTorch https://github.com/pytorch/pytorch/issues/701
# pylint: disable=E1101
if torch.cuda.is_available():
torch.set_default_tensor_type(torch.cuda.FloatTensor)
device = torch.device("cuda")
else:
torch.set_default_tensor_type(torch.FloatTensor)
device = torch.device("cpu")
nn = torch.nn
def is_available():
"""
Returns whether Torch is available in this Python environment
"""
return torch is not None
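Illustrative sketch (not part of the commit): what the import-time setup above means for code that imports from mlagents.torch_utils. The default tensor type decides where new tensors and module parameters are allocated, and default_device is re-exported alongside torch and nn for the same purpose. Assumes PyTorch 1.6.0 or later is installed, as asserted above.

```python
# Hypothetical sketch; `torch` and `nn` come from the wrapper above rather than
# a direct `import torch`, so flake8's banned-modules check stays satisfied.
from mlagents.torch_utils import torch, nn

layer = nn.Linear(4, 2)   # parameters follow the default tensor type set above
x = torch.zeros(1, 4)     # lives on CUDA when torch.cuda.is_available(), else CPU
print(layer(x).shape)     # torch.Size([1, 2])
```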

19
ml-agents/mlagents/trainers/agent_processor.py


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
from mlagents_envs.base_env import (
DecisionSteps,

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
action_probs = stored_take_action_outputs["log_probs"][idx]
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
prev_action = self.policy.retrieve_previous_action([global_id])
prev_action_dict: Dict[str, np.ndarray] = {}
for _prev_act_type, _prev_act in prev_action.items():
prev_action_dict[_prev_act_type] = _prev_act[0, :]
experience = AgentExperience(
obs=obs,
reward=step.reward,

action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
prev_action=prev_action_dict,
interrupted=interrupted,
memory=memory,
)
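Illustrative sketch (not part of the commit): with hybrid actions the stored model outputs are dicts of batched arrays rather than single arrays, so the per-agent values above are produced by slicing every entry at the agent's index. Keys and shapes below are illustrative.

```python
import numpy as np

# Hypothetical batch of outputs for 4 agents: 2 continuous dims, 1 discrete branch.
stored_actions = {
    "continuous_action": np.zeros((4, 2), dtype=np.float32),
    "discrete_action": np.zeros((4, 1), dtype=np.int32),
}
idx = 2  # index of one agent within the batch
action = {act_type: act_array[idx] for act_type, act_array in stored_actions.items()}
```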

2
ml-agents/mlagents/trainers/buffer.py


class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
"""

11
ml-agents/mlagents/trainers/cli_utils.py


"--torch",
default=False,
action=DetectDefaultStoreTrue,
help="(Experimental) Use the PyTorch framework instead of TensorFlow. Install PyTorch "
"before using this option",
help="Use the PyTorch framework. Note that this option is not required anymore as PyTorch is the"
"default framework, and will be removed in the next release.",
)
argparser.add_argument(
"--tensorflow",
default=False,
action=DetectDefaultStoreTrue,
help="(Deprecated) Use the TensorFlow framework instead of PyTorch. Install TensorFlow "
"before using this option.",
)
eng_conf = argparser.add_argument_group(title="Engine Configuration")
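Sketch of how the two flags now interact (not part of the commit). resolve_framework and its arguments are hypothetical, but they mirror the force_torch / force_tensorflow values threaded through learn.py later in this diff, with FrameworkType.PYTORCH as the new default from settings.py.

```python
from mlagents.trainers.settings import FrameworkType

def resolve_framework(force_torch: bool, force_tensorflow: bool) -> FrameworkType:
    # Hypothetical helper: --tensorflow opts back into TF; everything else
    # falls through to the new PyTorch default.
    if force_tensorflow and not force_torch:
        return FrameworkType.TENSORFLOW
    return FrameworkType.PYTORCH

assert resolve_framework(False, False) == FrameworkType.PYTORCH
```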

22
ml-agents/mlagents/trainers/demo_loader.py


for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
if behavior_spec.action_spec.is_continuous():
demo_raw_buffer["continuous_action"].append(
current_pair_info.action_info.vector_actions
)
else:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(

demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length)
if expected_behavior_spec:
# check action dimensions in demonstration match
if behavior_spec.action_shape != expected_behavior_spec.action_shape:
if behavior_spec.action_spec != expected_behavior_spec.action_spec:
"The action dimensions {} in demonstration do not match the policy's {}.".format(
behavior_spec.action_shape, expected_behavior_spec.action_shape
)
)
# check the action types in demonstration match
if behavior_spec.action_type != expected_behavior_spec.action_type:
raise RuntimeError(
"The action type of {} in demonstration do not match the policy's {}.".format(
behavior_spec.action_type, expected_behavior_spec.action_type
"The action spaces {} in demonstration do not match the policy's {}.".format(
behavior_spec.action_spec, expected_behavior_spec.action_spec
)
)
# check observations match

15
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (
DecisionSteps,

ActionBuffers,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_buffers_from_numpy_dict(
action_dict: Dict[str, np.ndarray]
) -> ActionBuffers:
continuous: np.ndarray = None
discrete: np.ndarray = None
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
return ActionBuffers(continuous, discrete)
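Illustrative usage of the new static helper (not part of the commit); the array values are made up. The resulting ActionBuffers is what simple_env_manager.py and subprocess_env_manager.py below now hand to env.set_actions.

```python
import numpy as np
from mlagents.trainers.env_manager import EnvManager

action_dict = {
    "continuous_action": np.zeros((1, 2), dtype=np.float32),
    "discrete_action": np.array([[1]], dtype=np.int32),
}
buffers = EnvManager.action_buffers_from_numpy_dict(action_dict)
assert buffers.continuous is action_dict["continuous_action"]
assert buffers.discrete is action_dict["discrete_action"]
```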

16
ml-agents/mlagents/trainers/ghost/trainer.py


@property
def reward_buffer(self) -> Deque[float]:
"""
Returns the reward buffer. The reward buffer contains the cumulative
rewards of the most recent episodes completed by agents using this
trainer.
:return: the reward buffer.
"""
Returns the reward buffer. The reward buffer contains the cumulative
rewards of the most recent episodes completed by agents using this
trainer.
:return: the reward buffer.
"""
return self.trainer.reward_buffer
@property

policy = self.trainer.create_policy(
parsed_behavior_id, behavior_spec, create_graph=True
)
self.trainer.model_saver.initialize_or_load(policy)
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)

self._save_snapshot() # Need to save after trainer initializes policy
self._learning_team = self.controller.get_learning_team
self.wrapped_trainer_team = team_id
else:
# Load the weights of the ghost policy from the wrapped one
policy.load_weights(
self.trainer.get_policy(parsed_behavior_id).get_weights()
)
return policy
def add_policy(

6
ml-agents/mlagents/trainers/learn.py


# # Unity ML-Agents Toolkit
from mlagents import torch_utils
import yaml
import os

ml-agents: {mlagents.trainers.__version__},
ml-agents-envs: {mlagents_envs.__version__},
Communicator API: {UnityEnvironment.API_VERSION},
TensorFlow: {tf_utils.tf.__version__}"""
PyTorch: {torch_utils.torch.__version__}"""
def parse_command_line(argv: Optional[List[str]] = None) -> RunOptions:

init_path=maybe_init_path,
multi_gpu=False,
force_torch="torch" in DetectDefault.non_default_args,
force_tensorflow="tensorflow" in DetectDefault.non_default_args,
)
# Create controller and begin training.
tc = TrainerController(

add_timer_metadata("mlagents_version", mlagents.trainers.__version__)
add_timer_metadata("mlagents_envs_version", mlagents_envs.__version__)
add_timer_metadata("communication_protocol_version", UnityEnvironment.API_VERSION)
add_timer_metadata("tensorflow_version", tf_utils.tf.__version__)
add_timer_metadata("pytorch_version", torch_utils.torch.__version__)
add_timer_metadata("numpy_version", np.__version__)
if options.env_settings.seed == -1:

6
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


]
feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)]
if self.policy.prev_action is not None:
feed_dict[self.policy.prev_action] = batch["prev_action"]
feed_dict[self.policy.prev_action] = batch["prev_discrete_action"]
if self.policy.use_recurrent:
value_estimates, policy_mem, value_mem = self.sess.run(

batch["actions"][-1] if not self.policy.use_continuous_act else None
batch["discrete_action"][-1]
if not self.policy.use_continuous_act
else None
)
else:
value_estimates = self.sess.run(self.value_heads, feed_dict)

47
ml-agents/mlagents/trainers/policy/policy.py


self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
# For mixed action spaces
self.continuous_act_size = self.action_spec.continuous_action_size
self.discrete_act_size = self.action_spec.discrete_action_size
self.discrete_act_branches = self.action_spec.discrete_action_branches
list(self.action_spec.discrete_action_branches)
if self.action_spec.is_action_discrete()
else [self.action_spec.action_size]
list(self.behavior_spec.action_spec.discrete_branches)
if self.behavior_spec.action_spec.is_discrete()
else [self.behavior_spec.action_spec.continuous_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1

)
self.use_continuous_act = self.action_spec.is_action_continuous()
self.num_branches = self.action_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
self.previous_action_dict: Dict[str, Dict[str, np.ndarray]] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
def make_empty_previous_action(self, num_agents: int) -> Dict[str, np.ndarray]:
:return: Numpy array of zeros.
:return: Dict of action type to np.ndarray
return np.zeros((num_agents, self.num_branches), dtype=np.int)
act_dict: Dict[str, np.ndarray] = {}
action_buffer = self.behavior_spec.action_spec.empty_action(num_agents)
if action_buffer.continuous is not None:
act_dict["continuous_action"] = action_buffer.continuous
if action_buffer.discrete is not None:
act_dict["discrete_action"] = action_buffer.discrete
return act_dict
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
if action_matrix is None:
if action_dict is None:
self.previous_action_dict[agent_id] = action_matrix[index, :]
agent_action_dict: Dict[str, np.ndarray] = {}
for act_type in action_dict:
agent_action_dict[act_type] = action_dict[act_type][index, :]
self.previous_action_dict[agent_id] = agent_action_dict
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
def retrieve_previous_action(self, agent_ids: List[str]) -> Dict[str, np.ndarray]:
action_dict = self.make_empty_previous_action(len(agent_ids))
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
for act_type in action_dict:
action_dict[act_type][index, :] = self.previous_action_dict[
agent_id
][act_type]
return action_dict
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
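Sketch of the new previous-action bookkeeping (not part of the commit). `policy` stands for any concrete Policy built from a BehaviorSpec, so this is a fragment rather than a runnable script; the method names are the ones visible in the hunk above.

```python
# Hypothetical fragment; `policy` and the agent ids are assumptions.
empty = policy.make_empty_previous_action(3)
# -> e.g. {"discrete_action": zeros of shape (3, num_branches)} for a discrete spec

prev = policy.retrieve_previous_action(["agent-0", "agent-1"])
for act_type, arr in prev.items():
    assert arr.shape[0] == 2  # one row per requested agent id
```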

19
ml-agents/mlagents/trainers/policy/tf_policy.py


reparameterize,
condition_sigma_on_obs,
)
if self.continuous_act_size > 0 and len(self.discrete_act_size) > 0:
if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0:
raise UnityPolicyException(
"Tensorflow does not support mixed action spaces. Please run with --torch."
)

if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
)["discrete_action"]
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)

)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
if "action" in run_out:
if self.behavior_spec.action_spec.is_continuous():
run_out["action"] = {"continuous_action": run_out["action"]}
else:
run_out["action"] = {"discrete_action": run_out["action"]}
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),

mask = np.ones(
(
len(batched_step_result),
sum(self.behavior_spec.discrete_action_branches),
sum(self.behavior_spec.action_spec.discrete_branches),
),
dtype=np.float32,
)

self.mask = tf.cast(self.mask_input, tf.int32)
tf.Variable(
int(self.behavior_spec.is_action_continuous()),
int(self.behavior_spec.action_spec.is_continuous()),
name="is_continuous_control",
trainable=False,
dtype=tf.int32,

tf.Variable(
self.m_size, name="memory_size", trainable=False, dtype=tf.int32
)
if self.behavior_spec.is_action_continuous():
if self.behavior_spec.action_spec.is_continuous():
tf.Variable(
self.act_size[0],
name="action_output_shape",

86
ml-agents/mlagents/trainers/policy/torch_policy.py


SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

self.actor_critic = ac_class(
observation_shapes=self.behavior_spec.observation_shapes,
network_settings=trainer_settings.network_settings,
action_spec=self.behavior_spec.action_spec,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,

) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if self.discrete_act_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.discrete_act_branches)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(
1 - np.concatenate(decision_requests.action_mask, axis=1)
)
if not self.use_continuous_act:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(
1 - np.concatenate(decision_requests.action_mask, axis=1)
)
return vec_vis_obs, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
all_log_probs: bool = False,
torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
:param vec_obs: List of vector observations.
:param vis_obs: List of visual observations.
:param masks: Loss masks for RNN, else None.
:param memories: Input memories when using RNN, else None.
:param seq_len: Sequence length when using RNN.
:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
return (
actions,
log_probs,
entropies,
value_heads,
memories,
)
return (actions, log_probs, entropies, value_heads, memories)
# if memories is None:
# dists, memories = self.actor_critic.get_dists(
# vec_obs, vis_obs, masks, memories, seq_len
# )
# else:
# # If we're using LSTM. we need to execute the values to get the critic memories
# dists, _, memories = self.actor_critic.get_dist_and_value(
# vec_obs, vis_obs, masks, memories, seq_len
# )
# action_list = self.actor_critic.sample_action(dists)
# log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
# action_list, dists
# )
# actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
# log_probs = ActionLogProbs.create(
# log_probs_list, self.behavior_spec.action_spec, all_logs_list
# )
# # Use the sum of entropy across actions, not the mean
# entropy_sum = torch.sum(entropies, dim=1)
# return (actions, log_probs, entropy_sum, memories)
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
# dists, value_heads, _ = self.actor_critic.get_dist_and_value(
# vec_obs, vis_obs, masks, memories, seq_len
# )
# action_list = actions.to_tensor_list()
# log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
# action_list, dists
# )
# log_probs = ActionLogProbs.create(
# log_probs_list, self.behavior_spec.action_spec
# )
## Use the sum of entropy across actions, not the mean
# entropy_sum = torch.sum(entropies, dim=1)
# return log_probs, entropy_sum, value_heads
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]

action, log_probs, entropy, value_heads, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
# Todo - make pre_action difference
run_out["pre_action"] = ModelUtils.to_numpy(action)
run_out["action"] = ModelUtils.to_numpy(action)
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["action"] = action.to_numpy_dict()
run_out["pre_action"] = (
action.to_numpy_dict()["continuous_action"]
if self.use_continuous_act
else None
) # Todo - make pre_action difference
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["value_heads"] = {
name: ModelUtils.to_numpy(t) for name, t in value_heads.items()

17
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


)
stats_needed.update(reward_signal.stats_name_to_update_name)
for tens, d in feed_dict.items():
print(tens, d)
update_vals = self._execute_model(feed_dict, self.update_dict)
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]

if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
feed_dict[self.policy.output] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.policy.output] = mini_batch["continuous_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch[
"prev_continuous_action"
]
else:
feed_dict[self.policy.output] = mini_batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch[
"prev_discrete_action"
]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
if "vector_obs" in mini_batch:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]

14
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class TorchPPOOptimizer(TorchOptimizer):

advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
#discrete_actions = ModelUtils.list_to_tensor(batch["actions"][self.policy.continuous_act_size:], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks

log_probs,
ModelUtils.list_to_tensor(batch["action_probs"]),
old_log_probs,
loss_masks,
)
loss = (

self.optimizer.step()
update_stats = {
"Losses/Policy Loss": policy_loss.item(),
# NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
# TODO: After PyTorch is default, change to something more correct.
"Losses/Policy Loss": torch.abs(policy_loss).item(),
"Losses/Value Loss": value_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,

40
ml-agents/mlagents/trainers/ppo/trainer.py


from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.tf.components.reward_signals import RewardSignal
from mlagents import torch_utils
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)
from mlagents import tf_utils
if torch_utils.is_available():
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
if tf_utils.is_available():
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
TorchPolicy = None # type: ignore
TorchPPOOptimizer = None # type: ignore
TFPolicy = None # type: ignore
PPOOptimizer = None # type: ignore
logger = get_logger(__name__)

for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
if isinstance(self.optimizer.reward_signals[name], RewardSignal):
if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
self.optimizer.reward_signals[name].value_name, np.mean(v)
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(v),
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(v),
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
# Evaluate all reward functions

for name, reward_signal in self.optimizer.reward_signals.items():
if isinstance(reward_signal, RewardSignal):
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
else:
# BaseRewardProvider is a PyTorch-based reward signal
if isinstance(reward_signal, BaseRewardProvider):
else: # reward_signal is a TensorFlow-based RewardSignal class
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=behavior_spec.action_spec.is_continuous(),
)
return policy

8
ml-agents/mlagents/trainers/sac/optimizer_tf.py


feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
if self.policy.use_continuous_act:
feed_dict[self.policy_network.external_action_in] = batch["actions"]
feed_dict[self.policy_network.external_action_in] = batch[
"continuous_action"
]
feed_dict[policy.output] = batch["actions"]
feed_dict[policy.output] = batch["discrete_action"]
feed_dict[policy.prev_action] = batch["prev_action"]
feed_dict[policy.prev_action] = batch["prev_discrete_action"]
feed_dict[policy.action_masks] = batch["action_mask"]
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = batch["vector_obs"]

71
ml-agents/mlagents/trainers/sac/optimizer_torch.py


from mlagents.torch_utils import torch, nn, default_device
from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import ActionType
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.timers import timed
from mlagents.trainers.exception import UnityTrainerException

stream_names: List[str],
observation_shapes: List[Tuple[int, ...]],
network_settings: NetworkSettings,
act_type: ActionType,
act_size: List[int],
action_spec: ActionSpec,
if act_type == ActionType.CONTINUOUS:
self.action_spec = action_spec
if self.action_spec.is_continuous():
self.act_size = self.action_spec.continuous_size
num_action_ins = sum(act_size)
num_action_ins = self.act_size
num_value_outs = sum(act_size)
self.act_size = self.action_spec.discrete_branches
num_value_outs = sum(self.act_size)
num_action_ins = 0
self.q1_network = ValueNetwork(
stream_names,

self.stream_names,
self.policy.behavior_spec.observation_shapes,
policy_network_settings,
self.policy.behavior_spec.action_type,
self.act_size,
self.policy.behavior_spec.action_spec,
)
self.target_network = ValueNetwork(

def sac_value_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
values: Dict[str, torch.Tensor],
q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],

if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)

for name in values.keys():
with torch.no_grad():
v_backup = min_policy_qs[name] - torch.sum(
_ent_coef * log_probs, dim=1
_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

branched_per_action_ent = ModelUtils.break_into_branches(
log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
# We have to do entropy bonus per action branch
branched_ent_bonus = torch.stack(

def sac_policy_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,

if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
log_probs * action_probs, self.act_size
log_probs.all_discrete_tensor * action_probs, self.act_size
)
branched_q_term = ModelUtils.break_into_branches(
mean_q1 * action_probs, self.act_size

for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)
]
],
dim=1,
policy_loss = torch.mean(loss_masks * batch_policy_loss)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1)
entropy_loss = -torch.mean(
self._log_ent_coef * loss_masks * target_current_diff
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
target_current_diff_branched = torch.stack(
[

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,
all_log_probs=not self.policy.use_continuous_act,
squeezed_actions = actions.squeeze(-1)
squeezed_actions = actions.continuous_tensor
sampled_actions,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,

memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions)
q2_stream = self._condense_q_streams(q2_out, actions)
q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(

61
ml-agents/mlagents/trainers/sac/trainer.py


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.tf.components.reward_signals import RewardSignal
from mlagents import torch_utils
from mlagents.trainers.torch.components.reward_providers import BaseRewardProvider
from mlagents import tf_utils
if torch_utils.is_available():
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
if tf_utils.is_available():
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
TorchPolicy = None # type: ignore
TorchSACOptimizer = None # type: ignore
TFPolicy = None # type: ignore
SACOptimizer = None # type: ignore
logger = get_logger(__name__)

self.seed = seed
self.policy: Policy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.optimizer: TorchSACOptimizer = None # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)

agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
if isinstance(reward_signal, RewardSignal):
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
else:
# BaseRewardProvider is a PyTorch-based reward signal
if isinstance(reward_signal, BaseRewardProvider):
else: # reward_signal uses TensorFlow
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

)
for name, v in value_estimates.items():
if isinstance(self.optimizer.reward_signals[name], RewardSignal):
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
else:
# BaseRewardProvider is a PyTorch-based reward signal
if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
)
else: # TensorFlow reward signal
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
# Bootstrap using the last step rather than the bootstrap step if max step is reached.

)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
if isinstance(signal, RewardSignal):
# BaseRewardProvider is a PyTorch-based reward signal
if isinstance(signal, BaseRewardProvider):
sampled_minibatch[f"{name}_rewards"] = (
signal.evaluate(sampled_minibatch) * signal.strength
)
else: # reward_signal is a TensorFlow-based RewardSignal class
else:
sampled_minibatch[f"{name}_rewards"] = (
signal.evaluate(sampled_minibatch) * signal.strength
)
update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
for stat_name, value in update_stats.items():

reward_signal_minibatches = {}
for name, signal in self.optimizer.reward_signals.items():
logger.debug(f"Updating {name} at step {self.step}")
if isinstance(signal, RewardSignal):
# BaseRewardProvider is a PyTorch-based reward signal
if not isinstance(signal, BaseRewardProvider):
# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(

else:
else: # TensorFlow reward signal
if name != "extrinsic":
reward_signal_minibatches[name] = buffer.sample_mini_batch(
self.hyperparameters.batch_size,

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
def create_sac_optimizer(self) -> SACOptimizer:
def create_sac_optimizer(self) -> TorchSACOptimizer:
if self.framework == FrameworkType.PYTORCH:
return TorchSACOptimizer( # type: ignore
cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore

2
ml-agents/mlagents/trainers/settings.py


threaded: bool = True
self_play: Optional[SelfPlaySettings] = None
behavioral_cloning: Optional[BehavioralCloningSettings] = None
framework: FrameworkType = FrameworkType.TENSORFLOW
framework: FrameworkType = FrameworkType.PYTORCH
cattr.register_structure_hook(
Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure
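Sketch (not part of the commit): with the default flipped to PyTorch, a behavior that still needs TensorFlow opts back in explicitly. TrainerSettings is assumed to be constructible with keyword overrides, as the test fixtures elsewhere in this diff construct it with no arguments.

```python
from mlagents.trainers.settings import TrainerSettings, FrameworkType

settings = TrainerSettings()
assert settings.framework == FrameworkType.PYTORCH   # new default

tf_settings = TrainerSettings(framework=FrameworkType.TENSORFLOW)
assert tf_settings.framework == FrameworkType.TENSORFLOW
```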

3
ml-agents/mlagents/trainers/simple_env_manager.py


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
self.env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_buffers_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.step()
all_step_result = self._generate_all_results()

86
ml-agents/mlagents/trainers/stats.py


from collections import defaultdict
from enum import Enum
from typing import List, Dict, NamedTuple, Any, Optional
from typing import List, Dict, NamedTuple, Any
import numpy as np
import abc
import os

from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
from mlagents.tf_utils import tf, generate_session_config
from torch.utils.tensorboard import SummaryWriter
def _dict_to_str(param_dict: Dict[str, Any], num_tabs: int) -> str:
"""
Takes a parameter dictionary and converts it to a human-readable string.
Recurses if there are multiple levels of dict. Used to print out hyperparameters.
:param param_dict: A Dictionary of key, value parameters.
:return: A string version of this dictionary.
"""
if not isinstance(param_dict, dict):
return str(param_dict)
else:
append_newline = "\n" if num_tabs > 0 else ""
return append_newline + "\n".join(
[
"\t"
+ " " * num_tabs
+ "{}:\t{}".format(x, _dict_to_str(param_dict[x], num_tabs + 1))
for x in param_dict
]
)
class StatsSummary(NamedTuple):

if property_type == StatsPropertyType.HYPERPARAMETERS:
logger.info(
"""Hyperparameters for behavior name {}: \n{}""".format(
category, self._dict_to_str(value, 0)
category, _dict_to_str(value, 0)
)
)
elif property_type == StatsPropertyType.SELF_PLAY:

def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""
Takes a parameter dictionary and converts it to a human-readable string.
Recurses if there are multiple levels of dict. Used to print out hyperparameters.
:param param_dict: A Dictionary of key, value parameters.
:return: A string version of this dictionary.
"""
if not isinstance(param_dict, dict):
return str(param_dict)
else:
append_newline = "\n" if num_tabs > 0 else ""
return append_newline + "\n".join(
[
"\t"
+ " " * num_tabs
+ "{}:\t{}".format(
x, self._dict_to_str(param_dict[x], num_tabs + 1)
)
for x in param_dict
]
)
class TensorboardWriter(StatsWriter):
def __init__(self, base_dir: str, clear_past_data: bool = False):

:param clear_past_data: Whether or not to clean up existing Tensorboard files associated with the base_dir and
category.
"""
self.summary_writers: Dict[str, tf.summary.FileWriter] = {}
self.summary_writers: Dict[str, SummaryWriter] = {}
self.base_dir: str = base_dir
self._clear_past_data = clear_past_data

self._maybe_create_summary_writer(category)
for key, value in values.items():
summary = tf.Summary()
summary.value.add(tag=f"{key}", simple_value=value.mean)
self.summary_writers[category].add_summary(summary, step)
self.summary_writers[category].add_scalar(f"{key}", value.mean, step)
self.summary_writers[category].flush()
def _maybe_create_summary_writer(self, category: str) -> None:

os.makedirs(filewriter_dir, exist_ok=True)
if self._clear_past_data:
self._delete_all_events_files(filewriter_dir)
self.summary_writers[category] = tf.summary.FileWriter(filewriter_dir)
self.summary_writers[category] = SummaryWriter(filewriter_dir)
def _delete_all_events_files(self, directory_name: str) -> None:
for file_name in os.listdir(directory_name):

) -> None:
if property_type == StatsPropertyType.HYPERPARAMETERS:
assert isinstance(value, dict)
summary = self._dict_to_tensorboard("Hyperparameters", value)
summary = _dict_to_str(value, 0)
self.summary_writers[category].add_summary(summary, 0)
def _dict_to_tensorboard(
self, name: str, input_dict: Dict[str, Any]
) -> Optional[bytes]:
"""
Convert a dict to a Tensorboard-encoded string.
:param name: The name of the text.
:param input_dict: A dictionary that will be displayed in a table on Tensorboard.
"""
try:
with tf.Session(config=generate_session_config()) as sess:
s_op = tf.summary.text(
name,
tf.convert_to_tensor(
[[str(x), str(input_dict[x])] for x in input_dict]
),
)
s = sess.run(s_op)
return s
except Exception:
logger.warning(
f"Could not write {name} summary for Tensorboard: {input_dict}"
)
return None
self.summary_writers[category].add_text("Hyperparameters", summary)
self.summary_writers[category].flush()
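Illustrative sketch (not part of the commit) of the torch.utils.tensorboard calls that replace tf.summary.FileWriter above; the directory, tags, and values are made up.

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("results/run_id/behavior_name")      # hypothetical path
writer.add_scalar("Environment/Cumulative Reward", 1.0, 10)  # tag, value, step
writer.add_text("Hyperparameters", "batch_size:\t1024")      # plain-text summary
writer.flush()
writer.close()
```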
class StatsReporter:

5
ml-agents/mlagents/trainers/subprocess_env_manager.py


all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_buffers_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

71
ml-agents/mlagents/trainers/tests/mock_brain.py


from typing import List, Tuple, Union
from collections.abc import Iterable
from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer

TerminalSteps,
BehaviorSpec,
ActionType,
ActionSpec,
)

action_shape: Union[int, Tuple[int]] = None,
discrete: bool = False,
action_spec: ActionSpec,
done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
"""

:bool discrete: Whether or not action space is discrete
:bool done: Whether all the agents in the batch are done
"""
if action_shape is None:
action_shape = 2
if discrete and isinstance(action_shape, Iterable):
if action_spec.is_discrete():
for action_size in action_shape # type: ignore
for action_size in action_spec.discrete_branches # type: ignore
behavior_spec = BehaviorSpec(
observation_shapes,
ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
action_shape,
)
behavior_spec = BehaviorSpec(observation_shapes, action_spec)
if done:
return (
DecisionSteps.empty(behavior_spec),

return create_mock_steps(
num_agents=num_agents,
observation_shapes=behavior_spec.observation_shapes,
action_shape=behavior_spec.action_shape,
discrete=behavior_spec.is_action_discrete(),
action_spec=behavior_spec.action_spec,
)

action_spec: ActionSpec,
action_space: Union[int, Tuple[int]] = 2,
is_discrete: bool = True,
) -> Trajectory:
"""
Makes a fake trajectory of length length. If max_step_complete,

action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

if is_discrete:
action_size = len(action_space) # type: ignore
action_probs = np.ones(np.sum(action_space), dtype=np.float32)
if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
action_size = int(action_space) # type: ignore
action_probs = np.ones((action_size), dtype=np.float32)
action = np.zeros(action_size, dtype=np.float32)
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
[[False for _ in range(branch)] for branch in action_space] # type: ignore
if is_discrete
[
[False for _ in range(branch)]
for branch in action_spec.discrete_branches
] # type: ignore
if action_spec.is_discrete()
prev_action = np.ones(action_size, dtype=np.float32)
if action_spec.is_continuous():
prev_action = {"continuous_action": np.ones(action_size, dtype=np.float32)}
else:
prev_action = {"discrete_action": np.ones(action_size, dtype=np.float32)}
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"

memory_size: int = 10,
exclude_key_list: List[str] = None,
) -> AgentBuffer:
action_space = behavior_spec.action_shape
is_discrete = behavior_spec.is_action_discrete()
action_space=action_space,
action_spec=behavior_spec.action_spec,
is_discrete=is_discrete,
)
buffer = trajectory.to_agentbuffer()
# If a key_list was given, remove those keys

def setup_test_behavior_specs(
use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
if use_discrete:
action_spec = ActionSpec.create_discrete(tuple(vector_action_space))
else:
action_spec = ActionSpec.create_continuous(vector_action_space)
[(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)],
ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS,
tuple(vector_action_space) if use_discrete else vector_action_space,
[(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)], action_spec
)
return behavior_spec
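Sketch (not part of the commit) of the two-argument BehaviorSpec construction these test helpers now rely on, using the ActionSpec factory methods seen throughout this diff; the observation shape and branch sizes are made up.

```python
from mlagents_envs.base_env import ActionSpec, BehaviorSpec

continuous_spec = ActionSpec.create_continuous(2)    # 2 continuous dimensions
discrete_spec = ActionSpec.create_discrete((3, 3))   # 2 branches of size 3
spec = BehaviorSpec([(8,)], discrete_spec)           # observation shapes + action spec

assert spec.action_spec.is_discrete()
assert spec.action_spec.discrete_branches == (3, 3)
```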

42
ml-agents/mlagents/trainers/tests/simple_test_envs.py


from mlagents_envs.base_env import (
ActionSpec,
ActionBuffers,
ActionType,
BehaviorMapping,
BehaviorName,
ActionBuffer,

self.num_vector = num_vector
self.vis_obs_size = vis_obs_size
self.vec_obs_size = vec_obs_size
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(), ActionSpec(0, tuple(2 for _ in range(action_size)))
action_spec = ActionSpec.create_discrete(
tuple(2 for _ in range(action_size))
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(), ActionSpec(action_size, tuple())
)
action_spec = ActionSpec.create_continuous(action_size)
self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}

def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
_act = self.action[name]
if _act.discrete is not None:
for _disc in _act.discrete[0]:
deltas.append(1 if _disc else -1)
if _act.continuous is not None:
for _cont in _act.continuous[0]:
deltas.append(_cont)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

# less than 1/step_size to force agent to use memory
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(),
ActionSpec(continuous_action_size, tuple(2 for _ in range(discrete_action_size))),
ActionSpec(
continuous_action_size, tuple(2 for _ in range(discrete_action_size))
),
)
self.continuous_action_size = continuous_action_size
self.discrete_action_size = discrete_action_size

def step(self) -> None:
super().step()
for name in self.names:
if self.discrete:
action = self.action[name].discrete
else:
action = self.action[name].continuous
self.step_result[name][0], self.step_result[name][1], self.action[name]
self.step_result[name][0], self.step_result[name][1], action
)
self.demonstration_protos[name] = self.demonstration_protos[name][
-self.n_demos :

for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
self.action[name] = ActionBuffers(
[[]], np.array([[1]] if self.goal[name] > 0 else [[0]])
)
self.action[name] = [[float(self.goal[name])]]
self.action[name] = ActionBuffers(
np.array([[float(self.goal[name])]]), [[]]
)
self.step()
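Sketch (not part of the commit) mirroring the demo-generation code just above: ActionBuffers takes the continuous part first and the discrete part second, with an empty placeholder for the unused side. ActionBuffers is a development-branch type introduced by this merge, so it is shown only as used here.

```python
import numpy as np
from mlagents_envs.base_env import ActionBuffers

discrete_act = ActionBuffers([[]], np.array([[1]]))       # discrete-only action
continuous_act = ActionBuffers(np.array([[0.5]]), [[]])   # continuous-only action

assert discrete_act.discrete[0][0] == 1
assert continuous_act.continuous[0][0] == 0.5
```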

46
ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py


np.testing.assert_array_equal(w, lw)
def test_resume(dummy_config, tmp_path):
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=1
)
behavior_id_team0 = "test_brain?team=0"
behavior_id_team1 = "test_brain?team=1"
brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
tmp_path = tmp_path.as_posix()
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
trainer.add_policy(parsed_behavior_id0, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
trainer.add_policy(parsed_behavior_id1, policy)
trainer.save_model()
# Make a new trainer, check that the policies are the same
ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
trainer2 = GhostTrainer(
ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
)
policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
trainer2.add_policy(parsed_behavior_id0, policy)
policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
trainer2.add_policy(parsed_behavior_id1, policy)
trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
weights = trainer1_policy.get_weights()
weights2 = trainer2_policy.get_weights()
for w, lw in zip(weights, weights2):
np.testing.assert_array_equal(w, lw)
def test_process_trajectory(dummy_config):
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=1

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=mock_specs.action_spec,
)
trajectory_queue0.put(trajectory)
trainer.advance()

5
ml-agents/mlagents/trainers/tests/tensorflow/test_models.py


from mlagents.trainers.tf.models import ModelUtils
from mlagents.tf_utils import tf
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
ActionType.DISCRETE,
(1,),
ActionSpec.create_discrete((1,)),
)
return behavior_spec

8
ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py


length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
for i in range(time_horizon):
trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
for i in range(time_horizon):
trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
# Change half of the obs to 0
for i in range(3):

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
trajectory_buffer = trajectory.to_agentbuffer()
policy.update_normalization(trajectory_buffer["vector_obs"])

12
ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py


ppo_dummy_config,
)
from mlagents_envs.base_env import ActionSpec
@pytest.fixture
def dummy_config():

DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12
CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):

length=time_horizon,
observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
max_step_complete=True,
action_space=DISCRETE_ACTION_SPACE if discrete else VECTOR_ACTION_SPACE,
is_discrete=discrete,
action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
)
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False

length=time_horizon,
observation_shapes=behavior_spec.observation_shapes,
max_step_complete=True,
action_space=[2],
action_spec=behavior_spec.action_spec,
)
trajectory_queue.put(trajectory)
trainer.advance()

length=time_horizon + 1,
max_step_complete=False,
observation_shapes=behavior_spec.observation_shapes,
action_space=[2],
action_spec=behavior_spec.action_spec,
)
trajectory_queue.put(trajectory)
trainer.advance()

9
ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py


length=15,
observation_shapes=specs.observation_shapes,
max_step_complete=True,
action_space=2,
is_discrete=False,
action_spec=specs.action_spec,
)
trajectory_queue.put(trajectory)
trainer.advance()

length=6,
observation_shapes=specs.observation_shapes,
max_step_complete=False,
action_space=2,
is_discrete=False,
action_spec=specs.action_spec,
)
trajectory_queue.put(trajectory)
trainer.advance()

trajectory = make_fake_trajectory(
length=5,
observation_shapes=specs.observation_shapes,
action_spec=specs.action_spec,
action_space=2,
is_discrete=False,
)
trajectory_queue.put(trajectory)
trainer.advance()

4
ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py


length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
# Change half of the obs to 0
for i in range(3):

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=behavior_spec.action_spec,
)
trajectory_buffer = trajectory.to_agentbuffer()
policy1.update_normalization(trajectory_buffer["vector_obs"])

8
ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py


PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=500,
max_steps=300,
summary_freq=100,
framework=FrameworkType.TENSORFLOW,
)

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.5 if use_discrete else 0.2
step_size = 0.2 if use_discrete else 0.5
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
SAC_TF_CONFIG.hyperparameters,

SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
max_steps=4000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})

27
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py


from unittest.mock import MagicMock
from mlagents.trainers.settings import TrainerSettings
import numpy as np
from mlagents_envs.base_env import ActionSpec
def basic_mock_brain():
mock_brain = MagicMock()
mock_brain.vector_action_space_type = "continuous"
mock_brain.vector_observation_space_size = 1
mock_brain.vector_action_space_size = [1]
mock_brain.brain_name = "MockBrain"
return mock_brain
def basic_behavior_spec():
dummy_actionspec = ActionSpec.create_continuous(1)
dummy_groupspec = BehaviorSpec([(1,)], dummy_actionspec)
return dummy_groupspec
class FakePolicy(TFPolicy):

def test_take_action_returns_empty_with_no_agents():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
# Doesn't really matter what this is
dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
no_agent_step = DecisionSteps.empty(dummy_groupspec)
behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
no_agent_step = DecisionSteps.empty(behavior_spec)
result = policy.get_action(no_agent_step)
assert result == ActionInfo.empty()

policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
step_with_agents = DecisionSteps(

def test_take_action_returns_action_info_when_available():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings(), "output")
behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
"action": np.array([1.0], dtype=np.float32),
"action": {"continuous_action": np.array([1.0], dtype=np.float32)},
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

47
ml-agents/mlagents/trainers/tests/test_agent_processor.py


from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.base_env import ActionSpec
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
mock_policy.retrieve_previous_action.return_value = {
"prev_continuous_action": np.zeros((1, 1), dtype=np.float32)
}
return mock_policy

)
fake_action_outputs = {
"action": [0.1, 0.1],
"action": {"continuous_action": [0.1, 0.1]},
"log_probs": [0.1, 0.1],
"log_probs": {"continuous_log_probs": [0.1, 0.1]},
action_shape=2,
action_spec=ActionSpec.create_continuous(2),
action=[0.1, 0.1],
action={"continuous_action": [0.1, 0.1]},
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=0,
observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
action_shape=2,
action_spec=ActionSpec.create_continuous(2),
)
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo([], [], {}, [])

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
num_agents=1, observation_shapes=[(8,)], action_shape=2
num_agents=1,
observation_shapes=[(8,)],
action_spec=ActionSpec.create_continuous(2),
num_agents=1, observation_shapes=[(8,)], action_shape=2, done=True
num_agents=1,
observation_shapes=[(8,)],
action_spec=ActionSpec.create_continuous(2),
done=True,
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], {"continuous_action": [0.1]})
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
)

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
num_agents=1, observation_shapes=[(8,)], action_shape=2
num_agents=1,
observation_shapes=[(8,)],
action_spec=ActionSpec.create_continuous(2),
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

10
ml-agents/mlagents/trainers/tests/test_demo_loader.py


assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_load_demo_dir():

assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_demo_mismatch():

6
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.settings import TrainerSettings, FrameworkType
from mlagents_envs.base_env import ActionSpec
# Add concrete implementations of abstract methods
class FakeTrainer(RLTrainer):

length=time_horizon,
observation_shapes=[(1,)],
max_step_complete=True,
action_space=[2],
action_spec=ActionSpec.create_discrete((2,)),
)
trajectory_queue.put(trajectory)

length=time_horizon,
observation_shapes=[(1,)],
max_step_complete=True,
action_space=[2],
action_spec=ActionSpec.create_discrete((2,)),
)
# Check that we can turn off the trainer and that the buffer is cleared
num_trajectories = 5

20
ml-agents/mlagents/trainers/tests/test_stats.py


)
@mock.patch("mlagents.tf_utils.tf.Summary")
@mock.patch("mlagents.tf_utils.tf.summary.FileWriter")
def test_tensorboard_writer(mock_filewriter, mock_summary):
@mock.patch("mlagents.trainers.stats.SummaryWriter")
def test_tensorboard_writer(mock_summary):
# Test write_stats
category = "category1"
with tempfile.TemporaryDirectory(prefix="unittest-") as base_dir:

basedir=base_dir, category=category
)
assert os.path.exists(filewriter_dir)
mock_filewriter.assert_called_once_with(filewriter_dir)
mock_summary.assert_called_once_with(filewriter_dir)
mock_summary.return_value.value.add.assert_called_once_with(
tag="key1", simple_value=1.0
)
mock_filewriter.return_value.add_summary.assert_called_once_with(
mock_summary.return_value, 10
)
mock_filewriter.return_value.flush.assert_called_once()
mock_summary.return_value.add_scalar.assert_called_once_with("key1", 1.0, 10)
mock_summary.return_value.flush.assert_called_once()
assert mock_filewriter.return_value.add_summary.call_count > 1
assert mock_summary.return_value.add_text.call_count >= 1
def test_tensorboard_writer_clear(tmp_path):

},
10,
)
# Test hyperparameter writing - no good way to parse the TB string though.
# Test hyperparameter writing
console_writer.add_property(
"category1", StatsPropertyType.HYPERPARAMETERS, {"example": 1.0}
)

8
ml-agents/mlagents/trainers/tests/test_trajectory.py


from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents_envs.base_env import ActionSpec
VEC_OBS_SIZE = 6
ACTION_SIZE = 4

"masks",
"done",
"actions_pre",
"actions",
"continuous_action",
"prev_action",
"prev_continuous_action",
"environment_rewards",
]
wanted_keys = set(wanted_keys)

action_space=[ACTION_SIZE],
action_spec=ActionSpec.create_continuous(ACTION_SIZE),
)
agentbuffer = trajectory.to_agentbuffer()
seen_keys = set()

9
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py


with torch.no_grad():
_, log_probs1, _, _ = policy1.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
np.testing.assert_array_equal(log_probs1, log_probs2)
np.testing.assert_array_equal(
log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

46
ml-agents/mlagents/trainers/tests/torch/test_ghost.py


np.testing.assert_array_equal(w, lw)
def test_resume(dummy_config, tmp_path):
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=1
)
behavior_id_team0 = "test_brain?team=0"
behavior_id_team1 = "test_brain?team=1"
brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
tmp_path = tmp_path.as_posix()
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
trainer.add_policy(parsed_behavior_id0, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
trainer.add_policy(parsed_behavior_id1, policy)
trainer.save_model()
# Make a new trainer, check that the policies are the same
ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
trainer2 = GhostTrainer(
ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
)
policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
trainer2.add_policy(parsed_behavior_id0, policy)
policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
trainer2.add_policy(parsed_behavior_id1, policy)
trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
weights = trainer1_policy.get_weights()
weights2 = trainer2_policy.get_weights()
for w, lw in zip(weights, weights2):
np.testing.assert_array_equal(w, lw)
def test_process_trajectory(dummy_config):
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=1

length=time_horizon,
max_step_complete=True,
observation_shapes=[(1,)],
action_space=[2],
action_spec=mock_specs.action_spec,
)
trajectory_queue0.put(trajectory)
trainer.advance()

41
ml-agents/mlagents/trainers/tests/torch/test_networks.py


SeparateActorCritic,
)
from mlagents.trainers.settings import NetworkSettings
from mlagents_envs.base_env import ActionType
from mlagents_envs.base_env import ActionSpec
def test_networkbody_vector():

assert _out[0] == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("action_type", [ActionType.DISCRETE, ActionType.CONTINUOUS])
def test_simple_actor(action_type):
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_actor(use_discrete):
masks = None if action_type == ActionType.CONTINUOUS else torch.ones((1, 1))
actor = SimpleActor(obs_shapes, network_settings, action_type, act_size)
if use_discrete:
masks = torch.ones((1, 1))
action_spec = ActionSpec.create_discrete(tuple(act_size))
else:
masks = None
action_spec = ActionSpec.create_continuous(act_size[0])
actor = SimpleActor(obs_shapes, network_settings, action_spec)
if action_type == ActionType.CONTINUOUS:
assert isinstance(dist, GaussianDistInstance)
if use_discrete:
assert isinstance(dist, CategoricalDistInstance)
assert isinstance(dist, CategoricalDistInstance)
assert isinstance(dist, GaussianDistInstance)
if action_type == ActionType.CONTINUOUS:
assert act.shape == (1, act_size[0])
if use_discrete:
assert act.shape == (1, 1)
assert act.shape == (1, 1)
assert act.shape == (1, act_size[0])
# Test forward
actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(

assert act.shape == tuple(act_size)
# This is different from above for ONNX export
if use_discrete:
assert act.shape == tuple(act_size)
else:
assert act.shape == (act_size[0], 1)
assert is_cont == int(action_type == ActionType.CONTINUOUS)
assert is_cont == int(not use_discrete)
assert act_size_vec == torch.tensor(act_size)

obs_shapes = [(obs_size,)]
act_size = [2]
stream_names = [f"stream_name{n}" for n in range(4)]
actor = ac_type(
obs_shapes, network_settings, ActionType.CONTINUOUS, act_size, stream_names
)
action_spec = ActionSpec.create_continuous(act_size[0])
actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
memories = torch.ones(

40
ml-agents/mlagents/trainers/tests/torch/test_policy.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
run_out["action"]["discrete_action"].shape == (
NUM_AGENTS,
len(DISCRETE_ACTION_SPACE),
)
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
assert run_out["action"]["continuous_action"].shape == (
NUM_AGENTS,
VECTOR_ACTION_SPACE,
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
if policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long)
agent_action = AgentAction.from_dict(buffer)
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])

vec_obs,
vis_obs,
masks=act_masks,
actions=actions,
actions=agent_action,
assert log_probs.shape == (64, policy.behavior_spec.action_size)
assert entropy.shape == (64, policy.behavior_spec.action_size)
if discrete:
_size = policy.behavior_spec.action_spec.discrete_size
else:
_size = policy.behavior_spec.action_spec.continuous_size
assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)

masks=act_masks,
memories=memories,
seq_len=policy.sequence_length,
all_log_probs=not policy.use_continuous_act,
assert log_probs.shape == (
assert log_probs.all_discrete_tensor.shape == (
sum(policy.behavior_spec.discrete_action_branches),
sum(policy.behavior_spec.action_spec.discrete_branches),
assert log_probs.shape == (64, policy.behavior_spec.action_shape)
assert entropies.shape == (64, policy.behavior_spec.action_size)
assert log_probs.continuous_tensor.shape == (
64,
policy.behavior_spec.action_spec.continuous_size,
)
assert entropies.shape == (64,)
if rnn:
assert memories.shape == (1, 1, policy.m_size)

33
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


gail_dummy_config,
)
from mlagents_envs.base_env import ActionSpec
@pytest.fixture
def dummy_config():

DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12
CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as one log probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
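A quick sketch of what the NOTE above means for buffer shapes, using the DISCRETE_ACTION_SPACE defined earlier in this test file (the variable names below are only illustrative):

import numpy as np

DISCRETE_ACTION_SPACE = [3, 3, 3, 2]  # branch sizes used by these PPO tests
batch_size = 64
# TensorFlow stores one log prob for every possible discrete action across all branches:
tf_log_probs = np.ones((batch_size, sum(DISCRETE_ACTION_SPACE)))      # shape (64, 11)
# PyTorch stores one log prob per branch (that of the chosen action):
torch_log_probs = np.ones((batch_size, len(DISCRETE_ACTION_SPACE)))   # shape (64, 4)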

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as one log probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as one log probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

trajectory = make_fake_trajectory(
length=time_horizon,
observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
action_space=DISCRETE_ACTION_SPACE if discrete else VECTOR_ACTION_SPACE,
is_discrete=discrete,
)
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False

34
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py


CuriosityRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import CuriositySettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,

SEED = [42]
ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((2,))
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,), (64, 66, 1)], ACTIONSPEC_TWODISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None:

@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5)]
"behavior_spec", [BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS)]
)
def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:
np.random.seed(seed)

for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["actions"][0])
target = torch.tensor(buffer["continuous_action"][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

"behavior_spec",
[
BehaviorSpec([(10,), (64, 66, 3)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:

18
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py


ExtrinsicRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
],
)
def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:

29
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py


GAILRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,

)
CONTINUOUS_PATH = (
os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)

)
SEED = [42]
ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2)
ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20,))
@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)]
)
@pytest.mark.parametrize("behavior_spec", [BehaviorSpec([(8,)], ACTIONSPEC_CONTINUOUS)])
def test_construction(behavior_spec: BehaviorSpec) -> None:
gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
gail_rp = GAILRewardProvider(behavior_spec, gail_settings)

@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)]
)
@pytest.mark.parametrize("behavior_spec", [BehaviorSpec([(8,)], ACTIONSPEC_CONTINUOUS)])
def test_factory(behavior_spec: BehaviorSpec) -> None:
gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
gail_rp = create_reward_provider(

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(8,), (24, 26, 1)], ActionType.CONTINUOUS, 2),
BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)),
BehaviorSpec([(8,), (24, 26, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(50,)], ACTIONSPEC_FOURDISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
@pytest.mark.parametrize("use_actions", [False, True])

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3, 3, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)),
BehaviorSpec([(8,), (24, 26, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(50,)], ACTIONSPEC_FOURDISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
@pytest.mark.parametrize("use_actions", [False, True])

RewardSignalType.GAIL, behavior_spec, gail_settings
)
for _ in range(200):
for _ in range(300):
gail_rp.update(buffer_policy)
reward_expert = gail_rp.evaluate(buffer_expert)[0]
reward_policy = gail_rp.evaluate(buffer_policy)[0]

25
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_rnd.py


RNDRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((2,))
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
BehaviorSpec([(10,)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,), (64, 66, 1)], ACTIONSPEC_TWODISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:

@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ACTIONSPEC_CONTINUOUS),
BehaviorSpec([(10,)], ACTIONSPEC_TWODISCRETE),
BehaviorSpec([(10,)], ACTIONSPEC_DISCRETE),
],
)
def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None:

17
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


) -> AgentBuffer:
buffer = AgentBuffer()
curr_observations = [
np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
action = behavior_spec.create_random_action(1)[0, :]
action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if action_buffer.continuous is not None:
action["continuous_action"] = action_buffer.continuous
if action_buffer.discrete is not None:
action["discrete_action"] = action_buffer.discrete
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act[0, :])
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)

6
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py


PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=700,
max_steps=900,
summary_freq=100,
)
# The number of steps is pretty small for these encoders

)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
batch_size=128,
batch_size=256,
learning_rate=1e-3,
buffer_init_steps=1000,
steps_per_update=2,

hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
max_steps=2000,
)
check_environment_trains(env, {BRAIN_NAME: config})

19
ml-agents/mlagents/trainers/tests/torch/test_utils.py


def test_list_to_tensor():
# Test converting pure list
unconverted_list = [[1, 2], [1, 3], [1, 4]]
unconverted_list = [[1.0, 2], [1, 3], [1, 4]]
tensor = ModelUtils.list_to_tensor(unconverted_list)
# Should be equivalent to torch.tensor conversion
assert torch.equal(tensor, torch.tensor(unconverted_list))

list_of_np = [np.asarray(_el) for _el in unconverted_list]
tensor = ModelUtils.list_to_tensor(list_of_np)
# Should be equivalent to torch.tensor conversion
assert torch.equal(tensor, torch.tensor(unconverted_list))
assert torch.equal(tensor, torch.tensor(unconverted_list, dtype=torch.float32))
def test_break_into_branches():

log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert log_probs.shape == (1, 2, 2)
for lp in log_probs:
assert lp.shape == (1, 2)
assert all_probs is None
assert all_probs == []
for log_prob in log_probs.flatten():
for log_prob in log_probs:
assert log_prob == pytest.approx(-0.919, abs=0.01)
for lp in log_prob.flatten():
assert lp == pytest.approx(-0.919, abs=0.01)
for ent in entropies.flatten():
# entropy of standard normal at 0
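For reference, the -0.919 expected in the assertions above is the log density of a standard normal evaluated at its mean, and the comment refers to the entropy of a standard normal (about 1.42); both constants can be checked directly:

import math

log_prob_at_mean = -0.5 * math.log(2 * math.pi)            # ≈ -0.9189, the -0.919 used above
entropy_std_normal = 0.5 * math.log(2 * math.pi * math.e)  # ≈ 1.4189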

log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert all_probs.shape == (1, len(dist_list * act_size))
for all_prob in all_probs:
assert all_prob.shape == (1, act_size)
assert log_probs.flatten()[0] > log_probs.flatten()[1]
assert log_probs[0] > log_probs[1]
def test_masked_mean():

2
ml-agents/mlagents/trainers/tf/components/bc/model.py


self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
if self.policy.behavior_spec.is_action_continuous():
if self.policy.behavior_spec.action_spec.is_continuous():
action_length = self.policy.act_size[0]
self.action_in_expert = tf.placeholder(
shape=[None, action_length], dtype=tf.float32

10
ml-agents/mlagents/trainers/tf/components/bc/module.py


self.policy.batch_size_ph: n_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
if self.policy.behavior_spec.is_action_discrete():
if self.policy.behavior_spec.action_spec.is_discrete():
feed_dict[self.model.action_in_expert] = mini_batch_demo["discrete_action"]
sum(self.policy.behavior_spec.discrete_action_branches),
sum(self.policy.behavior_spec.action_spec.discrete_branches),
else:
feed_dict[self.model.action_in_expert] = mini_batch_demo[
"continuous_action"
]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
for i, _ in enumerate(self.policy.visual_in):

2
ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/model.py


"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.policy.behavior_spec.is_action_continuous():
if self.policy.behavior_spec.action_spec.is_continuous():
pred_action = tf.layers.dense(
hidden, self.policy.act_size[0], activation=None
)

10
ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.policy.use_vec_obs:

feed_dict[self.model.next_visual_in[i]] = _next_obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

policy.mask_input: mini_batch["masks"],
}
if self.policy.use_continuous_act:
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]

2
ml-agents/mlagents/trainers/tf/components/reward_signals/gail/model.py


self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
if self.policy.behavior_spec.is_action_continuous():
if self.policy.behavior_spec.action_spec.is_continuous():
action_length = self.policy.act_size[0]
self.action_in_expert = tf.placeholder(
shape=[None, action_length], dtype=tf.float32

17
ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.model.use_vail:

feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.done_policy_holder] = np.array(
mini_batch["done"]
).flatten()

if self.model.use_vail:
feed_dict[self.model.use_noise] = [1]
feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["continuous_action"]
)
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["discrete_action"]
)
if self.policy.use_vis_obs > 0:
for i in range(len(policy.visual_in)):

4
ml-agents/mlagents/trainers/tf/model_serialization.py


]
)
POSSIBLE_OUTPUT_NODES = frozenset(
["action", "action_probs", "recurrent_out", "value_estimate"]
)
POSSIBLE_OUTPUT_NODES = frozenset(["action", "recurrent_out", "value_estimate"])
MODEL_CONSTANTS = frozenset(
[

29
ml-agents/mlagents/trainers/torch/components/bc/module.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class BCModule:

update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
return update_stats
def _behavioral_cloning_loss(self, selected_actions, log_probs, expert_actions):
def _behavioral_cloning_loss(
self,
selected_actions: AgentAction,
log_probs: ActionLogProbs,
expert_actions: torch.Tensor,
) -> torch.Tensor:
bc_loss = torch.nn.functional.mse_loss(selected_actions, expert_actions)
bc_loss = torch.nn.functional.mse_loss(
selected_actions.continuous_tensor, expert_actions
)
log_probs, self.policy.act_size
log_probs.all_discrete_tensor,
self.policy.behavior_spec.action_spec.discrete_branches,
)
bc_loss = torch.mean(
torch.stack(

vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
expert_actions = ModelUtils.list_to_tensor(
mini_batch_demo["continuous_action"]
)
mini_batch_demo["actions"], dtype=torch.long
mini_batch_demo["discrete_action"], dtype=torch.long
)
expert_actions = ModelUtils.actions_to_onehot(
raw_expert_actions, self.policy.act_size

(
self.n_sequences * self.policy.sequence_length,
sum(self.policy.behavior_spec.discrete_action_branches),
sum(self.policy.behavior_spec.action_spec.discrete_branches),
),
dtype=np.float32,
)

else:
vis_obs = []
selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
selected_actions, log_probs, _, _ = self.policy.sample_actions(
all_log_probs=True,
selected_actions, all_log_probs, expert_actions
selected_actions, log_probs, expert_actions
)
self.optimizer.zero_grad()
bc_loss.backward()

29
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import LinearEncoder, linear_layer
from mlagents.trainers.settings import NetworkSettings, EncoderType

def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
super().__init__()
self._policy_specs = specs
self._action_spec = specs.action_spec
state_encoder_settings = NetworkSettings(
normalize=False,
hidden_units=settings.encoding_size,

specs.observation_shapes, state_encoder_settings
)
self._action_flattener = ModelUtils.ActionFlattener(specs)
self._action_flattener = ModelUtils.ActionFlattener(self._action_spec)
self.inverse_model_action_prediction = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256),

(self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1
)
hidden = self.inverse_model_action_prediction(inverse_model_input)
if self._policy_specs.is_action_continuous():
if self._action_spec.is_continuous():
hidden, self._policy_specs.discrete_action_branches
hidden, self._action_spec.discrete_branches
)
branches = [torch.softmax(b, dim=1) for b in branches]
return torch.cat(branches, dim=1)

Uses the current state embedding and the action of the mini_batch to predict
the next state embedding.
"""
if self._policy_specs.is_action_continuous():
action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
actions = AgentAction.from_dict(mini_batch)
if self._action_spec.is_continuous():
action = actions.continuous_tensor
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._policy_specs.discrete_action_branches,
actions.discrete_tensor, self._action_spec.discrete_branches
),
dim=1,
)

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
if self._policy_specs.is_action_continuous():
sq_difference = (
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
- predicted_action
) ** 2
actions = AgentAction.from_dict(mini_batch)
if self._action_spec.is_continuous():
sq_difference = (actions.continuous_tensor - predicted_action) ** 2
sq_difference = torch.sum(sq_difference, dim=1)
return torch.mean(
ModelUtils.dynamic_partition(

else:
true_action = torch.cat(
ModelUtils.actions_to_onehot(
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._policy_specs.discrete_action_branches,
actions.discrete_tensor, self._action_spec.discrete_branches
),
dim=1,
)
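For the continuous branch of the inverse-model loss above, the error is just a per-agent squared difference between the action that was taken and the predicted one; a minimal illustration in plain torch (the tensors below are made up):

import torch

true_action = torch.tensor([[0.2, -0.1]])
predicted_action = torch.tensor([[0.0, 0.0]])
sq_difference = torch.sum((true_action - predicted_action) ** 2, dim=1)  # tensor([0.0500])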

9
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


)
from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType

def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
super().__init__()
self._policy_specs = specs
self._use_vail = settings.use_vail
self._settings = settings

vis_encode_type=EncoderType.SIMPLE,
memory=None,
)
self._action_flattener = ModelUtils.ActionFlattener(specs)
self._action_flattener = ModelUtils.ActionFlattener(specs.action_spec)
unencoded_size = (
self._action_flattener.flattened_size + 1 if settings.use_actions else 0
) # +1 is for dones

Creates the action Tensor. In continuous case, corresponds to the action. In
the discrete case, corresponds to the concatenation of one hot action Tensors.
"""
return self._action_flattener.forward(
torch.as_tensor(mini_batch["actions"], dtype=torch.float)
)
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
def get_state_inputs(
self, mini_batch: AgentBuffer

1
ml-agents/mlagents/trainers/torch/components/reward_providers/rnd_reward_provider.py


def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None:
super().__init__()
self._policy_specs = specs
state_encoder_settings = NetworkSettings(
normalize=True,
hidden_units=settings.encoding_size,

6
ml-agents/mlagents/trainers/torch/distributions.py


return action[:, 0, :].type(torch.float)
class GaussianDistribution(nn.Module):
def __init__(
self,

if self.conditional_sigma:
log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
else:
log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
# Expand so that entropy matches batch size. Note that we're using
# torch.cat here instead of torch.expand() because it is not supported in the
# verified version of Barracuda (1.0.2).
log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
if self.tanh_squash:
return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
else:

4
ml-agents/mlagents/trainers/torch/model_serialization.py


for shape in self.policy.behavior_spec.observation_shapes
if len(shape) == 3
]
dummy_masks = torch.ones(batch_dim + [sum(self.policy.actor_critic.discrete_act_branches)])
dummy_masks = torch.ones(
batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
)
dummy_memories = torch.zeros(
batch_dim + seq_len_dim + [self.policy.export_memory_size]
)

82
ml-agents/mlagents/trainers/torch/networks.py


from mlagents.torch_utils import torch, nn
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.distributions import DistInstance
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.layers import LSTM, LinearEncoder
from mlagents.trainers.torch.model_serialization import exporting_to_onnx

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor
]:
"""
Returns distributions, from which actions can be sampled, and value estimates.
If memory is enabled, return the memories as well.

self,
observation_shapes: List[Tuple[int, ...]],
network_settings: NetworkSettings,
action_spec: ActionSpec,
action_spec: ActionSpec,
self.discrete_act_size = action_spec.discrete_action_size
self.discrete_act_branches = action_spec.discrete_action_branches
self.continuous_act_size = action_spec.continuous_action_size
self.action_spec = action_spec
self.is_continuous_int = torch.nn.Parameter(
torch.Tensor([int(self.action_spec.is_continuous())])
)
torch.Tensor(action_spec.action_size)
)
self.is_continuous_int = torch.nn.Parameter(
torch.Tensor([int(self.continuous_act_size > 0)])
torch.Tensor(
[
self.action_spec.continuous_size
+ sum(self.action_spec.discrete_branches)
]
),
requires_grad=False,
)
self.network_body = NetworkBody(observation_shapes, network_settings)
if network_settings.memory is not None:

self,
observation_shapes: List[Tuple[int, ...]],
network_settings: NetworkSettings,
action_spec: ActionSpec,
action_spec: ActionSpec,
stream_names: List[str],
conditional_sigma: bool = False,
tanh_squash: bool = False,

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length

self,
observation_shapes: List[Tuple[int, ...]],
network_settings: NetworkSettings,
action_spec: ActionSpec,
action_spec: ActionSpec,
stream_names: List[str],
conditional_sigma: bool = False,
tanh_squash: bool = False,

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)

)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)
return log_probs, entropies, value_outputs

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)

)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)

def __init__(self):
super().__init__()
self.__global_step = nn.Parameter(torch.Tensor([0]), requires_grad=False)
@property
def current_step(self):
return int(self.__global_step.item())
@current_step.setter
def current_step(self, value):
self.__global_step[:] = value
def increment(self, value):
self.__global_step += value
def __init__(self, lr):
# Todo: add learning rate decay
super().__init__()
self.learning_rate = torch.Tensor([lr])

231
ml-agents/mlagents/trainers/torch/utils.py


from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, NamedTuple, Dict
from mlagents.torch_utils import torch, nn
import numpy as np

)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.base_env import ActionSpec
class AgentAction(NamedTuple):
"""
A NamedTuple containing the tensor for continuous actions and list of tensors for
discrete actions. Utility functions provide numpy <=> tensor conversions to be
sent as actions to the environment manager as well as used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to continuous actions
:param discrete_list: List of Torch tensors each corresponding to discrete actions
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
@property
def discrete_tensor(self):
"""
Returns the discrete action list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous action
and an entry corresponding to the discrete action. "continuous_action" and
"discrete_action" are added to the agents buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_action"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_action"] = ModelUtils.to_numpy(
self.discrete_tensor[:, 0, :]
)
return array_dict
def to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the AgentAction as a flat List of torch Tensors. This will be removed
when the ActionModel is merged.
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list += (
self.discrete_list
) # Note this is different for ActionLogProbs
return tensor_list
@staticmethod
def create(
tensor_list: List[torch.Tensor], action_spec: ActionSpec
) -> "AgentAction":
"""
A static method that converts a list of torch Tensors into an AgentAction using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = tensor_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = tensor_list[_offset:]
return AgentAction(continuous, discrete)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff["discrete_action"], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return AgentAction(continuous, discrete)
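A minimal sketch of how these helpers round-trip buffer data, assuming the mlagents packages from this branch are importable (the buffer contents below are made up):

import numpy as np
from mlagents.trainers.torch.utils import AgentAction

buff = {"continuous_action": np.zeros((5, 2), dtype=np.float32)}  # 5 agents, 2 continuous actions
agent_action = AgentAction.from_dict(buff)   # continuous tensor is set, discrete_list stays None
arrays = agent_action.to_numpy_dict()        # {"continuous_action": array of shape (5, 2)}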
class ActionLogProbs(NamedTuple):
"""
A NamedTuple containing the tensor for continuous log probs and list of tensors for
discrete log probs of individual actions as well as all the log probs for an entire branch.
Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
:param discrete_list: List of Torch tensors each corresponding to log probs of discrete actions
:param all_discrete_list: List of Torch tensors each corresponding to all log probs of
a discrete action branch
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
all_discrete_list: Optional[List[torch.Tensor]]
@property
def discrete_tensor(self):
"""
Returns the discrete log probs list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
@property
def all_discrete_tensor(self):
"""
Returns the discrete log probs of each branch as a tensor
"""
return torch.cat(self.all_discrete_list, dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous log probs
and an entry corresponding to the discrete log probs. "continuous_log_probs" and
"discrete_log_probs" are added to the agents buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
return array_dict
def _to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This
is private and serves as a utility for self.flatten()
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(
self.discrete_tensor
) # Note this is different for AgentActions
return tensor_list
def flatten(self) -> torch.Tensor:
"""
A utility method that returns all log probs in ActionLogProbs as a flattened tensor.
This is useful for algorithms like PPO which can treat all log probs in the same way.
"""
return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def create(
log_prob_list: List[torch.Tensor],
action_spec: ActionSpec,
all_log_prob_list: List[torch.Tensor] = None,
) -> "ActionLogProbs":
"""
A static method that converts a list of torch Tensors into an ActionLogProbs using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = log_prob_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = log_prob_list[_offset:]
return ActionLogProbs(continuous, discrete, all_log_prob_list)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
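A similar sketch for ActionLogProbs, showing how flatten() concatenates continuous and per-branch discrete log probs into the single tensor PPO consumes (shapes are illustrative):

import torch
from mlagents.trainers.torch.utils import ActionLogProbs

log_probs = ActionLogProbs(
    continuous_tensor=torch.zeros(4, 3),             # 4 agents, 3 continuous actions
    discrete_list=[torch.zeros(4), torch.zeros(4)],  # chosen-action log prob per discrete branch
    all_discrete_list=None,
)
flat = log_probs.flatten()   # shape (4, 5): 3 continuous + 2 discrete branches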
class ModelUtils:
# Minimum supported side for each encoder type. If refactoring an encoder, please
# adjust these also.

}
class ActionFlattener:
def __init__(self, behavior_spec: BehaviorSpec):
self._specs = behavior_spec
def __init__(self, action_spec: ActionSpec):
self._specs = action_spec
if self._specs.is_action_continuous():
return self._specs.action_size
if self._specs.is_continuous():
return self._specs.continuous_size
return sum(self._specs.discrete_action_branches)
return sum(self._specs.discrete_branches)
def forward(self, action: torch.Tensor) -> torch.Tensor:
if self._specs.is_action_continuous():
return action
def forward(self, action: AgentAction) -> torch.Tensor:
if self._specs.is_continuous():
return action.continuous_tensor
torch.as_tensor(action, dtype=torch.long),
self._specs.discrete_action_branches,
torch.as_tensor(action.discrete_tensor, dtype=torch.long),
self._specs.discrete_branches,
),
dim=1,
)
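The discrete path of ActionFlattener.forward above amounts to concatenating one-hot encodings of each branch; a standalone illustration in plain torch (not the class itself):

import torch
import torch.nn.functional as F

branches = (2, 3)                # two discrete branches with 2 and 3 choices
chosen = torch.tensor([[1, 2]])  # one agent chose action 1 in branch 0 and action 2 in branch 1
flat = torch.cat(
    [F.one_hot(chosen[:, i], num_classes=n).float() for i, n in enumerate(branches)],
    dim=1,
)                                # tensor([[0., 1., 0., 0., 1.]]), i.e. flattened_size == 5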

@staticmethod
def list_to_tensor(
ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = None
ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
) -> torch.Tensor:
"""
Converts a list of numpy arrays into a tensor. MUCH faster than

@staticmethod
def get_probs_and_entropy(
action_list: List[torch.Tensor], dists: List[DistInstance]
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> Tuple[List[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]:
log_probs_list = []
all_probs_list = []
entropies_list = []

entropies_list.append(entropy)
if isinstance(action_dist, DiscreteDistInstance):
all_probs_list.append(action_dist.all_log_prob())
log_probs = torch.cat(log_probs_list, dim=1)
entropies = torch.cat(entropies_list, dim=1)
entropies = torch.stack(entropies_list, dim=-1)
log_probs = log_probs.squeeze(-1)
all_probs = None
else:
all_probs = torch.cat(all_probs_list, dim=-1)
return log_probs, entropies, all_probs
return log_probs_list, entropies, all_probs_list
@staticmethod
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:

34
ml-agents/mlagents/trainers/trainer/rl_trainer.py


from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.tf.components.reward_signals import (
RewardSignalResult,
RewardSignal,
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.trajectory import Trajectory

from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
from mlagents import torch_utils
from mlagents import tf_utils
if torch_utils.is_available():
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver
if tf_utils.is_available():
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
TorchPolicy = None # type: ignore
TorchSaver = None # type: ignore
RewardSignalResults = Dict[str, RewardSignalResult]
TFPolicy = None # type: ignore
TFModelSaver = None # type: ignore
logger = get_logger(__name__)

StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
)
self.framework = self.trainer_settings.framework
if self.framework == FrameworkType.PYTORCH and not torch_utils.is_available():
if self.framework == FrameworkType.TENSORFLOW and not tf_utils.is_available():
"To use the experimental PyTorch backend, install the PyTorch Python package first."
"To use the TensorFlow backend, install the TensorFlow Python package first."
)
logger.debug(f"Using framework {self.framework.value}")

self.reward_buffer.appendleft(rewards.get(agent_id, 0))
rewards[agent_id] = 0
else:
if isinstance(optimizer.reward_signals[name], RewardSignal):
if isinstance(optimizer.reward_signals[name], BaseRewardProvider):
optimizer.reward_signals[name].stat_name,
f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward",
f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward",
optimizer.reward_signals[name].stat_name,
rewards.get(agent_id, 0),
)
rewards[agent_id] = 0

18
ml-agents/mlagents/trainers/trainer/trainer_factory.py


init_path: str = None,
multi_gpu: bool = False,
force_torch: bool = False,
force_tensorflow: bool = False,
):
"""
The TrainerFactory generates the Trainers based on the configuration passed as

:param init_path: Path from which to load model.
:param multi_gpu: If True, multi-gpu will be used. (currently not available)
:param force_torch: If True, the Trainers will all use the PyTorch framework
instead of the TensorFlow framework.
instead of what is specified in the config YAML.
:param force_tensorflow: If True, the Trainers will all use the TensorFlow
framework.
"""
self.trainer_config = trainer_config
self.output_path = output_path

self.multi_gpu = multi_gpu
self.ghost_controller = GhostController()
self._force_torch = force_torch
self._force_tf = force_tensorflow
def generate(self, behavior_name: str) -> Trainer:
if behavior_name not in self.trainer_config.keys():

trainer_settings = self.trainer_config[behavior_name]
if self._force_torch:
trainer_settings.framework = FrameworkType.PYTORCH
logger.warning(
"Note that specifying --torch is not required anymore as PyTorch is the default framework."
)
if self._force_tf:
trainer_settings.framework = FrameworkType.TENSORFLOW
logger.warning(
"Setting the framework to TensorFlow. TensorFlow trainers will be deprecated in the future."
)
if self._force_torch:
logger.warning(
"Both --torch and --tensorflow CLI options were specified. Using TensorFlow."
)
return TrainerFactory._initialize_trainer(
trainer_settings,
behavior_name,

10
ml-agents/mlagents/trainers/trainer_controller.py


import numpy as np
from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep

self.trainer_threads: List[threading.Thread] = []
self.kill_trainers = False
np.random.seed(training_seed)
tf.set_random_seed(training_seed)
if torch_utils.is_available():
torch_utils.torch.manual_seed(training_seed)
if tf_utils.is_available():
tf.set_random_seed(training_seed)
torch_utils.torch.manual_seed(training_seed)
self.rank = get_rank()
@timed

@timed
def start_learning(self, env_manager: EnvManager) -> None:
self._create_output_path(self.output_path)
tf.reset_default_graph()
if tf_utils.is_available():
tf.reset_default_graph()
try:
# Initial reset
self._reset_env(env_manager)

12
ml-agents/mlagents/trainers/training_status.py


import attr
import cattr
from mlagents.tf_utils import tf
from mlagents.torch_utils import torch
from mlagents.tf_utils import tf, is_available as tf_is_available
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import __version__
from mlagents.trainers.exception import TrainerError

STATUS_FORMAT_VERSION = "0.1.0"
STATUS_FORMAT_VERSION = "0.2.0"
class StatusType(Enum):

class StatusMetaData:
stats_format_version: str = STATUS_FORMAT_VERSION
mlagents_version: str = __version__
tensorflow_version: str = tf.__version__
torch_version: str = torch.__version__
tensorflow_version: str = tf.__version__ if tf_is_available() else -1
def to_dict(self) -> Dict[str, str]:
return cattr.unstructure(self)

if self.tensorflow_version != other.tensorflow_version:
logger.warning(
"Tensorflow checkpoint was saved with a different version of Tensorflow. Model may not resume properly."
)
if self.torch_version != other.torch_version:
logger.warning(
"PyTorch checkpoint was saved with a different version of PyTorch. Model may not resume properly."
)

Some files were not shown because too many files changed in this diff
