
Merge branch 'master' into develop-sac-apex

/develop/sac-apex
Ervin Teng, 4 years ago
Current commit
06fa3d39
113 changed files, with 1,991 additions and 966 deletions
  1. .pylintrc (2)
  2. .yamato/standalone-build-test.yml (4)
  3. .yamato/training-int-tests.yml (7)
  4. Dockerfile (8)
  5. Project/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs (30)
  6. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (2)
  7. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs (2)
  8. Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs (2)
  9. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs (5)
  10. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorSettings.cs (12)
  11. Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity (18)
  12. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (3)
  13. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs (2)
  14. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridSettings.cs (3)
  15. Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs (5)
  16. Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3)
  17. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs (3)
  18. Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs (3)
  19. Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs (2)
  20. Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (2)
  21. Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs (13)
  22. README.md (4)
  23. com.unity.ml-agents/CHANGELOG.md (22)
  24. com.unity.ml-agents/Runtime/Academy.cs (43)
  25. com.unity.ml-agents/Runtime/Agent.cs (3)
  26. com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs (13)
  27. com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs (184)
  28. com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs (5)
  29. com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs (20)
  30. com.unity.ml-agents/Runtime/SideChannels/EngineConfigurationChannel.cs (7)
  31. com.unity.ml-agents/Runtime/SideChannels/SideChannel.cs (2)
  32. com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (8)
  33. com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs (30)
  34. com.unity.ml-agents/Tests/Editor/SideChannelTests.cs (16)
  35. config/trainer_config.yaml (6)
  36. docs/Custom-SideChannels.md (10)
  37. docs/Getting-Started.md (23)
  38. docs/Installation.md (3)
  39. docs/Learning-Environment-Create-New.md (7)
  40. docs/Learning-Environment-Design-Agents.md (1)
  41. docs/Learning-Environment-Examples.md (1)
  42. docs/Learning-Environment-Executable.md (8)
  43. docs/Migrating.md (17)
  44. docs/Python-API.md (11)
  45. docs/Readme.md (1)
  46. docs/Training-Curriculum-Learning.md (4)
  47. docs/Training-Environment-Parameter-Randomization.md (6)
  48. docs/Training-ML-Agents.md (38)
  49. docs/Training-Self-Play.md (98)
  50. docs/Using-Docker.md (9)
  51. docs/Using-Tensorboard.md (7)
  52. gym-unity/README.md (2)
  53. gym-unity/gym_unity/envs/__init__.py (34)
  54. gym-unity/gym_unity/tests/test_gym.py (48)
  55. ml-agents-envs/mlagents_envs/communicator.py (2)
  56. ml-agents-envs/mlagents_envs/environment.py (117)
  57. ml-agents-envs/mlagents_envs/mock_communicator.py (3)
  58. ml-agents-envs/mlagents_envs/rpc_communicator.py (3)
  59. ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py (4)
  60. ml-agents-envs/mlagents_envs/side_channel/side_channel.py (4)
  61. ml-agents-envs/mlagents_envs/tests/test_envs.py (23)
  62. ml-agents/mlagents/model_serialization.py (5)
  63. ml-agents/mlagents/trainers/agent_processor.py (52)
  64. ml-agents/mlagents/trainers/behavior_id_utils.py (52)
  65. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (5)
  66. ml-agents/mlagents/trainers/curriculum.py (4)
  67. ml-agents/mlagents/trainers/distributions.py (38)
  68. ml-agents/mlagents/trainers/env_manager.py (15)
  69. ml-agents/mlagents/trainers/ghost/trainer.py (439)
  70. ml-agents/mlagents/trainers/learn.py (145)
  71. ml-agents/mlagents/trainers/meta_curriculum.py (4)
  72. ml-agents/mlagents/trainers/policy/nn_policy.py (1)
  73. ml-agents/mlagents/trainers/policy/tf_policy.py (34)
  74. ml-agents/mlagents/trainers/ppo/trainer.py (11)
  75. ml-agents/mlagents/trainers/sac/optimizer.py (5)
  76. ml-agents/mlagents/trainers/sac/trainer.py (11)
  77. ml-agents/mlagents/trainers/simple_env_manager.py (6)
  78. ml-agents/mlagents/trainers/stats.py (47)
  79. ml-agents/mlagents/trainers/subprocess_env_manager.py (124)
  80. ml-agents/mlagents/trainers/tests/simple_test_envs.py (80)
  81. ml-agents/mlagents/trainers/tests/test_agent_processor.py (43)
  82. ml-agents/mlagents/trainers/tests/test_distributions.py (10)
  83. ml-agents/mlagents/trainers/tests/test_ghost.py (36)
  84. ml-agents/mlagents/trainers/tests/test_learn.py (48)
  85. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (4)
  86. ml-agents/mlagents/trainers/tests/test_simple_rl.py (129)
  87. ml-agents/mlagents/trainers/tests/test_stats.py (30)
  88. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (55)
  89. ml-agents/mlagents/trainers/tests/test_trainer_util.py (22)
  90. ml-agents/mlagents/trainers/trainer/trainer.py (10)
  91. ml-agents/mlagents/trainers/trainer_controller.py (25)
  92. ml-agents/mlagents/trainers/trainer_util.py (41)
  93. ml-agents/setup.py (2)
  94. ml-agents/tests/yamato/training_int_tests.py (67)
  95. ml-agents/tests/yamato/yamato_utils.py (69)
  96. setup.cfg (1)
  97. .yamato/gym-interface-test.yml (32)
  98. .yamato/python-ll-api-test.yml (32)
  99. com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs (234)
  100. com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs.meta (11)

2
.pylintrc


# Appears to be https://github.com/PyCQA/pylint/issues/2981
W0201,
# Using the global statement
W0603,

4
.yamato/standalone-build-test.yml


- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
artifacts:
standalonebuild:
paths:
- "Project/testPlayer*/**"
{% endfor %}

7
.yamato/training-int-tests.yml


commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.training_int_tests
# Backwards-compatibility tests.
# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:

8
Dockerfile


WORKDIR /ml-agents
RUN pip install -e .
# port 5005 is the port used in in Editor training.
EXPOSE 5005
# Port 5004 is the port used in in Editor training.
# Environments will start from port 5005,
# so allow enough ports for several environments.
EXPOSE 5004-5050
ENTRYPOINT ["mlagents-learn"]
ENTRYPOINT ["xvfb-run", "--auto-servernum", "--server-args='-screen 0 640x480x24'", "mlagents-learn"]

30
Project/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs


{
public class StandaloneBuildTest
{
const string k_outputCommandLineFlag = "--mlagents-build-output-path";
const string k_sceneCommandLineFlag = "--mlagents-build-scene-path";
string[] scenes = { "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity" };
var buildResult = BuildPipeline.BuildPlayer(scenes, "testPlayer", BuildTarget.StandaloneOSX, BuildOptions.None);
// Read commandline arguments for options
var outputPath = "testPlayer";
var scenePath = "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity";
var args = Environment.GetCommandLineArgs();
for (var i = 0; i < args.Length - 1; i++)
{
if (args[i] == k_outputCommandLineFlag)
{
outputPath = args[i + 1];
Debug.Log($"Overriding output path to {outputPath}");
}
else if (args[i] == k_sceneCommandLineFlag)
{
scenePath = args[i + 1];
}
}
string[] scenes = { scenePath };
var buildResult = BuildPipeline.BuildPlayer(
scenes,
outputPath,
BuildTarget.StandaloneOSX,
BuildOptions.None
);
var isOk = buildResult.summary.result == BuildResult.Succeeded;
var error = "";
foreach (var stepInfo in buildResult.steps)

2
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

2
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs


public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

2
Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs


m_Rb = gameObject.GetComponent<Rigidbody>();
m_LookDir = Vector3.zero;
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

5
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


using UnityEngine;
using MLAgents;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class FoodCollectorAgent : Agent
{

public void SetLaserLengths()
{
m_LaserLength = Academy.Instance.FloatProperties.GetPropertyWithDefault("laser_length", 1.0f);
m_LaserLength = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("laser_length", 1.0f);
float agentScale = Academy.Instance.FloatProperties.GetPropertyWithDefault("agent_scale", 1.0f);
float agentScale = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("agent_scale", 1.0f);
gameObject.transform.localScale = new Vector3(agentScale, agentScale, agentScale);
}

12
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorSettings.cs


using UnityEngine;
using UnityEngine.UI;
using MLAgents;
using MLAgents.SideChannels;
public class FoodCollectorSettings : MonoBehaviour
{

public int totalScore;
public Text scoreText;
StatsSideChannel m_statsSideChannel;
m_statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
}
public void EnvironmentReset()

public void Update()
{
scoreText.text = $"Score: {totalScore}";
// Send stats via SideChannel so that they'll appear in TensorBoard.
// These values get averaged every summary_frequency steps, so we don't
// need to send every Update() call.
if ((Time.frameCount % 100)== 0)
{
m_statsSideChannel?.AddStat("TotalScore", totalScore);
}
}
}

18
Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.44971162, g: 0.49977726, b: 0.5756362, a: 1}
m_IndirectSpecularColor: {r: 0.44971168, g: 0.4997775, b: 0.57563686, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:

m_Father: {fileID: 363761400}
m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
m_AnchoredPosition: {x: -369.5, y: -62.2}
m_SizeDelta: {x: 160, y: 55.6}
m_AnchorMin: {x: 0, y: 1}
m_AnchorMax: {x: 0, y: 1}
m_AnchoredPosition: {x: 150, y: -230}
m_SizeDelta: {x: 160, y: 55.599976}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &918893360
MonoBehaviour:

m_Calls: []
m_FontData:
m_Font: {fileID: 10102, guid: 0000000000000000e000000000000000, type: 0}
m_FontSize: 20
m_FontSize: 22
m_FontStyle: 0
m_BestFit: 0
m_MinSize: 2

m_Father: {fileID: 363761400}
m_RootOrder: 2
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
m_AnchoredPosition: {x: -369.5, y: -197}
m_AnchorMin: {x: 0, y: 1}
m_AnchorMax: {x: 0, y: 1}
m_AnchoredPosition: {x: 150, y: -128}
m_SizeDelta: {x: 200, y: 152}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &1305247361

3
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs


using MLAgents;
using MLAgents.Sensors;
using UnityEngine.Serialization;
using MLAgents.SideChannels;
public class GridAgent : Agent
{

// Prevents the agent from picking an action that would make it collide with a wall
var positionX = (int)transform.position.x;
var positionZ = (int)transform.position.z;
var maxPosition = (int)Academy.Instance.FloatProperties.GetPropertyWithDefault("gridSize", 5f) - 1;
var maxPosition = (int)SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("gridSize", 5f) - 1;
if (positionX == 0)
{

2
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs


public void Start()
{
m_ResetParameters = Academy.Instance.FloatProperties;
m_ResetParameters = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
m_Objects = new[] { goalPref, pitPref };

3
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridSettings.cs


using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
public class GridSettings : MonoBehaviour
{

{
Academy.Instance.FloatProperties.RegisterCallback("gridSize", f =>
SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().RegisterCallback("gridSize", f =>
{
MainCamera.transform.position = new Vector3(-(f - 1) / 2f, f * 1.25f, -(f - 1) / 2f);
MainCamera.orthographicSize = (f + 5f) / 2f;

5
Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs


using System.Collections;
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
public class PushAgentBasic : Agent
{

public void SetGroundMaterialFriction()
{
var resetParams = Academy.Instance.FloatProperties;
var resetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var groundCollider = ground.GetComponent<Collider>();

public void SetBlockProperties()
{
var resetParams = Academy.Instance.FloatProperties;
var resetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var scale = resetParams.GetPropertyWithDefault("block_scale", 2);
//Set the scale of the block

3
Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs


using UnityEngine;
using MLAgents;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class ReacherAgent : Agent
{

public void SetResetParameters()
{
var fp = Academy.Instance.FloatProperties;
var fp = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
m_GoalSize = fp.GetPropertyWithDefault("goal_size", 5);
m_GoalSpeed = Random.Range(-1f, 1f) * fp.GetPropertyWithDefault("goal_speed", 1);
m_Deviation = fp.GetPropertyWithDefault("deviation", 0);

3
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs


using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
namespace MLAgentsExamples
{

Physics.defaultSolverIterations = solverIterations;
Physics.defaultSolverVelocityIterations = solverVelocityIterations;
Academy.Instance.FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public void OnDestroy()

3
Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs


using System.Collections;
using System.Collections.Generic;
using MLAgents;
using MLAgents.SideChannels;
using UnityEngine;
using UnityEngine.Serialization;

ballRb.velocity = Vector3.zero;
ballRb.angularVelocity = Vector3.zero;
var ballScale = Academy.Instance.FloatProperties.GetPropertyWithDefault("ball_scale", 0.015f);
var ballScale = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().GetPropertyWithDefault("ball_scale", 0.015f);
ballRb.transform.localScale = new Vector3(ballScale, ballScale, ballScale);
}
}

2
Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


m_BallRb = ball.GetComponent<Rigidbody>();
var canvas = GameObject.Find(k_CanvasName);
GameObject scoreBoard;
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
if (invertX)
{
scoreBoard = canvas.transform.Find(k_ScoreBoardBName).gameObject;

2
Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs


m_ChestRb = chest.GetComponent<Rigidbody>();
m_SpineRb = spine.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;
m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
SetResetParameters();
}

13
Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs


using MLAgents;
using Barracuda;
using MLAgents.Sensors;
using MLAgents.SideChannels;
public class WallJumpAgent : Agent
{

Vector3 m_JumpTargetPos;
Vector3 m_JumpStartingPos;
FloatPropertiesChannel m_FloatProperties;
public override void Initialize()
{
m_WallJumpSettings = FindObjectOfType<WallJumpSettings>();

m_GroundMaterial = m_GroundRenderer.material;
spawnArea.SetActive(false);
m_FloatProperties = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
}
// Begin the jump sequence

{
localScale = new Vector3(
localScale.x,
Academy.Instance.FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
m_FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
localScale.z);
wall.transform.localScale = localScale;
SetModel("SmallWallJump", noWallBrain);

localScale = new Vector3(
localScale.x,
Academy.Instance.FloatProperties.GetPropertyWithDefault("small_wall_height", 4),
m_FloatProperties.GetPropertyWithDefault("small_wall_height", 4),
localScale.z);
wall.transform.localScale = localScale;
SetModel("SmallWallJump", smallWallBrain);

var min = Academy.Instance.FloatProperties.GetPropertyWithDefault("big_wall_min_height", 8);
var max = Academy.Instance.FloatProperties.GetPropertyWithDefault("big_wall_max_height", 8);
var min = m_FloatProperties.GetPropertyWithDefault("big_wall_min_height", 8);
var max = m_FloatProperties.GetPropertyWithDefault("big_wall_max_height", 8);
var height = min + Random.value * (max - min);
localScale = new Vector3(
localScale.x,

4
README.md


* Train using concurrent Unity environment instances
## Releases & Documentation
**Our latest, stable release is 0.15.0. Click
**Our latest, stable release is 0.15.1. Click
get started with the latest release of ML-Agents.**
The table below lists all our releases, including our `master` branch which is under active

| **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
|:-------:|:------:|:-------------:|:-------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) |
| **0.15.1** | **March 30, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.1.zip)** |
| **0.15.0** | **March 18, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.0.zip)** |
| **0.14.1** | February 26, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.1.zip) |
| **0.14.0** | February 13, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.0.zip) |

22
com.unity.ml-agents/CHANGELOG.md


## [Unreleased]
### Major Changes
- The `--load` and `--train` command-line flags have been deprecated. Training now happens by default, and
use `--resume` to resume training instead. (#3705)
- The Jupyter notebooks have been removed from the repository.
- Introduced the `SideChannelUtils` to register, unregister and access side channels.
- `Academy.FloatProperties` was removed, please use `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- Added a feature to allow sending stats from C# environments to TensorBoard (and other python StatsWriters). To do this from your code, use `SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)` (#3660)
- The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
- Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
- Running `mlagents-learn` with the same `--run-id` twice will no longer overwrite the existing files. (#3705)
- `StackingSensor` was changed from `internal` visibility to `public`
## [0.15.1-preview] - 2020-03-30
### Bug Fixes
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
- Fixed the reported entropy values for continuous actions (#3684)
- Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
- Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
- Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
- Fixed an issue where logging output was not visible; logging levels are now set consistently. (#3703)
## [0.15.0-preview] - 2020-03-18
### Major Changes

43
com.unity.ml-agents/Runtime/Academy.cs


/// </summary>
public static Academy Instance { get { return s_Lazy.Value; } }
/// <summary>
/// Collection of float properties (indexed by a string).
/// </summary>
public FloatPropertiesChannel FloatProperties;
// Fields not provided in the Inspector.
/// <summary>

}
/// <summary>
/// Registers SideChannel to the Academy to send and receive data with Python.
/// If IsCommunicatorOn is false, the SideChannel will not be registered.
/// </summary>
/// <param name="channel"> The side channel to be registered.</param>
public void RegisterSideChannel(SideChannel channel)
{
LazyInitialize();
Communicator?.RegisterSideChannel(channel);
}
/// <summary>
/// Unregisters SideChannel to the Academy. If the side channel was not registered,
/// nothing will happen.
/// </summary>
/// <param name="channel"> The side channel to be unregistered.</param>
public void UnregisterSideChannel(SideChannel channel)
{
Communicator?.UnregisterSideChannel(channel);
}
/// <summary>
/// Disable stepping of the Academy during the FixedUpdate phase. If this is called, the Academy must be
/// stepped manually by the user by calling Academy.EnvironmentStep().
/// </summary>

{
EnableAutomaticStepping();
var floatProperties = new FloatPropertiesChannel();
FloatProperties = floatProperties;
SideChannelUtils.RegisterSideChannel(new EngineConfigurationChannel());
SideChannelUtils.RegisterSideChannel(new FloatPropertiesChannel());
SideChannelUtils.RegisterSideChannel(new StatsSideChannel());
// Try to launch the communicator by using the arguments passed at launch
var port = ReadPortFromArgs();

if (Communicator != null)
{
Communicator.RegisterSideChannel(new EngineConfigurationChannel());
Communicator.RegisterSideChannel(floatProperties);
// We try to exchange the first message with Python. If this fails, it means
// no Python Process is ready to train the environment. In this case, the
//environment must use Inference.

DecideAction?.Invoke();
}
// If the communicator is not on, we need to clear the SideChannel sending queue
if (!IsCommunicatorOn)
{
SideChannelUtils.GetSideChannelMessage();
}
using (TimerStack.Instance.Scoped("AgentAct"))
{
AgentAct?.Invoke();

Communicator?.Dispose();
Communicator = null;
SideChannelUtils.UnregisterAllSideChannels();
if (m_ModelRunners != null)
{

// TODO - Pass worker ID or some other identifier,
// so that multiple envs won't overwrite each others stats.
TimerStack.Instance.SaveJsonTimers();
FloatProperties = null;
m_Initialized = false;
// Reset the Lazy instance
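To summarize the Academy and CHANGELOG changes above: the Academy now registers the EngineConfigurationChannel, FloatPropertiesChannel, and StatsSideChannel itself, and user code retrieves them through SideChannelUtils instead of `Academy.FloatProperties`. Below is a minimal sketch of the resulting usage pattern, assuming an ML-Agents project at this revision; the class and field names are illustrative and not part of this commit.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative MonoBehaviour: reads a reset parameter and reports a custom stat.
public class SideChannelUsageExample : MonoBehaviour
{
    FloatPropertiesChannel m_ResetParams;
    StatsSideChannel m_Stats;

    void Start()
    {
        // Both channels are registered by the Academy during initialization (see above),
        // so user code only needs to look them up.
        m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
        m_Stats = SideChannelUtils.GetSideChannel<StatsSideChannel>();
    }

    void Update()
    {
        var scale = m_ResetParams.GetPropertyWithDefault("agent_scale", 1.0f);
        // Stats are averaged on the Python side, so there is no need to send them every frame.
        if (Time.frameCount % 100 == 0)
        {
            m_Stats?.AddStat("AgentScale", scale);
        }
    }
}
```

Note that the example scripts in this commit (e.g. WallJumpAgent) cache the returned FloatPropertiesChannel in `Initialize()` rather than fetching it on every access.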

3
com.unity.ml-agents/Runtime/Agent.cs


void NotifyAgentDone(DoneReason doneReason)
{
m_Info.episodeId = m_EpisodeId;
m_Info.reward = m_Reward;
m_Info.done = true;
m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;

// If everything is the same, don't make any changes.
return;
}
NotifyAgentDone(DoneReason.Disabled);
m_PolicyFactory.model = model;
m_PolicyFactory.inferenceDevice = inferenceDevice;
m_PolicyFactory.behaviorName = behaviorName;

13
com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs


/// <param name="agentId">A key to identify which Agent actions to get.</param>
/// <returns></returns>
float[] GetActions(string key, int agentId);
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
void RegisterSideChannel(SideChannel sideChannel);
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
void UnregisterSideChannel(SideChannel sideChannel);
}
}

184
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


using MLAgents.Sensors;
using MLAgents.Policies;
using MLAgents.SideChannels;
using System.IO;
using Google.Protobuf;
namespace MLAgents

#endif
/// The communicator parameters sent at construction
CommunicatorInitParameters m_CommunicatorInitParameters;
Dictionary<Guid, SideChannel> m_SideChannels = new Dictionary<Guid, SideChannel>();
/// <summary>
/// Initializes a new instance of the RPCCommunicator class.

void UpdateEnvironmentWithInput(UnityRLInputProto rlInput)
{
ProcessSideChannelData(m_SideChannels, rlInput.SideChannel.ToArray());
SideChannelUtils.ProcessSideChannelData(rlInput.SideChannel.ToArray());
SendCommandEvent(rlInput.Command);
}

message.RlInitializationOutput = tempUnityRlInitializationOutput;
}
byte[] messageAggregated = GetSideChannelMessage(m_SideChannels);
byte[] messageAggregated = SideChannelUtils.GetSideChannelMessage();
message.RlOutput.SideChannel = ByteString.CopyFrom(messageAggregated);
var input = Exchange(message);

{
if (m_CurrentUnityRlOutput.AgentInfos.ContainsKey(behaviorName))
{
if (output == null)
if (m_CurrentUnityRlOutput.AgentInfos[behaviorName].CalculateSize() > 0)
output = new UnityRLInitializationOutputProto();
}
// Only send the BrainParameters if there is a non empty list of
// AgentInfos ready to be sent.
// This is to ensure that The Python side will always have a first
// observation when receiving the BrainParameters
if (output == null)
{
output = new UnityRLInitializationOutputProto();
}
var brainParameters = m_UnsentBrainKeys[behaviorName];
output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
var brainParameters = m_UnsentBrainKeys[behaviorName];
output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
}
}
}

{
m_SentBrainKeys.Add(brainProto.BrainName);
m_UnsentBrainKeys.Remove(brainProto.BrainName);
}
}
#endregion
#region Handling side channels
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
public void RegisterSideChannel(SideChannel sideChannel)
{
var channelId = sideChannel.ChannelId;
if (m_SideChannels.ContainsKey(channelId))
{
throw new UnityAgentsException(string.Format(
"A side channel with type index {0} is already registered. You cannot register multiple " +
"side channels of the same id.", channelId));
}
// Process any messages that we've already received for this channel ID.
var numMessages = m_CachedMessages.Count;
for (int i = 0; i < numMessages; i++)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (channelId == cachedMessage.ChannelId)
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannel.OnMessageReceived(incomingMsg);
}
}
else
{
m_CachedMessages.Enqueue(cachedMessage);
}
}
m_SideChannels.Add(channelId, sideChannel);
}
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
public void UnregisterSideChannel(SideChannel sideChannel)
{
if (m_SideChannels.ContainsKey(sideChannel.ChannelId))
{
m_SideChannels.Remove(sideChannel.ChannelId);
}
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a singe byte array.
/// </summary>
/// <param name="sideChannels"> A dictionary of channel type to channel.</param>
/// <returns></returns>
public static byte[] GetSideChannelMessage(Dictionary<Guid, SideChannel> sideChannels)
{
using (var memStream = new MemoryStream())
{
using (var binaryWriter = new BinaryWriter(memStream))
{
foreach (var sideChannel in sideChannels.Values)
{
var messageList = sideChannel.MessageQueue;
foreach (var message in messageList)
{
binaryWriter.Write(sideChannel.ChannelId.ToByteArray());
binaryWriter.Write(message.Count());
binaryWriter.Write(message);
}
sideChannel.MessageQueue.Clear();
}
return memStream.ToArray();
}
}
}
private struct CachedSideChannelMessage
{
public Guid ChannelId;
public byte[] Message;
}
private static Queue<CachedSideChannelMessage> m_CachedMessages = new Queue<CachedSideChannelMessage>();
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="sideChannels">A dictionary of channel type to channel.</param>
/// <param name="dataReceived">The byte array of data received from Python.</param>
public static void ProcessSideChannelData(Dictionary<Guid, SideChannel> sideChannels, byte[] dataReceived)
{
while (m_CachedMessages.Count != 0)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (sideChannels.ContainsKey(cachedMessage.ChannelId))
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannels[cachedMessage.ChannelId].OnMessageReceived(incomingMsg);
}
}
else
{
Debug.Log(string.Format(
"Unknown side channel data received. Channel Id is "
+ ": {0}", cachedMessage.ChannelId));
}
}
if (dataReceived.Length == 0)
{
return;
}
using (var memStream = new MemoryStream(dataReceived))
{
using (var binaryReader = new BinaryReader(memStream))
{
while (memStream.Position < memStream.Length)
{
Guid channelId = Guid.Empty;
byte[] message = null;
try
{
channelId = new Guid(binaryReader.ReadBytes(16));
var messageLength = binaryReader.ReadInt32();
message = binaryReader.ReadBytes(messageLength);
}
catch (Exception ex)
{
throw new UnityAgentsException(
"There was a problem reading a message in a SideChannel. Please make sure the " +
"version of MLAgents in Unity is compatible with the Python version. Original error : "
+ ex.Message);
}
if (sideChannels.ContainsKey(channelId))
{
using (var incomingMsg = new IncomingMessage(message))
{
sideChannels[channelId].OnMessageReceived(incomingMsg);
}
}
else
{
// Don't recognize this ID, but cache it in case the SideChannel that can handle
// it is registered before the next call to ProcessSideChannelData.
m_CachedMessages.Enqueue(new CachedSideChannelMessage
{
ChannelId = channelId,
Message = message
});
}
}
}
}
}

5
com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs


public void RequestDecision(AgentInfo info, List<ISensor> sensors)
{
StepSensors(sensors);
m_LastDecision = m_Heuristic.Invoke();
if (!info.done)
{
m_LastDecision = m_Heuristic.Invoke();
}
}
/// <inheritdoc />

20
com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs


/// For example, 4 stacked sets of observations would be output like
/// | t = now - 3 | t = now -3 | t = now - 2 | t = now |
/// Internally, a circular buffer of arrays is used. The m_CurrentIndex represents the most recent observation.
///
/// Currently, compressed and multidimensional observations are not supported.
internal class StackingSensor : ISensor
public class StackingSensor : ISensor
{
/// <summary>
/// The wrapped sensor.

WriteAdapter m_LocalAdapter = new WriteAdapter();
/// <summary>
///
/// Initializes the sensor.
/// </summary>
/// <param name="wrapped">The wrapped sensor.</param>
/// <param name="numStackedObservations">Number of stacked observations to keep.</param>

m_Name = $"StackingSensor_size{numStackedObservations}_{wrapped.GetName()}";
if (wrapped.GetCompressionType() != SensorCompressionType.None)
{
throw new UnityAgentsException("StackingSensor doesn't support compressed observations.'");
}
if (shape.Length != 1)
{
throw new UnityAgentsException("Only 1-D observations are supported by StackingSensor");
}
m_Shape = new int[shape.Length];
m_UnstackedObservationSize = wrapped.ObservationSize();

}
}
/// <inheritdoc/>
public int Write(WriteAdapter adapter)
{
// First, call the wrapped sensor's write method. Make sure to use our own adapter, not the passed one.

m_CurrentIndex = (m_CurrentIndex + 1) % m_NumStackedObservations;
}
/// <inheritdoc/>
/// <inheritdoc/>
/// <inheritdoc/>
/// <inheritdoc/>
public virtual SensorCompressionType GetCompressionType()
{
return SensorCompressionType.None;

7
com.unity.ml-agents/Runtime/SideChannels/EngineConfigurationChannel.cs


/// </summary>
public class EngineConfigurationChannel : SideChannel
{
private const string k_EngineConfigId = "e951342c-4f7e-11ea-b238-784f4387d1f7";
const string k_EngineConfigId = "e951342c-4f7e-11ea-b238-784f4387d1f7";
/// Initializes the side channel.
/// Initializes the side channel. The constructor is internal because only one instance is
/// supported at a time, and is created by the Academy.
public EngineConfigurationChannel()
internal EngineConfigurationChannel()
{
ChannelId = new Guid(k_EngineConfigId);
}

2
com.unity.ml-agents/Runtime/SideChannels/SideChannel.cs


using System.Collections.Generic;
using System;
using System.IO;
using System.Text;
namespace MLAgents.SideChannels
{

8
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


using System.Collections.Generic;
using MLAgents.Sensors;
using MLAgents.Policies;
using MLAgents.SideChannels;
namespace MLAgents.Tests
{

Assert.AreEqual(0, aca.EpisodeCount);
Assert.AreEqual(0, aca.StepCount);
Assert.AreEqual(0, aca.TotalStepCount);
Assert.AreNotEqual(null, aca.FloatProperties);
Assert.AreNotEqual(null, SideChannelUtils.GetSideChannel<FloatPropertiesChannel>());
// Check that Dispose is idempotent
aca.Dispose();

[Test]
public void TestAcademyDispose()
{
var floatProperties1 = Academy.Instance.FloatProperties;
var floatProperties1 = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
var floatProperties2 = Academy.Instance.FloatProperties;
Academy.Instance.LazyInitialize();
var floatProperties2 = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
Academy.Instance.Dispose();
Assert.AreNotEqual(floatProperties1, floatProperties2);

30
com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs


}
}
// Simple SensorComponent that sets up a StackingSensor
class StackingComponent : SensorComponent
{
public SensorComponent wrappedComponent;
public int numStacks;
public override ISensor CreateSensor()
{
var wrappedSensor = wrappedComponent.CreateSensor();
return new StackingSensor(wrappedSensor, numStacks);
}
public override int[] GetObservationShape()
{
int[] shape = (int[]) wrappedComponent.GetObservationShape().Clone();
for (var i = 0; i < shape.Length; i++)
{
shape[i] *= numStacks;
}
return shape;
}
}
[Test]
public void CheckSetupAgent()

sensorComponent.sensorName = "ray3d";
sensorComponent.detectableTags = new List<string> { "Player", "Respawn" };
sensorComponent.raysPerDirection = 3;
// Make a StackingSensor that wraps the RayPerceptionSensorComponent3D
// This isn't necessarily practical, just to ensure that it can be done
var wrappingSensorComponent = gameObject.AddComponent<StackingComponent>();
wrappingSensorComponent.wrappedComponent = sensorComponent;
wrappingSensorComponent.numStacks = 3;
// ISensor isn't set up yet.
Assert.IsNull(sensorComponent.raySensor);

16
com.unity.ml-agents/Tests/Editor/SideChannelTests.cs


intSender.SendInt(5);
intSender.SendInt(6);
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
Assert.AreEqual(intReceiver.messagesReceived[0], 4);
Assert.AreEqual(intReceiver.messagesReceived[1], 5);

strSender.SendRawBytes(Encoding.ASCII.GetBytes(str1));
strSender.SendRawBytes(Encoding.ASCII.GetBytes(str2));
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
var messages = strReceiver.GetAndClearReceivedMessages();

tmp = propB.GetPropertyWithDefault(k2, 3.0f);
Assert.AreEqual(tmp, 1.0f);
byte[] fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
byte[] fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
tmp = propA.GetPropertyWithDefault(k2, 3.0f);
Assert.AreEqual(tmp, 1.0f);

Assert.AreEqual(wasCalled, 0);
fakeData = RpcCommunicator.GetSideChannelMessage(dictSender);
RpcCommunicator.ProcessSideChannelData(dictReceiver, fakeData);
fakeData = SideChannelUtils.GetSideChannelMessage(dictSender);
SideChannelUtils.ProcessSideChannelData(dictReceiver, fakeData);
Assert.AreEqual(wasCalled, 1);
var keysA = propA.ListProperties();

6
config/trainer_config.yaml


time_horizon: 1000
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
Soccer:
normalize: false

num_layers: 2
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
CrawlerStatic:
normalize: true

10
docs/Custom-SideChannels.md


`base.QueueMessageToSend(msg)` method inside the side channel, and call the
`OutgoingMessage.Dispose()` method.
To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
To register a side channel on the Unity side, call `SideChannelUtils.RegisterSideChannel` with the side channel
as only argument.
### Python side

// When a Debug.Log message is created, we send it to the stringChannel
Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
// The channel must be registered with the Academy
Academy.Instance.RegisterSideChannel(stringChannel);
// The channel must be registered with the SideChannelUtils class
SideChannelUtils.RegisterSideChannel(stringChannel);
}
public void OnDestroy()

if (Academy.IsInitialized){
Academy.Instance.UnregisterSideChannel(stringChannel);
SideChannelUtils.UnregisterSideChannel(stringChannel);
}
}

string_log = StringLogChannel()
# We start the communication with the Unity Editor and pass the string_log side channel as input
env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
env = UnityEnvironment(side_channels=[string_log])
env.reset()
string_log.send_string("The environment was reset")
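Pulling the Unity-side fragments above into one place, here is a condensed sketch of the new registration flow. Only the `SideChannelUtils.RegisterSideChannel` / `UnregisterSideChannel` calls, the `Application.logMessageReceived` hookup, and the `Academy.IsInitialized` guard come from the doc; the channel class, its GUID, and the wrapper MonoBehaviour are placeholders for illustration.

```csharp
using System;
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Placeholder custom channel: forwards Unity error logs to Python and logs replies.
public class StringLogSideChannel : SideChannel
{
    public StringLogSideChannel()
    {
        // Illustrative GUID; it must match the channel id used on the Python side.
        ChannelId = new Guid("12345678-1234-1234-1234-123456789abc");
    }

    public override void OnMessageReceived(IncomingMessage msg)
    {
        Debug.Log("From Python: " + msg.ReadString());
    }

    // Signature matches Application.LogCallback so it can be added to logMessageReceived.
    public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
    {
        if (type == LogType.Error)
        {
            using (var msg = new OutgoingMessage())
            {
                msg.WriteString(stackTrace);
                QueueMessageToSend(msg);
            }
        }
    }
}

// Placeholder MonoBehaviour that owns the channel's lifetime.
public class RegisterStringLogSideChannel : MonoBehaviour
{
    StringLogSideChannel stringChannel;

    public void Awake()
    {
        stringChannel = new StringLogSideChannel();
        // When a Debug.Log message is created, we send it to the stringChannel.
        Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
        // The channel must be registered with the SideChannelUtils class.
        SideChannelUtils.RegisterSideChannel(stringChannel);
    }

    public void OnDestroy()
    {
        Application.logMessageReceived -= stringChannel.SendDebugStatementToPython;
        if (Academy.IsInitialized)
        {
            SideChannelUtils.UnregisterSideChannel(stringChannel);
        }
    }
}
```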

23
docs/Getting-Started.md


Depending on your version of Unity, it may be necessary to change the **Scripting Runtime Version** of your project. This can be done as follows:
1. Launch Unity
2. On the Projects dialog, choose the **Open** option at the top of the window.
1. Launch Unity Hub
2. On the Projects dialog, choose the **Add** option at the top of the window.
3. Using the file dialog that opens, locate the `Project` folder
within the ML-Agents toolkit project and click **Open**.
4. Go to **Edit** > **Project Settings** > **Player**

2. Navigate to the folder where you cloned the ML-Agents toolkit repository.
**Note**: If you followed the default [installation](Installation.md), then
you should be able to run `mlagents-learn` from any directory.
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier> --train`
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier>`
training runs
- `--train` tells `mlagents-learn` to run a training session (rather
than inference)
training runs. Make sure to use one that hasn't been used already!
mlagents-learn config/trainer_config.yaml --run-id=firstRun --train
mlagents-learn config/trainer_config.yaml --run-id=firstRun
```
5. When the message _"Start training by pressing the Play button in the Unity

**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.
The `--train` flag tells the ML-Agents toolkit to run in training mode.
The `--time-scale=100` sets the `Time.TimeScale` value in Unity.
**Note**: You can train using an executable rather than the Editor. To do so,

command-line prompt. If you close the window manually, the `.nn` file
containing the trained model is not exported into the ml-agents folder.
You can press Ctrl+C to stop the training, and your trained model will be at
`models/<run-identifier>/<behavior_name>.nn` where
If you've quit the training early using Ctrl+C and want to resume training, run the
same command again, appending the `--resume` flag:
```sh
mlagents-learn config/trainer_config.yaml --run-id=firstRun --resume
```
Your trained model will be at `models/<run-identifier>/<behavior_name>.nn` where
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding to the model.
(**Note:** There is a known bug on Windows that causes the saving of the model to
fail when you early terminate the training, it's recommended to wait until Step

3
docs/Installation.md


By installing the `mlagents` package, the dependencies listed in the
[setup.py file](../ml-agents/setup.py) are also installed. These include
[TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support) and
[Jupyter](Background-Jupyter.md).
[TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support).
#### Advanced: Installing for Development

7
docs/Learning-Environment-Create-New.md


includes a convenient Monitor class that you can use to easily display Agent
status information in the Game window.
One additional test you can perform is to first ensure that your environment and
the Python API work as expected using the `notebooks/getting-started.ipynb`
[Jupyter notebook](Background-Jupyter.md). Within the notebook, be sure to set
`env_name` to the name of the environment file you specify when building this
environment.
## Training the Environment

To train in the editor, run the following Python command from a Terminal or Console
window before pressing play:
mlagents-learn config/config.yaml --run-id=RollerBall-1 --train
mlagents-learn config/config.yaml --run-id=RollerBall-1
(where `config.yaml` is a copy of `trainer_config.yaml` that you have edited
to change the `batch_size` and `buffer_size` hyperparameters for your trainer.)

1
docs/Learning-Environment-Design-Agents.md


```csharp
normalizedValue = (currentValue - minValue)/(maxValue - minValue)
```
:warning: For vectors, you should apply the above formula to each component (x, y, and z). Note that this is *not* the same as using the `Vector3.normalized` property or `Vector3.Normalize()` method in Unity (and similar for `Vector2`).
Rotations and angles should also be normalized. For angles between 0 and 360
degrees, you can use the following formulas:

1
docs/Learning-Environment-Examples.md


* Goal:
* Get the ball into the opponent's goal while preventing
the ball from entering own goal.
* Goalie:
* Agents: The environment contains four agents, with the same
Behavior Parameters : Soccer.
* Agent Reward Function (dependent):

8
docs/Learning-Environment-Executable.md


followed the default [installation](Installation.md), then navigate to the
`ml-agents/` folder.
3. Run
`mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train`
`mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>`
Where:
* `<trainer-config-file>` is the file path of the trainer configuration yaml
* `<env_name>` is the name and path to the executable you exported from Unity

* And the `--train` tells `mlagents-learn` to run a training session (rather
than inference)
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun --train
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run --train
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run
▄▄▄▓▓▓▓

17
docs/Migrating.md


## Migrating from 0.15 to latest
### Important changes
* The `--load` and `--train` command-line flags have been deprecated and replaced with `--resume` and `--inference`.
* Running with the same `--run-id` twice will now throw an error.
* The `play_against_current_self_ratio` self-play trainer hyperparameter has been renamed to `play_against_latest_model_ratio`
* Replace the `--load` flag with `--resume` when calling `mlagents-learn`, and don't use the `--train` flag as training
will happen by default. To run without training, use `--inference`.
* To force-overwrite files from a pre-existing run, add the `--force` command-line flag.
* The Jupyter notebooks have been removed from the repository.
* `Academy.FloatProperties` was removed.
* `Academy.RegisterSideChannel` and `Academy.UnregisterSideChannel` were removed.
### Steps to Migrate
* Replace `Academy.FloatProperties` with `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()`.
* Replace `Academy.RegisterSideChannel` with `SideChannelUtils.RegisterSideChannel()`.
* Replace `Academy.UnregisterSideChannel` with `SideChannelUtils.UnregisterSideChannel`.
## Migrating from 0.14 to 0.15

* The interface for SideChannels was changed:
* In C#, `OnMessageReceived` now takes a `IncomingMessage` argument, and `QueueMessageToSend` takes an `OutgoingMessage` argument.
* In python, `on_message_received` now takes a `IncomingMessage` argument, and `queue_message_to_send` takes an `OutgoingMessage` argument.
* Automatic stepping for Academy is now controlled from the AutomaticSteppingEnabled property.
### Steps to Migrate
* Add the `using MLAgents.Sensors;` in addition to `using MLAgents;` on top of your Agent's script.

* We strongly recommend replacing the following methods with their new equivalent as they will be removed in a later release:
* `InitializeAgent()` to `Initialize()`
* `AgentAction()` to `OnActionReceived()`
* `AgentReset()` to `OnEpsiodeBegin()`
* `AgentReset()` to `OnEpisodeBegin()`
* Replace calls to Academy.EnableAutomaticStepping()/DisableAutomaticStepping() with Academy.AutomaticSteppingEnabled = true/false.
## Migrating from 0.13 to 0.14
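As a concrete illustration of the first two migration steps above, here is a before/after sketch for an agent that reads a reset parameter. The API calls are the ones listed in the migration steps; the class, field, parameter name, and the use made of the value are illustrative.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative agent showing the FloatProperties / side channel registration migration.
public class MigratedAgent : Agent
{
    FloatPropertiesChannel m_ResetParams;

    public override void Initialize()
    {
        // Before (0.15.x): m_ResetParams = Academy.Instance.FloatProperties;
        // Before (0.15.x): Academy.Instance.RegisterSideChannel(someCustomChannel);
        // After this change:
        m_ResetParams = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
        // SideChannelUtils.RegisterSideChannel(someCustomChannel);
        SetResetParameters();
    }

    void SetResetParameters()
    {
        // "scale" is an illustrative reset parameter name.
        var scale = m_ResetParams.GetPropertyWithDefault("scale", 1.0f);
        transform.localScale = new Vector3(scale, scale, scale);
    }
}
```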

11
docs/Python-API.md


The ML-Agents Toolkit Low Level API is a Python API for controlling the simulation
loop of an environment or game built with Unity. This API is used by the
training algorithms inside the ML-Agent Toolkit, but you can also write your own
Python programs using this API. Go [here](../notebooks/getting-started.ipynb)
for a Jupyter Notebook walking through the functionality of the API.
Python programs using this API.
The key objects in the Python API include:

```python
from mlagents_envs.environment import UnityEnvironment
env = UnityEnvironment(file_name="3DBall", base_port=5005, seed=1, side_channels=[])
env = UnityEnvironment(file_name="3DBall", seed=1, side_channels=[])
```
- `file_name` is the name of the environment binary (located in the root

channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port = UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels = [channel])
env = UnityEnvironment(side_channels=[channel])
channel.set_configuration_parameters(time_scale = 2.0)

channel = FloatPropertiesChannel()
env = UnityEnvironment(base_port = UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels = [channel])
env = UnityEnvironment(side_channels=[channel])
channel.set_property("parameter_1", 2.0)

Once a property has been modified in Python, you can access it in C# after the next call to `step` as follows:
```csharp
var sharedProperties = Academy.Instance.FloatProperties;
var sharedProperties = SideChannelUtils.GetSideChannel<FloatPropertiesChannel>();
float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
```

1
docs/Readme.md


## Installation & Set-up
* [Installation](Installation.md)
* [Background: Jupyter Notebooks](Background-Jupyter.md)
* [Using Virtual Environment](Using-Virtual-Environment.md)
## Getting Started

4
docs/Training-Curriculum-Learning.md


In order to define the curricula, the first step is to decide which parameters of
the environment will vary. In the case of the Wall Jump environment,
the height of the wall is what varies. We define this as a `Shared Float Property`
that can be accessed in `Academy.Instance.FloatProperties`, and by doing
that can be accessed in `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()`, and by doing
so it becomes adjustable via the Python API.
Rather than adjusting it by hand, we will create a YAML file which
describes the structure of the curricula. Within it, we can specify which

to train agents in the Wall Jump environment with curriculum learning, we can run:
```sh
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall_jump.yaml --run-id=wall-jump-curriculum --train
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall_jump.yaml --run-id=wall-jump-curriculum
```
We can then keep track of the current lessons and progresses via TensorBoard.
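For reference, a short sketch of consuming such a curriculum parameter on the C# side, following the `RegisterCallback` pattern that appears in the GridSettings and ProjectSettingsOverrides hunks above; the parameter name and the reaction to it are illustrative.

```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Illustrative listener: react whenever the curriculum (or Python API) updates a float parameter.
public class CurriculumParameterListener : MonoBehaviour
{
    public void Awake()
    {
        SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()
            .RegisterCallback("big_wall_max_height", height =>
            {
                Debug.Log($"Curriculum set big_wall_max_height to {height}");
            });
    }
}
```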

6
docs/Training-Environment-Parameter-Randomization.md


To enable variations in the environments, we implemented `Environment Parameters`.
`Environment Parameters` are `Academy.Instance.FloatProperties` that can be read when setting
`Environment Parameters` are values in the `FloatPropertiesChannel` that can be read when setting
up the environment. We
also included different sampling methods and the ability to create new kinds of
sampling methods for each `Environment Parameter`. In the 3D ball environment example displayed

environment with a new sample of `Environment Parameters`.
* `Environment Parameter` - Name of the `Environment Parameter` like `mass`, `gravity` and `scale`. This should match the name
specified in the `FloatProperties` of the environment being trained. If a parameter specified in the file doesn't exist in the
specified in the `FloatPropertiesChannel` of the environment being trained. If a parameter specified in the file doesn't exist in the
environment, then this parameter will be ignored. Within each `Environment Parameter`
* `sampler-type` - Specify the sampler type to use for the `Environment Parameter`.

```sh
mlagents-learn config/trainer_config.yaml --sampler=config/3dball_randomize.yaml
--run-id=3D-Ball-randomize --train
--run-id=3D-Ball-randomize
```
We can observe progress and metrics via Tensorboard.

38
docs/Training-ML-Agents.md


The basic command for training is:
```sh
mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train
mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>
```
where

environment you built in step 1:
```sh
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1 --train
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1
```
During a training session, the training program prints out and saves updates at

`models/cob_1/CatsOnBicycles_cob_1.nn`.
While this example used the default training hyperparameters, you can edit the
[training_config.yaml file](#training-config-file) with a text editor to set
[trainer_config.yaml file](#training-config-file) with a text editor to set
To interrupt training and save the current progress, hit Ctrl+C once and wait for the
model to be saved out.
### Loading an Existing Model
If you've quit training early using Ctrl+C, you can resume the training run by running
`mlagents-learn` again, specifying the same `<run-identifier>` and appending the `--resume` flag
to the command.
You can also use this mode to run inference of an already-trained model in Python.
Append both the `--resume` and `--inference` to do this. Note that if you want to run
inference in Unity, you should use the
[Unity Inference Engine](Getting-started#Running-a-pre-trained-model).
If you've already trained a model using the specified `<run-identifier>` and `--resume` is not
specified, you will not be able to continue with training. Use `--force` to force ML-Agents to
overwrite the existing data.
### Command Line Training Options
In addition to passing the path of the Unity executable containing your training

training. Defaults to 0.
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
collect experiences from when training. Defaults to 1.
* `--run-id=<path>`: Specifies an identifier for each training run. This
* `--run-id=<run-identifier>`: Specifies an identifier for each training run. This
identifier is used to name the subdirectories in which the trained model and
summary statistics are saved as well as the saved model itself. The default id
is "ppo". If you use TensorBoard to view the training statistics, always set a

will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs
given to each instance from 0 to `num_envs - 1`. Default is 5005. __Note:__ When
training using the Editor rather than an executable, the base port will be ignored.
* `--train`: Specifies whether to train model or only run in inference mode.
When training, **always** use the `--train` option.
* `--load`: If set, the training code loads an already trained model to
* `--inference`: Specifies whether to only run in inference mode. Omit to train the model.
To load an existing model, specify a run-id and combine with `--resume`.
* `--resume`: If set, the training code loads an already trained model to
training). When not set (the default), the neural network weights are randomly
initialized and an existing model is not loaded.
training). This option only works when the models exist, and have the same behavior names
as the current agents in your scene.
* `--force`: Attempting to train a model with a run-id that has been used before will
throw an error. Use `--force` to force-overwrite this run-id's summary and model data.
* `--no-graphics`: Specify this option to run the Unity executable in
`-batchmode` and doesn't initialize the graphics driver. Use this only if your
training doesn't involve visual observations (reading from Pixels). See

98
docs/Training-Self-Play.md


# Training with Self-Play
ML-Agents provides the functionality to train symmetric, adversarial games with [Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are *equal* in form and function. In reinforcement learning,
this means both agents have the same observation and action spaces.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of itself
to provide a more stable, stationary learning environment. This is compared
to competing against its current self in every episode, which is a constantly changing opponent.
ML-Agents provides the functionality to train both symmetric and asymmetric adversarial games with
[Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are equal in form, function and objective. Examples of symmetric games
are our Tennis and Soccer example environments. In reinforcement learning, this means both agents have the same observation and
action spaces and learn from the same reward function and so *they can share the same policy*. In asymmetric games,
this is not the case. An example of an asymmetric game is Hide and Seek. Agents in these
types of games do not always have the same observation or action spaces and so sharing policy networks is not
necessarily ideal.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of its opponent
(which could be itself as in symmetric games) to provide a more stable, stationary learning environment. This is compared
to competing against the current, best opponent in every episode, which is constantly changing (because it's learning).
However, from the perspective of an individual agent, these scenarios appear to have non-stationary dynamics because the opponent is often changing.
This can cause significant issues in the experience replay mechanism used by SAC. Thus, we recommend that users use PPO. For further reading on
this issue in particular, see the paper [Stabilising Experience Replay for Deep Multi-Agent Reinforcement Learning](https://arxiv.org/pdf/1702.08887.pdf).
For more general information on training with ML-Agents, see [Training ML-Agents](Training-ML-Agents.md).
For more algorithm specific instruction, please see the documentation for [PPO](Training-PPO.md) or [SAC](Training-SAC.md).

See the trainer configuration and agent prefabs for our Tennis environment for an example.
***Team ID must be a non-negative integer (0 or greater).***
In symmetric games, since all agents (even on opposing teams) will share the same policy, they should have the same 'Behavior Name' in their
Behavior Parameters Script. In asymmetric games, they should have a different Behavior Name in their Behavior Parameters script.
Note, in asymmetric games, the agents must have both different Behavior Names *and* different team IDs! Then, specify the trainer configuration
for each Behavior Name in your scene as you would normally, and remember to include the self-play hyperparameter hierarchy!
For examples of how to use this feature, you can see the trainer configurations and agent prefabs for our Tennis and Soccer environments.
Tennis and Soccer provide examples of symmetric games. To train an asymmetric game, specify trainer configurations for each of your behavior names
and include the self-play hyperparameter hierarchy in both.
## Best Practices Training with Self-Play

Training against a set of slowly or unchanging adversaries with low diversity
results in a more stable learning process than training against a set of quickly
changing adversaries with high diversity. With this context, this guide discusses
the exposed self-play hyperparameters and intuitions for tuning them.
## Hyperparameters

### Save Steps
The `save_steps` parameter corresponds to the number of *trainer steps* between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
### Team Change
The `team_change` parameter corresponds to the number of *trainer_steps* between switching the learning team.
This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team
becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar
performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents
per team switch.
A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents,
the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies
and so the agent may fail against the next batch of opponents.
The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we
recommend setting this value as a function of the `save_steps` parameter discussed previously.
Recommended Range : 4x-10x where x=`save_steps`
The `swap_steps` parameter corresponds to the number of *trainer steps* between swapping the opponents policy with a different snapshot. As in the `save_steps` discussion, note that trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
The `swap_steps` parameter corresponds to the number of *ghost steps* (not trainer steps) between swapping the opponents policy with a different snapshot.
A 'ghost step' refers to a step taken by an agent *that is following a fixed policy and not learning*. The reason for this distinction is that in asymmetric games,
we may have teams with an unequal number of agents, e.g. a 2v1 scenario. The team with two agents collects
twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number
of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if
a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents`
agents during `team_change` total steps is:
```
swap_steps = (num_agents / num_opponent_agents) * (team_change / x)
```
As an example, in a 2v1 scenario, if we want the swap to occur `x=4` times during `team_change=200000` steps,
the `swap_steps` for the team of one agent is:
```
swap_steps = (1 / 2) * (200000 / 4) = 25000
```
The `swap_steps` for the team of two agents is:
```
swap_steps = (2 / 1) * (200000 / 4) = 100000
```
Note, with equal team sizes, the first term is equal to 1 and `swap_steps` can be calculated by just dividing the total steps by the desired number of swaps.
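For clarity, here is a minimal Python sketch of this arithmetic. The helper name `compute_swap_steps` is illustrative only and is not part of the ML-Agents API:

```python
def compute_swap_steps(num_agents, num_opponent_agents, team_change, desired_swaps):
    """Compute swap_steps so that a team sees `desired_swaps` opponent swaps per team change."""
    return (num_agents / num_opponent_agents) * (team_change / desired_swaps)

# The 2v1 example above: x=4 swaps during team_change=200000 steps.
print(compute_swap_steps(1, 2, 200000, 4))  # 25000.0 for the single-agent team
print(compute_swap_steps(2, 1, 200000, 4))  # 100000.0 for the two-agent team
```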
### Play against current self ratio
### Play against latest model ratio
The `play_against_current_self_ratio` parameter corresponds to the probability
an agent will play against its ***current*** self. With probability
1 - `play_against_current_self_ratio`, the agent will play against a snapshot of itself
from a past iteration.
The `play_against_latest_model_ratio` parameter corresponds to the probability
an agent will play against the latest opponent policy. With probability
1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its
opponent from a past iteration.
A larger value of `play_against_current_self_ratio` indicates that an agent will be playing against itself more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
Recommended Range : 0.0 - 1.0
Range : 0.0 - 1.0
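A rough sketch of how such a ratio is typically applied when selecting an opponent, mirroring the `np.random.uniform()` check in the ghost trainer later in this changeset; the snapshot-sampling detail is illustrative only:

```python
import numpy as np

play_against_latest_model_ratio = 0.5
if np.random.uniform() < (1 - play_against_latest_model_ratio):
    opponent = "a past snapshot sampled from the saved window"
else:
    opponent = "the latest opponent policy"
print(opponent)
```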
### Window

In adversarial games, the cumulative environment reward may not be a meaningful metric by which to track learning progress. This is because cumulative reward is entirely dependent on the skill of the opponent. An agent at a particular skill level will get more or less reward against a worse or better agent, respectively.
We provide an implementation of the ELO rating system, a method for calculating the relative skill level between two players from a given population in a zero-sum game. For more information on ELO, please see [the ELO wiki](https://en.wikipedia.org/wiki/Elo_rating_system).
In a proper training run, the ELO of the agent should steadily increase. The absolute value of the ELO is less important than the change in ELO over training iterations.
Note, this implementation will support any number of teams, but ELO is only applicable to games with two teams. It is ongoing work to implement
a reliable metric for measuring progress in scenarios with three or more teams. These scenarios can still train, though as of now, reward and qualitative observations
are the only metrics by which we can judge performance.
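For intuition, here is a minimal sketch of the expected-score calculation behind such a rating update, mirroring the form of the `compute_elo_rating_changes` helper that appears later in this changeset; the ratings used are illustrative only:

```python
def expected_score(rating1, rating2):
    """Expected score of player 1 against player 2 under the ELO model."""
    r1 = pow(10, rating1 / 400)
    r2 = pow(10, rating2 / 400)
    return r1 / (r1 + r2)

# A win (result = 1.0) against an equally rated opponent:
result = 1.0
change = result - expected_score(1200.0, 1200.0)
print(change)  # 0.5 -- the winner's ELO rises and the opponent's falls by the same amount
```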

9
docs/Using-Docker.md


-p 5005:5005 \
-p 6006:6006 \
<image-name>:latest \
--docker-target-name=unity-volume \
<trainer-config-file> \
--env=<environment-name> \
--train \

- `source`: Reference to the path in your host OS where you will store the Unity
executable.
- `target`: Tells Docker to mount the `source` path as a disk with this name.
- `docker-target-name`: Tells the ML-Agents Python package the name of the
disk where it can read the Unity executable and store the graph. **This should
therefore be identical to `target`.**
- `trainer-config-file`, `train`, `run-id`: ML-Agents arguments passed to
`mlagents-learn`. `trainer-config-file` is the filename of the trainer config
file, `train` trains the algorithm, and `run-id` is used to tag each

-p 5005:5005 \
-p 6006:6006 \
balance.ball.v0.1:latest 3DBall \
--docker-target-name=unity-volume \
trainer_config.yaml \
--env=3DBall \
/unity-volume/trainer_config.yaml \
--env=/unity-volume/3DBall \
--train \
--run-id=3dball_first_trial
```

7
docs/Using-Tensorboard.md


taken between two observations.
* `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
## Custom Metrics from C#
To get custom metrics from a C# environment into TensorBoard, you can use the StatsSideChannel:
```csharp
var statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
statsSideChannel.AddStat("MyMetric", 1.0);
```

2
gym-unity/README.md


The returned environment `env` will function as a gym.
For more on using the gym interface, see our
[Jupyter Notebook tutorial](../notebooks/getting-started-gym.ipynb).
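Below is a minimal usage sketch, assuming a built environment at `./3DBall` and the `UnityEnv` constructor flags shown in the tests elsewhere in this changeset; the path and flag values are illustrative only:

```python
from gym_unity.envs import UnityEnv

# Illustrative only: the executable path and flags depend on your build.
env = UnityEnv("./3DBall", worker_id=0, use_visual=False)
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```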
## Limitations

34
gym-unity/gym_unity/envs/__init__.py


import logging
import itertools
import numpy as np
from typing import Any, Dict, List, Optional, Tuple, Union

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs import logging_util
class UnityGymException(error.Error):

pass
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gym_unity")
logger = logging_util.get_logger(__name__)
logging_util.set_log_level(logging_util.INFO)
GymSingleStepResult = Tuple[np.ndarray, float, bool, Dict]
GymMultiStepResult = Tuple[List[np.ndarray], List[float], List[bool], Dict]

:param no_graphics: Whether to run the Unity simulator in no-graphics mode
:param allow_multiple_visual_obs: If True, return a list of visual observations instead of only one.
"""
base_port = 5005
base_port = UnityEnvironment.BASE_ENVIRONMENT_PORT
if environment_filename is None:
base_port = UnityEnvironment.DEFAULT_EDITOR_PORT

def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
n_extra_agents = step_result.n_agents() - self._n_agents
if n_extra_agents < 0 or n_extra_agents > self._n_agents:
if n_extra_agents < 0:
# or too many requested a decision
raise UnityGymException(
"The number of agents in the scene does not match the expected number."
)

# only cares about the ordering.
for index, agent_id in enumerate(step_result.agent_id):
if not self._previous_step_result.contains_agent(agent_id):
if step_result.done[index]:
# If the Agent is already done (e.g. it ended its episode twice in one step)
# Don't try to register it here.
continue
# Register this agent, and get the reward of the previous agent that
# was in its index, so that we can return it to the gym.
last_reward = self.agent_mapper.register_new_agent_id(agent_id)

"""
Declare the agent done with the corresponding final reward.
"""
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
if agent_id in self._agent_id_to_gym_index:
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
else:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
"""

self._gym_id_order = list(agent_ids)
def mark_agent_done(self, agent_id: int, reward: float) -> None:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
try:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
except ValueError:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
original_index = self._gym_id_order.index(-1)

48
gym-unity/gym_unity/tests/test_gym.py


assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_new_agent_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=3)
mock_step.agent_id = np.array(range(5))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=True)
received_step_result = create_mock_vector_step_result(num_agents=7)
received_step_result.agent_id = np.array(range(7))
# agent #3 (id = 2) is Done
# so is the "new" agent (id = 5)
done = [False] * 7
done[2] = True
done[5] = True
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id):
assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_single_agent_multiple_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=1)
mock_step.agent_id = np.array(range(1))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=False)
received_step_result = create_mock_vector_step_result(num_agents=3)
received_step_result.agent_id = np.array(range(3))
# original agent (id = 0) is Done
# so is the "new" agent (id = 1)
done = [True, True, False]
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([2], sanitized_result.agent_id):
assert expected_agent_id == agent_id
# Helper methods

# Mark some agents as done with their last rewards.
mapper.mark_agent_done(1001, 42.0)
mapper.mark_agent_done(1004, 1337.0)
# Make sure we can handle an unknown agent id being marked done.
# This can happen when an agent ends an episode on the same step it starts.
mapper.mark_agent_done(9999, -1.0)
# Now add new agents, and get the rewards of the agent they replaced.
old_reward1 = mapper.register_new_agent_id(2001)

2
ml-agents-envs/mlagents_envs/communicator.py


"""
Python side of the communication. Must be used in pair with the right Unity Communicator equivalent.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
"""
def initialize(self, inputs: UnityInputProto) -> UnityOutputProto:

117
ml-agents-envs/mlagents_envs/environment.py


import atexit
import glob
import uuid
import logging
import numpy as np
import os
import subprocess

from mlagents_envs.logging_util import get_logger
from mlagents_envs.side_channel.side_channel import SideChannel, IncomingMessage
from mlagents_envs.base_env import (

import struct
logger = logging.getLogger("mlagents_envs")
logger = get_logger(__name__)
class UnityEnvironment(BaseEnv):

# isn't specified, this port will be used.
DEFAULT_EDITOR_PORT = 5004
# Default base port for environments. Each environment will be offset from this
# by its worker_id.
BASE_ENVIRONMENT_PORT = 5005
# Command line argument used to pass the port to the executable environment.
PORT_COMMAND_LINE_ARG = "--mlagents-port"

worker_id: int = 0,
base_port: int = 5005,
base_port: Optional[int] = None,
docker_training: bool = False,
no_graphics: bool = False,
timeout_wait: int = 60,
args: Optional[List[str]] = None,

:string file_name: Name of Unity environment binary.
:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
:bool docker_training: Informs this class whether the process is being run within a container.
If no environment is specified (i.e. file_name is None), the DEFAULT_EDITOR_PORT will be used.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:bool no_graphics: Whether to run the Unity simulator in no-graphics mode
:int timeout_wait: Time (in seconds) to wait for connection from environment.
:list args: Additional Unity command line arguments

atexit.register(self._close)
# If base port is not specified, use BASE_ENVIRONMENT_PORT if we have
# an environment, otherwise DEFAULT_EDITOR_PORT
if base_port is None:
base_port = (
self.BASE_ENVIRONMENT_PORT if file_name else self.DEFAULT_EDITOR_PORT
)
self.port = base_port + worker_id
self._buffer_size = 12000
# If true, this means the environment was successfully loaded

"the worker-id must be 0 in order to connect with the Editor."
)
if file_name is not None:
self.executable_launcher(file_name, docker_training, no_graphics, args)
self.executable_launcher(file_name, no_graphics, args)
else:
logger.info(
f"Listening on port {self.port}. "

aca_output = self.send_academy_parameters(rl_init_parameters_in)
aca_params = aca_output.rl_initialization_output
except UnityTimeOutException:
self._close()
self._close(0)
self._close()
self._close(0)
raise UnityEnvironmentException(
f"The communication API version is not compatible between Unity and python. "
f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_communicator_version}.\n "

launch_string = candidates[0]
return launch_string
def executable_launcher(self, file_name, docker_training, no_graphics, args):
def executable_launcher(self, file_name, no_graphics, args):
self._close()
self._close(0)
raise UnityEnvironmentException(
f"Couldn't launch the {file_name} environment. Provided filename does not match any environments."
)

if not docker_training:
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [
UnityEnvironment.PORT_COMMAND_LINE_ARG,
str(self.port),
]
subprocess_args += args
try:
self.proc1 = subprocess.Popen(
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
# but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
)
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
else:
# Comments for future maintenance:
# xvfb-run is a wrapper around Xvfb, a virtual xserver where all
# rendering is done to virtual memory. It automatically creates a
# new virtual server, picking a server number via `auto-servernum`.
# The server is passed the arguments using `server-args`; we are telling
# Xvfb to create screen number 0 with width 640, height 480 and depth 24 bits.
# Note that 640 X 480 are the default width and height. The main reason for
# us to add this is because we'd like to change the depth from the default
# of 8 bits to 24.
# Unfortunately, this means that we will need to pass the arguments through
# a shell which is why we set `shell=True`. Now, this adds its own
# complications. E.g. SIGINT can bounce off the shell and not get propagated
# to the child processes. This is why we add `exec`, so that the shell gets
# launched, the arguments are passed to `xvfb-run`. `exec` replaces the shell
# we created with `xvfb`.
#
docker_ls = (
f"exec xvfb-run --auto-servernum --server-args='-screen 0 640x480x24'"
f" {launch_string} {UnityEnvironment.PORT_COMMAND_LINE_ARG} {self.port}"
)
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
subprocess_args += args
try:
docker_ls,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True,
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
# but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
def _update_group_specs(self, output: UnityOutputProto) -> None:
init_output = output.rl_initialization_output

else:
raise UnityEnvironmentException("No Unity environment is loaded.")
def _close(self):
def _close(self, timeout: Optional[int] = None) -> None:
"""
Close the communicator and environment subprocess (if necessary).
:int timeout: [Optional] Number of seconds to wait for the environment to shut down before
force-killing it. Defaults to `self.timeout_wait`.
"""
if timeout is None:
timeout = self.timeout_wait
self.proc1.wait(timeout=self.timeout_wait)
self.proc1.wait(timeout=timeout)
signal_name = self.returncode_to_signal_name(self.proc1.returncode)
signal_name = f" ({signal_name})" if signal_name else ""
return_info = f"Environment shut down with return code {self.proc1.returncode}{signal_name}."

3
ml-agents-envs/mlagents_envs/mock_communicator.py


):
"""
Python side of the grpc communication. Python is the client and Unity the server
:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
"""
super().__init__()
self.is_discrete = discrete_action

3
ml-agents-envs/mlagents_envs/rpc_communicator.py


:int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this.
:int worker_id: Number to add to communication port (5005) [0]. Used for asynchronous agent scenarios.
:int worker_id: Offset from base_port. Used for training multiple environments simultaneously.
:int timeout_wait: Timeout (in seconds) to wait for a response before exiting.
"""
super().__init__(worker_id, base_port)
self.port = base_port + worker_id

4
ml-agents-envs/mlagents_envs/side_channel/outgoing_message.py


from typing import List
import struct
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger(__name__)
logger = get_logger(__name__)
class OutgoingMessage:

4
ml-agents-envs/mlagents_envs/side_channel/side_channel.py


from abc import ABC, abstractmethod
from typing import List
import uuid
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger(__name__)
logger = get_logger(__name__)
class SideChannel(ABC):

23
ml-agents-envs/mlagents_envs/tests/test_envs.py


env.close()
@pytest.mark.parametrize(
"base_port,file_name,expected",
[
# Non-None base port value will always be used
(6001, "foo.exe", 6001),
# No port specified and environment specified, so use BASE_ENVIRONMENT_PORT
(None, "foo.exe", UnityEnvironment.BASE_ENVIRONMENT_PORT),
# No port specified and no environment, so use DEFAULT_EDITOR_PORT
(None, None, UnityEnvironment.DEFAULT_EDITOR_PORT),
],
)
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_port_defaults(
mock_communicator, mock_launcher, base_port, file_name, expected
):
mock_communicator.return_value = MockCommunicator(
discrete_action=False, visual_inputs=0
)
env = UnityEnvironment(file_name=file_name, worker_id=0, base_port=base_port)
assert expected == env.port
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_reset(mock_communicator, mock_launcher):

5
ml-agents/mlagents/model_serialization.py


from distutils.util import strtobool
import os
import logging
from typing import Any, List, Set, NamedTuple
from distutils.version import LooseVersion

from tensorflow.python.platform import gfile
from tensorflow.python.framework import graph_util
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):

logger = get_logger(__name__)
logger = logging.getLogger("mlagents.trainers")
POSSIBLE_INPUT_NODES = frozenset(
[

52
ml-agents/mlagents/trainers/agent_processor.py


import sys
from typing import List, Dict, TypeVar, Generic, Tuple, Set
from typing import List, Dict, TypeVar, Generic, Tuple, Any
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy

for _entropy in take_action_outputs["entropy"]:
self.stats_reporter.add_stat("Policy/Entropy", _entropy)
terminated_agents: Set[str] = set()
# Make unique agent_ids that are global across workers
action_global_agent_ids = [
get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids

stored_take_action_outputs = self.last_take_action_outputs.get(
global_id, None
)
if stored_agent_step is not None and stored_take_action_outputs is not None:
# We know the step is from the same worker, so use the local agent id.
obs = stored_agent_step.obs

traj_queue.put(trajectory)
self.experience_buffers[global_id] = []
if curr_agent_step.done:
# Record episode length for agents which have had at least
# 1 step. Done after reset ignored.
terminated_agents.add(global_id)
elif not curr_agent_step.done:
self.episode_steps[global_id] += 1

batched_step_result.agent_id_to_index[_id],
)
for terminated_id in terminated_agents:
self._clean_agent_data(terminated_id)
# Delete all done agents, regardless of whether they had a 0-length episode.
if curr_agent_step.done:
self._clean_agent_data(global_id)
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

"""
Removes the data for an Agent.
"""
del self.experience_buffers[global_id]
del self.last_take_action_outputs[global_id]
del self.last_step_result[global_id]
del self.episode_steps[global_id]
del self.episode_rewards[global_id]
self._safe_delete(self.experience_buffers, global_id)
self._safe_delete(self.last_take_action_outputs, global_id)
self._safe_delete(self.last_step_result, global_id)
self._safe_delete(self.episode_steps, global_id)
self._safe_delete(self.episode_rewards, global_id)
def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None:
"""
Safely removes data from a dictionary. If the key is not found,
do nothing.
"""
if key in my_dictionary:
del my_dictionary[key]
def publish_trajectory_queue(
self, trajectory_queue: "AgentManagerQueue[Trajectory]"
) -> None:

self.behavior_id
)
self.publish_trajectory_queue(self.trajectory_queue)
def record_environment_stats(
self, env_stats: Dict[str, Tuple[float, StatsAggregationMethod]], worker_id: int
) -> None:
"""
Pass stats from the environment to the StatsReporter.
Depending on the StatsAggregationMethod, either StatsReporter.add_stat or StatsReporter.set_stat is used.
The worker_id is used to determine whether StatsReporter.set_stat should be used.
:param env_stats:
:param worker_id:
:return:
"""
for stat_name, (val, agg_type) in env_stats.items():
if agg_type == StatsAggregationMethod.AVERAGE:
self.stats_reporter.add_stat(stat_name, val)
elif agg_type == StatsAggregationMethod.MOST_RECENT:
# In order to prevent conflicts between multiple environments,
# only stats from the first environment are recorded.
if worker_id == 0:
self.stats_reporter.set_stat(stat_name, val)

52
ml-agents/mlagents/trainers/behavior_id_utils.py


from typing import Dict, NamedTuple
from typing import NamedTuple
from urllib.parse import urlparse, parse_qs
name_behavior_id: str
"""
BehaviorIdentifiers is a named tuple of the identifiers that uniquely distinguish
an agent encountered in the trainer_controller. The named tuple consists of the
fully qualified behavior name, the brain name (which corresponds to the trainer
in the trainer controller) and the team id. In the future, this can be extended
to support further identifiers.
"""
behavior_id: str
behavior_ids: Dict[str, int]
team_id: int
Parses a name_behavior_id of the form name?team=0&param1=i&...
Parses a name_behavior_id of the form name?team=0
This allows you to access the brain name and distinguishing identifiers
without parsing more than once.
This allows you to access the brain name and team id of an agent
ids: Dict[str, int] = {}
if "?" in name_behavior_id:
name, identifiers = name_behavior_id.rsplit("?", 1)
if "&" in identifiers:
list_of_identifiers = identifiers.split("&")
else:
list_of_identifiers = [identifiers]
for identifier in list_of_identifiers:
key, value = identifier.split("=")
ids[key] = int(value)
else:
name = name_behavior_id
parsed = urlparse(name_behavior_id)
name = parsed.path
ids = parse_qs(parsed.query)
team_id: int = 0
if "team" in ids:
team_id = int(ids["team"][0])
name_behavior_id=name_behavior_id, brain_name=name, behavior_ids=ids
behavior_id=name_behavior_id, brain_name=name, team_id=team_id
def create_name_behavior_id(name: str, team_id: int) -> str:
"""
Reconstructs fully qualified behavior name from name and team_id
:param name: brain name
:param team_id: team ID
:return: name_behavior_id
"""
return name + "?team=" + str(team_id)
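# Illustrative usage sketch (not part of the diff): the parsing below only mirrors the
# urlparse/parse_qs logic above; the brain name "Striker" is a made-up example.
from urllib.parse import urlparse, parse_qs
name_behavior_id = "Striker?team=1"
parsed = urlparse(name_behavior_id)
brain_name = parsed.path                      # "Striker"
query = parse_qs(parsed.query)                # {"team": ["1"]}
team_id = int(query["team"][0]) if "team" in query else 0
# Reconstruct the fully qualified name the same way create_name_behavior_id does.
assert brain_name + "?team=" + str(team_id) == name_behavior_id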

5
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


import logging
from typing import Any, Dict, List
from collections import namedtuple
import numpy as np

from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
RewardSignalResult = namedtuple(
"RewardSignalResult", ["scaled_reward", "unscaled_reward"]

4
ml-agents/mlagents/trainers/curriculum.py


from .exception import CurriculumConfigError, CurriculumLoadingError
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class Curriculum:

38
ml-agents/mlagents/trainers/distributions.py


act_size: List[int],
reparameterize: bool = False,
tanh_squash: bool = False,
condition_sigma: bool = True,
log_sigma_min: float = -20,
log_sigma_max: float = 2,
):

:param log_sigma_max: Maximum log standard deviation to clip by.
"""
encoded = self._create_mu_log_sigma(
logits, act_size, log_sigma_min, log_sigma_max
logits,
act_size,
log_sigma_min,
log_sigma_max,
condition_sigma=condition_sigma,
)
self._sampled_policy = self._create_sampled_policy(encoded)
if not reparameterize:

act_size: List[int],
log_sigma_min: float,
log_sigma_max: float,
condition_sigma: bool,
) -> "GaussianDistribution.MuSigmaTensors":
mu = tf.layers.dense(

reuse=tf.AUTO_REUSE,
)
# Policy-dependent log_sigma_sq
log_sigma = tf.layers.dense(
logits,
act_size[0],
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
if condition_sigma:
# Policy-dependent log_sigma_sq
log_sigma = tf.layers.dense(
logits,
act_size[0],
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
else:
log_sigma = tf.get_variable(
"log_std",
[act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
sigma = tf.exp(log_sigma)
return self.MuSigmaTensors(mu, log_sigma, sigma)

self, encoded: "GaussianDistribution.MuSigmaTensors"
) -> tf.Tensor:
single_dim_entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
)
# Make entropy the right shape
return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy

Adjust probabilities for squashed sample before output
"""
probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
return probs
adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON)
return adjusted_probs
@property
def total_log_probs(self) -> tf.Tensor:

15
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import logging
from typing import List, Dict, NamedTuple, Iterable
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class EnvironmentStep(NamedTuple):

environment_stats: Dict[str, Tuple[float, StatsAggregationMethod]]
@property
def name_behavior_ids(self) -> Iterable[AgentGroup]:

def empty(worker_id: int) -> "EnvironmentStep":
return EnvironmentStep({}, worker_id, {})
return EnvironmentStep({}, worker_id, {}, {})
class EnvManager(ABC):

step_info.brain_name_to_action_info.get(
name_behavior_id, ActionInfo.empty()
),
)
self.agent_managers[name_behavior_id].record_environment_stats(
step_info.environment_stats, step_info.worker_id
)
return len(step_infos)

439
ml-agents/mlagents/trainers/ghost/trainer.py


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (Ghost Trainer)
from typing import Deque, Dict, List, Any, cast
from typing import Deque, Dict, List, cast
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.stats import StatsPropertyType
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.behavior_id_utils import (
BehaviorIdentifiers,
create_name_behavior_id,
)
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
"""
The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism.
In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is
"ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps
a standard RL trainer which trains the learning team and ensures that only the trajectories collected
by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used
as the fixed policies when the team is not learning. Like the other trainers, the GhostTrainer is 1:1 with
brain_names and is responsible for one or more teams. Note, a GhostTrainer can have only one team in
asymmetric games where there is only one team with a particular behavior, e.g. Hide and Seek.
The GhostController manages high level coordination between multiple ghost trainers. The learning team id
is cycled throughout a training run.
"""
self, trainer, brain_name, reward_buff_cap, trainer_parameters, training, run_id
self,
trainer,
brain_name,
controller,
reward_buff_cap,
trainer_parameters,
training,
run_id,
Responsible for collecting experiences and training trainer model via self_play.
Creates a GhostTrainer.
:param controller: GhostController that coordinates all ghost trainers and calculates ELO
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param training: Whether the trainer is set for training.

)
self.trainer = trainer
self.controller = controller
self.internal_policy_queues: List[AgentManagerQueue[Policy]] = []
self.internal_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.ignored_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.learning_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._team_to_name_to_policy_queue: Dict[
int, Dict[str, AgentManagerQueue[Policy]]
] = {}
self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}
# assign ghost's stats collection to wrapped trainer's
self._stats_reporter = self.trainer.stats_reporter

self_play_parameters = trainer_parameters["self_play"]
self.window = self_play_parameters.get("window", 10)
self.play_against_current_self_ratio = self_play_parameters.get(
"play_against_current_self_ratio", 0.5
self.play_against_latest_model_ratio = self_play_parameters.get(
"play_against_latest_model_ratio", 0.5
if (
self.play_against_latest_model_ratio > 1.0
or self.play_against_latest_model_ratio < 0.0
):
logger.warning(
"The play_against_latest_model_ratio is not between 0 and 1."
)
self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \
opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \
asymmetric game.".format(
self.brain_name
)
)
# Counts the number of steps taken by the ghost policies. Snapshot swapping
# depends on this counter whereas snapshot saving and team switching depend
# on the wrapped trainer's step count. This ensures that all teams train for the same number of trainer
# steps.
self.ghost_step: int = 0
# A list of dicts from brain name to a single snapshot for this trainer's policies
self.policy_snapshots: List[Dict[str, List[float]]] = []
# A dict from brain name to the current snapshot of this trainer's policies
self.current_policy_snapshot: Dict[str, List[float]] = {}
self.snapshot_counter: int = 0
self.policy_snapshots: List[Any] = []
self.snapshot_counter: int = 0
self.learning_behavior_name: str = None
self.current_policy_snapshot = None
self.last_save = 0
self.last_swap = 0
# wrapped_training_team and learning team need to be separate
# in the situation where new agents are created or destroyed
# after the learning team switches. These agents need to be added
# to trainers properly.
self._learning_team: int = None
self.wrapped_trainer_team: int = None
self.last_save: int = 0
self.last_swap: int = 0
self.last_team_change: int = 0
self.current_elo: float = self.initial_elo
self.policy_elos: List[float] = [self.initial_elo] * (
self.window + 1
) # for learning policy

def get_step(self) -> int:
"""
Returns the number of steps the trainer has performed
:return: the step count of the trainer
Returns the number of steps the wrapped trainer has performed
:return: the step count of the wrapped trainer
"""
return self.trainer.get_step

"""
return self.trainer.reward_buffer
@property
def current_elo(self) -> float:
"""
Gets ELO of current policy which is always last in the list
:return: ELO of current policy
"""
return self.policy_elos[-1]
def change_current_elo(self, change: float) -> None:
"""
Changes elo of current policy which is always last in the list
:param change: Amount to change current elo by
"""
self.policy_elos[-1] += change
def get_opponent_elo(self) -> float:
"""
Get elo of current opponent policy
:return: ELO of current opponent policy
"""
return self.policy_elos[self.current_opponent]
def change_opponent_elo(self, change: float) -> None:
"""
Changes elo of current opponent policy
:param change: Amount to change current opponent elo by
"""
self.policy_elos[self.current_opponent] -= change
if trajectory.done_reached and not trajectory.max_step_reached:
# Assumption is that final reward is 1/.5/0 for win/draw/loss
"""
Determines the final result of an episode and asks the GhostController
to calculate the ELO change. The GhostController changes the ELO
of the opponent policy since this may be in a different GhostTrainer
i.e. in asymmetric games. We assume the last reward determines the winner.
:param trajectory: Trajectory.
"""
if trajectory.done_reached:
# Assumption is that final reward is >0/0/<0 for win/draw/loss
final_reward = trajectory.steps[-1].reward
result = 0.5
if final_reward > 0:

change = compute_elo_rating_changes(
self.current_elo, self.policy_elos[self.current_opponent], result
change = self.controller.compute_elo_rating_changes(
self.current_elo, result
self.current_elo += change
self.policy_elos[self.current_opponent] -= change
opponents = np.array(self.policy_elos, dtype=np.float32)
self.change_current_elo(change)
self._stats_reporter.add_stat(
"Self-play/Mean Opponent ELO", opponents.mean()
)
self._stats_reporter.add_stat("Self-play/Std Opponent ELO", opponents.std())
for traj_queue, internal_traj_queue in zip(
self.trajectory_queues, self.internal_trajectory_queues
):
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
for trajectory_queue in self.trajectory_queues:
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self._learning_team:
# With a future multiagent trainer, this will be indexed by 'role'
internal_trajectory_queue = self._internal_trajectory_queues[
parsed_behavior_id.brain_name
]
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# adds to wrapped trainers queue
internal_trajectory_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
else:
# Dump trajectories from non-learning policy
try:
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# count ghost steps
self.ghost_step += len(t.steps)
except AgentManagerQueue.Empty:
pass
self.trainer.advance()
if self.get_step - self.last_team_change > self.steps_to_train_team:
self.controller.change_training_team(self.get_step)
self.last_team_change = self.get_step
self.trainer.advance()
next_learning_team = self.controller.get_learning_team
for internal_q in self.internal_policy_queues:
# Get policies that correspond to the policy queue in question
# CASE 1: Current learning team is managed by this GhostTrainer.
# If the learning team changes, the following loop over queues will push the
# new policy into the policy queue for the new learning agent if
# that policy is managed by this GhostTrainer. Otherwise, it will save the current snapshot.
# CASE 2: Current learning team is managed by a different GhostTrainer.
# If the learning team changes to a team managed by this GhostTrainer, this loop
# will push the current_snapshot into the correct queue. Otherwise,
# it will continue skipping and swap_snapshot will continue to handle
# pushing fixed snapshots
# Case 3: No team change. The if statement just continues to push the policy
# into the correct queue (or not if not learning team).
for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
policy = cast(TFPolicy, internal_q.get_nowait())
self.current_policy_snapshot = policy.get_weights()
self.learning_policy_queues[internal_q.behavior_id].put(policy)
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
self.current_policy_snapshot[brain_name] = policy.get_weights()
if next_learning_team in self._team_to_name_to_policy_queue:
name_to_policy_queue = self._team_to_name_to_policy_queue[
next_learning_team
]
if brain_name in name_to_policy_queue:
behavior_id = create_name_behavior_id(
brain_name, next_learning_team
)
policy = self.get_policy(behavior_id)
policy.load_weights(self.current_policy_snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
# Note save and swap should be on different step counters.
# We don't want to save unless the policy is learning.
self._save_snapshot(self.trainer.policy)
self._save_snapshot()
if self.get_step - self.last_swap > self.steps_between_swap:
if (
self._learning_team != next_learning_team
or self.ghost_step - self.last_swap > self.steps_between_swap
):
self._learning_team = next_learning_team
self.last_swap = self.get_step
# Dump trajectories from non-learning policy
for traj_queue in self.ignored_trajectory_queues:
try:
for _ in range(traj_queue.maxlen):
traj_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
self.last_swap = self.ghost_step
"""
Forwarding call to wrapped trainers end_episode
"""
"""
Forwarding call to wrapped trainers save_model
"""
self.trainer.export_model(name_behavior_id)
"""
Forwarding call to wrapped trainers export_model.
First loads the current snapshot.
"""
parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
brain_name = parsed_behavior_id.brain_name
policy = self.trainer.get_policy(brain_name)
policy.load_weights(self.current_policy_snapshot[brain_name])
self.trainer.export_model(brain_name)
"""
Creates policy with the wrapped trainer's create_policy function
"""
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
Adds policy to trainer. For the first policy added, add a trainer
to the policy and set the learning behavior name to name_behavior_id.
Adds policy to trainer. The first policy encountered sets the wrapped
trainer team. This is to ensure that all agents from the same multi-agent
team are grouped. All policies associated with this team are added to the
wrapped trainer to be trained.
name_behavior_id = parsed_behavior_id.behavior_id
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)
# First policy encountered
if not self.learning_behavior_name:
weights = policy.get_weights()
self.current_policy_snapshot = weights
self.trainer.add_policy(name_behavior_id, policy)
self._save_snapshot(policy) # Need to save after trainer initializes policy
self.learning_behavior_name = name_behavior_id
behavior_id_parsed = BehaviorIdentifiers.from_name_behavior_id(
self.learning_behavior_name
)
team_id = behavior_id_parsed.behavior_ids["team"]
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM, team_id)
else:
# for saving/swapping snapshots
policy.init_load_weights()
self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
# for saving/swapping snapshots
policy.init_load_weights()
# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
self.current_policy_snapshot[
parsed_behavior_id.brain_name
] = policy.get_weights()
self._save_snapshot() # Need to save after trainer initializes policy
self.trainer.add_policy(parsed_behavior_id, policy)
self._learning_team = self.controller.get_learning_team
self.wrapped_trainer_team = team_id
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name
:return: Policy associated with name_behavior_id
"""
def _save_snapshot(self, policy: TFPolicy) -> None:
weights = policy.get_weights()
try:
self.policy_snapshots[self.snapshot_counter] = weights
except IndexError:
self.policy_snapshots.append(weights)
def _save_snapshot(self) -> None:
"""
Saves a snapshot of the current weights of the policy and maintains the policy_snapshots
according to the window size
"""
for brain_name in self.current_policy_snapshot:
current_snapshot_for_brain_name = self.current_policy_snapshot[brain_name]
try:
self.policy_snapshots[self.snapshot_counter][
brain_name
] = current_snapshot_for_brain_name
except IndexError:
self.policy_snapshots.append(
{brain_name: current_snapshot_for_brain_name}
)
for q in self.policy_queues:
name_behavior_id = q.behavior_id
# here is the place for a sampling protocol
if name_behavior_id == self.learning_behavior_name:
"""
Swaps the appropriate weight to the policy and pushes it to respective policy queues
"""
for team_id in self._team_to_name_to_policy_queue:
if team_id == self._learning_team:
elif np.random.uniform() < (1 - self.play_against_current_self_ratio):
elif np.random.uniform() < (1 - self.play_against_latest_model_ratio):
self.policy_elos[-1] = self.current_elo
logger.debug(
"Step {}: Swapping snapshot {} to id {} with {} learning".format(
self.get_step, x, name_behavior_id, self.learning_behavior_name
name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
for brain_name in self._team_to_name_to_policy_queue[team_id]:
behavior_id = create_name_behavior_id(brain_name, team_id)
policy = self.get_policy(behavior_id)
policy.load_weights(snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
logger.debug(
"Step {}: Swapping snapshot {} to id {} with team {} learning".format(
self.ghost_step, x, behavior_id, self._learning_team
)
)
policy = self.get_policy(name_behavior_id)
policy.load_weights(snapshot)
q.put(policy)
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer
makes a policy update. Creates an internal policy queue for the wrapped
trainer to push to. The GhostTrainer pushes all policies to the env.
if policy_queue.behavior_id == self.learning_behavior_name:
parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
try:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][
parsed_behavior_id.brain_name
] = policy_queue
except KeyError:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id] = {
parsed_behavior_id.brain_name: policy_queue
}
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
policy_queue.behavior_id
parsed_behavior_id.brain_name
self.internal_policy_queues.append(internal_policy_queue)
self.learning_policy_queues[policy_queue.behavior_id] = policy_queue
self._internal_policy_queues[
parsed_behavior_id.brain_name
] = internal_policy_queue
self.trainer.publish_policy_queue(internal_policy_queue)
def subscribe_trajectory_queue(

Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from.
Adds a trajectory queue for every member of the team to the list of queues for the trainer
to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from
the learning team. The wrapped trainer subscribes to this queue.
if trajectory_queue.behavior_id == self.learning_behavior_name:
super().subscribe_trajectory_queue(trajectory_queue)
super().subscribe_trajectory_queue(trajectory_queue)
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
] = AgentManagerQueue(trajectory_queue.behavior_id)
] = AgentManagerQueue(parsed_behavior_id.brain_name)
self.internal_trajectory_queues.append(internal_trajectory_queue)
self._internal_trajectory_queues[
parsed_behavior_id.brain_name
] = internal_trajectory_queue
else:
self.ignored_trajectory_queues.append(trajectory_queue)
# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
# ELO calculation
def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
r1 = pow(10, rating1 / 400)
r2 = pow(10, rating2 / 400)
summed = r1 + r2
e1 = r1 / summed
change = result - e1
return change

145
ml-agents/mlagents/trainers/learn.py


# # Unity ML-Agents Toolkit
import logging
import glob
import shutil
import numpy as np
import json

from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import load_config, TrainerFactory
from mlagents.trainers.trainer_util import (
load_config,
TrainerFactory,
handle_existing_directories,
)
from mlagents.trainers.stats import (
TensorboardWriter,
CSVWriter,

from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.timers import hierarchical_timer, get_timer_tree
from mlagents.logging_util import create_logger
from mlagents_envs import logging_util
logger = logging_util.get_logger(__name__)
def _create_parser():

default=False,
dest="load_model",
action="store_true",
help="Whether to load the model or randomly initialize",
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(
"--resume",
default=False,
dest="resume",
action="store_true",
help="Resumes training from a checkpoint. Specify a --run-id to use this option.",
)
argparser.add_argument(
"--force",
default=False,
dest="force",
action="store_true",
help="Force-overwrite existing models and summaries for a run-id that has been used "
"before.",
)
argparser.add_argument(
"--run-id",

default=False,
dest="train_model",
action="store_true",
help="Whether to train model, or only run inference",
help=argparse.SUPPRESS,
)
argparser.add_argument(
"--inference",
default=False,
dest="inference",
action="store_true",
help="Run in Python inference mode (don't train). Use with --resume to load a model trained with an "
"existing run-id.",
default=5005,
default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
type=int,
help="Base port for environment communication",
)

type=int,
help="Number of parallel environments to use for training",
)
argparser.add_argument(
"--docker-target-name",
default=None,
dest="docker_target_name",
help="Docker volume to store training-specific files",
)
argparser.add_argument(
"--no-graphics",

env_path: Optional[str] = parser.get_default("env_path")
run_id: str = parser.get_default("run_id")
load_model: bool = parser.get_default("load_model")
resume: bool = parser.get_default("resume")
force: bool = parser.get_default("force")
inference: bool = parser.get_default("inference")
save_freq: int = parser.get_default("save_freq")
keep_checkpoints: int = parser.get_default("keep_checkpoints")
base_port: int = parser.get_default("base_port")

no_graphics: bool = parser.get_default("no_graphics")
multi_gpu: bool = parser.get_default("multi_gpu")
sampler_config: Optional[Dict] = None
docker_target_name: Optional[str] = parser.get_default("docker_target_name")
env_args: Optional[List[str]] = parser.get_default("env_args")
cpu: bool = parser.get_default("cpu")
width: int = parser.get_default("width")

configs loaded from files.
"""
argparse_args = vars(args)
docker_target_name = argparse_args["docker_target_name"]
if docker_target_name is not None:
trainer_config_path = f"/{docker_target_name}/{trainer_config_path}"
if curriculum_config_path is not None:
curriculum_config_path = (
f"/{docker_target_name}/{curriculum_config_path}"
)
argparse_args["trainer_config"] = load_config(trainer_config_path)
if curriculum_config_path is not None:
argparse_args["curriculum_config"] = load_config(curriculum_config_path)

)
# Keep deprecated --load working, TODO: remove
argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]
# Since argparse accepts file paths in the config options which don't exist in CommandLineOptions,
# these keys will need to be deleted to use the **/splat operator below.
argparse_args.pop("sampler_file_path")

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
# Recognize and use docker volume if one is passed as an argument
if not options.docker_target_name:
model_path = f"./models/{options.run_id}"
summaries_dir = "./summaries"
else:
model_path = f"/{options.docker_target_name}/models/{options.run_id}"
summaries_dir = f"/{options.docker_target_name}/summaries"
model_path = f"./models/{options.run_id}"
summaries_dir = "./summaries"
port = options.base_port
# Configure CSV, Tensorboard Writers and StatsReporter

"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(summaries_dir)
handle_existing_directories(
model_path, summaries_dir, options.resume, options.force
)
tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
port = UnityEnvironment.DEFAULT_EDITOR_PORT
env_factory = create_environment_factory(
options.env_path,
options.docker_target_name,
options.no_graphics,
run_seed,
port,
options.env_args,
options.env_path, options.no_graphics, run_seed, port, options.env_args
)
engine_config = EngineConfig(
options.width,

options.run_id,
model_path,
options.keep_checkpoints,
options.train_model,
options.load_model,
not options.inference,
options.resume,
run_seed,
maybe_meta_curriculum,
options.multi_gpu,

options.run_id,
options.save_freq,
maybe_meta_curriculum,
options.train_model,
not options.inference,
run_seed,
sampler_manager,
resampling_interval,

with open(timing_path, "w") as f:
json.dump(get_timer_tree(), f, indent=4)
except FileNotFoundError:
logging.warning(
logger.warning(
f"Unable to save to {timing_path}. Make sure the directory exists"
)

return meta_curriculum
def prepare_for_docker_run(docker_target_name, env_path):
for f in glob.glob(
"/{docker_target_name}/*".format(docker_target_name=docker_target_name)
):
if env_path in f:
try:
b = os.path.basename(f)
if os.path.isdir(f):
shutil.copytree(f, "/ml-agents/{b}".format(b=b))
else:
src_f = "/{docker_target_name}/{b}".format(
docker_target_name=docker_target_name, b=b
)
dst_f = "/ml-agents/{b}".format(b=b)
shutil.copyfile(src_f, dst_f)
os.chmod(dst_f, 0o775) # Make executable
except Exception as e:
logging.getLogger("mlagents.trainers").info(e)
env_path = "/ml-agents/{env_path}".format(env_path=env_path)
return env_path
docker_target_name: Optional[str],
no_graphics: bool,
seed: int,
start_port: int,

raise UnityEnvironmentException(
f"Couldn't launch the {env_path} environment. Provided filename does not match any environments."
)
docker_training = docker_target_name is not None
if docker_training and env_path is not None:
# Comments for future maintenance:
# Some OS/VM instances (e.g. the COS GCP image) mount filesystems
# with the COS flag, which prevents execution of the Unity scene;
# to get around this, we copy the executable into the
# container.
# Navigate in docker path and find env_path and copy it.
env_path = prepare_for_docker_run(docker_target_name, env_path)
def create_unity_environment(
worker_id: int, side_channels: List[SideChannel]

file_name=env_path,
worker_id=worker_id,
seed=env_seed,
docker_training=docker_training,
no_graphics=no_graphics,
base_port=start_port,
args=env_args,

print(get_version_string())
if options.debug:
log_level = logging.DEBUG
log_level = logging_util.DEBUG
log_level = logging.INFO
log_level = logging_util.INFO
trainer_logger = create_logger("mlagents.trainers", log_level)
logging_util.set_log_level(log_level)
logger.debug("Configuration for this run:")
logger.debug(json.dumps(options._asdict(), indent=4))
trainer_logger.debug("Configuration for this run:")
trainer_logger.debug(json.dumps(options._asdict(), indent=4))
# Options deprecation warnings
if options.load_model:
logger.warning(
"The --load option has been deprecated. Please use the --resume option instead."
)
if options.train_model:
logger.warning(
"The --train option has been deprecated. Train mode is now the default. Use "
"--inference to run in inference mode."
)
run_seed = options.seed
if options.cpu:

4
ml-agents/mlagents/trainers/meta_curriculum.py


from typing import Dict, Set
from mlagents.trainers.curriculum import Curriculum
import logging
from mlagents_envs.logging_util import get_logger
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class MetaCurriculum:

1
ml-agents/mlagents/trainers/policy/nn_policy.py


self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:

34
ml-agents/mlagents/trainers/policy/tf_policy.py


import logging
from typing import Any, Dict, List, Optional
import abc
import numpy as np

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import SplitObservations

logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class UnityPolicyException(UnityException):

logger.info("Loading Model for brain {}".format(self.brain.brain_name))
ckpt = tf.train.get_checkpoint_state(self.model_path)
if ckpt is None:
logger.info(
"The model {0} could not be found. Make "
raise UnityPolicyException(
"The model {0} could not be loaded. Make "
"--run-id".format(self.model_path)
"--run-id. and that the previous run you are resuming from had the same "
"behavior names.".format(self.model_path)
)
self.saver.restore(self.sess, ckpt.model_checkpoint_path)

self.assign_ops.append(tf.assign(var, assign_ph))
def load_weights(self, values):
if len(self.assign_ops) == 0:
logger.warning(
"Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
)
with self.graph.as_default():
feed_dict = {}
for assign_ph, value in zip(self.assign_phs, values):

"""
if batched_step_result.n_agents() == 0:
return ActionInfo.empty()
agents_done = [
agent
for agent, done in zip(
batched_step_result.agent_id, batched_step_result.done
)
if done
]
self.remove_memories(agents_done)
self.remove_previous_action(agents_done)
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))

def create_input_placeholders(self):
with self.graph.as_default():
self.global_step, self.increment_step_op, self.steps_to_increment = (
ModelUtils.create_global_steps()
)
(
self.global_step,
self.increment_step_op,
self.steps_to_increment,
) = ModelUtils.create_global_steps()
self.visual_in = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)

11
ml-agents/mlagents/trainers/ppo/trainer.py


# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters

from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class PPOTrainer(RLTrainer):

return policy
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
:param name_behavior_id: Behavior ID that the policy should belong to.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
if self.policy:

5
ml-agents/mlagents/trainers/sac/optimizer.py


import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer

EPSILON = 1e-6 # Small value to avoid divide by zero
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
POLICY_SCOPE = ""
TARGET_SCOPE = "target_network"

"q1_loss": self.q1_loss,
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
"entropy": self.policy.entropy,
"update_batch": self.update_batch_policy,
"update_value": self.update_batch_value,
"update_entropy": self.update_batch_entropy,

11
ml-agents/mlagents/trainers/sac/trainer.py


# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290
# and implemented in https://github.com/hill-a/stable-baselines
import logging
from collections import defaultdict
from typing import Dict
import os

from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy

from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
BUFFER_TRUNCATE_PERCENT = 0.8

"memory_size",
"model_path",
"reward_signals",
"vis_encode_type",
]
self._check_param_keys()

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
:param brain_parameters: specifications for policy construction

6
ml-agents/mlagents/trainers/simple_env_manager.py


self.env.step()
all_step_result = self._generate_all_results()
step_info = EnvironmentStep(all_step_result, 0, self.previous_all_action_info)
step_info = EnvironmentStep(
all_step_result, 0, self.previous_all_action_info, {}
)
self.previous_step = step_info
return [step_info]

self.shared_float_properties.set_property(k, v)
self.env.reset()
all_step_result = self._generate_all_results()
self.previous_step = EnvironmentStep(all_step_result, 0, {})
self.previous_step = EnvironmentStep(all_step_result, 0, {}, {})
return [self.previous_step]
@property

47
ml-agents/mlagents/trainers/stats.py


import csv
import os
import time
import logging
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class StatsSummary(NamedTuple):

class StatsPropertyType(Enum):
HYPERPARAMETERS = "hyperparameters"
SELF_PLAY = "selfplay"
SELF_PLAY_TEAM = "selfplayteam"
class StatsWriter(abc.ABC):

)
if self.self_play and "Self-play/ELO" in values:
elo_stats = values["Self-play/ELO"]
mean_opponent_elo = values["Self-play/Mean Opponent ELO"]
std_opponent_elo = values["Self-play/Std Opponent ELO"]
logger.info(
"{} Team {}: ELO: {:0.3f}. "
"Mean Opponent ELO: {:0.3f}. "
"Std Opponent ELO: {:0.3f}. ".format(
category,
self.self_play_team,
elo_stats.mean,
mean_opponent_elo.mean,
std_opponent_elo.mean,
)
)
logger.info("{} ELO: {:0.3f}. ".format(category, elo_stats.mean))
else:
logger.info(
"{}: Step: {}. No episode was completed since last summary. {}".format(

elif property_type == StatsPropertyType.SELF_PLAY:
assert isinstance(value, bool)
self.self_play = value
elif property_type == StatsPropertyType.SELF_PLAY_TEAM:
assert isinstance(value, int)
self.self_play_team = value
def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""

class TensorboardWriter(StatsWriter):
def __init__(self, base_dir: str):
def __init__(self, base_dir: str, clear_past_data: bool = False):
:param clear_past_data: Whether or not to clean up existing Tensorboard files associated with the base_dir and
category.
self._clear_past_data = clear_past_data
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int

basedir=self.base_dir, category=category
)
os.makedirs(filewriter_dir, exist_ok=True)
if self._clear_past_data:
self._delete_all_events_files(filewriter_dir)
def _delete_all_events_files(self, directory_name: str) -> None:
for file_name in os.listdir(directory_name):
if file_name.startswith("events.out"):
logger.warning(
"{} was left over from a previous run. Deleting.".format(file_name)
)
full_fname = os.path.join(directory_name, file_name)
try:
os.remove(full_fname)
except OSError:
logger.warning(
"{} was left over from a previous run and "
"not deleted.".format(full_fname)
)
def add_property(
self, category: str, property_type: StatsPropertyType, value: Any

124
ml-agents/mlagents/trainers/subprocess_env_manager.py


import logging
from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set
from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set, Tuple
import enum
from mlagents_envs.exception import UnityCommunicationException, UnityTimeOutException
from mlagents_envs.exception import (
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
)
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents_envs.timers import (
TimerNode,

EngineConfigurationChannel,
EngineConfig,
)
from mlagents_envs.side_channel.stats_side_channel import (
StatsSideChannel,
StatsAggregationMethod,
)
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class EnvironmentCommand(enum.Enum):
STEP = 1
EXTERNAL_BRAINS = 2
GET_PROPERTIES = 3
RESET = 4
CLOSE = 5
ENV_EXITED = 6
class EnvironmentCommand(NamedTuple):
name: str
class EnvironmentRequest(NamedTuple):
cmd: EnvironmentCommand
name: str
cmd: EnvironmentCommand
worker_id: int
payload: Any

timer_root: Optional[TimerNode]
environment_stats: Dict[str, Tuple[float, StatsAggregationMethod]]
class UnityEnvWorker:

self.previous_all_action_info: Dict[str, ActionInfo] = {}
self.waiting = False
def send(self, name: str, payload: Any = None) -> None:
def send(self, cmd: EnvironmentCommand, payload: Any = None) -> None:
cmd = EnvironmentCommand(name, payload)
self.conn.send(cmd)
req = EnvironmentRequest(cmd, payload)
self.conn.send(req)
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: send failed.")

if response.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = response.payload
raise env_exception
return response
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: recv failed.")

self.conn.send(EnvironmentCommand("close"))
self.conn.send(EnvironmentRequest(EnvironmentCommand.CLOSE))
except (BrokenPipeError, EOFError):
logger.debug(
f"UnityEnvWorker {self.worker_id} got exception trying to close."

shared_float_properties = FloatPropertiesChannel()
engine_configuration_channel = EngineConfigurationChannel()
engine_configuration_channel.set_configuration(engine_configuration)
env: BaseEnv = env_factory(
worker_id, [shared_float_properties, engine_configuration_channel]
)
stats_channel = StatsSideChannel()
env: BaseEnv = None
def _send_response(cmd_name, payload):
def _send_response(cmd_name: EnvironmentCommand, payload: Any) -> None:
parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload))
def _generate_all_results() -> AllStepResult:

return result
try:
env = env_factory(
worker_id,
[shared_float_properties, engine_configuration_channel, stats_channel],
)
cmd: EnvironmentCommand = parent_conn.recv()
if cmd.name == "step":
all_action_info = cmd.payload
req: EnvironmentRequest = parent_conn.recv()
if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)

# Note that we could randomly return timers a fraction of the time if we wanted to reduce
# the data transferred.
# TODO get gauges from the workers and merge them in the main process too.
step_response = StepResponse(all_step_result, get_timer_root())
step_queue.put(EnvironmentResponse("step", worker_id, step_response))
env_stats = stats_channel.get_and_reset_stats()
step_response = StepResponse(
all_step_result, get_timer_root(), env_stats
)
step_queue.put(
EnvironmentResponse(
EnvironmentCommand.STEP, worker_id, step_response
)
)
elif cmd.name == "external_brains":
_send_response("external_brains", external_brains())
elif cmd.name == "get_properties":
elif req.cmd == EnvironmentCommand.EXTERNAL_BRAINS:
_send_response(EnvironmentCommand.EXTERNAL_BRAINS, external_brains())
elif req.cmd == EnvironmentCommand.GET_PROPERTIES:
_send_response("get_properties", reset_params)
elif cmd.name == "reset":
for k, v in cmd.payload.items():
_send_response(EnvironmentCommand.GET_PROPERTIES, reset_params)
elif req.cmd == EnvironmentCommand.RESET:
for k, v in req.payload.items():
_send_response("reset", all_step_result)
elif cmd.name == "close":
_send_response(EnvironmentCommand.RESET, all_step_result)
elif req.cmd == EnvironmentCommand.CLOSE:
except (KeyboardInterrupt, UnityCommunicationException, UnityTimeOutException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
) as ex:
step_queue.put(EnvironmentResponse("env_close", worker_id, None))
step_queue.put(
EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex)
)
_send_response(EnvironmentCommand.ENV_EXITED, ex)
finally:
# If this worker has put an item in the step queue that hasn't been processed by the EnvManager, the process
# will hang until the item is processed. We avoid this behavior by using Queue.cancel_join_thread()

step_queue.cancel_join_thread()
step_queue.close()
env.close()
if env is not None:
env.close()
logger.debug(f"UnityEnvironment worker {worker_id} done.")

if not env_worker.waiting:
env_action_info = self._take_step(env_worker.previous_step)
env_worker.previous_all_action_info = env_action_info
env_worker.send("step", env_action_info)
env_worker.send(EnvironmentCommand.STEP, env_action_info)
env_worker.waiting = True
def _step(self) -> List[EnvironmentStep]:

while len(worker_steps) < 1:
try:
while True:
step = self.step_queue.get_nowait()
if step.name == "env_close":
raise UnityCommunicationException(
"At least one of the environments has closed."
)
step: EnvironmentResponse = self.step_queue.get_nowait()
if step.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = step.payload
raise env_exception
self.env_workers[step.worker_id].waiting = False
if step.worker_id not in step_workers:
worker_steps.append(step)

self.env_workers[step.worker_id].waiting = False
# First enqueue reset commands for all workers so that they reset in parallel
for ew in self.env_workers:
ew.send("reset", config)
ew.send(EnvironmentCommand.RESET, config)
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {})
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {}, {})
self.env_workers[0].send("external_brains")
self.env_workers[0].send(EnvironmentCommand.EXTERNAL_BRAINS)
self.env_workers[0].send("get_properties")
self.env_workers[0].send(EnvironmentCommand.GET_PROPERTIES)
return self.env_workers[0].recv().payload
def close(self) -> None:

payload.all_step_result,
step.worker_id,
env_worker.previous_all_action_info,
payload.environment_stats,
)
step_infos.append(new_step)
env_worker.previous_step = new_step
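# The hunks above replace string command names with the EnvironmentCommand enum and typed
# EnvironmentRequest/EnvironmentResponse tuples. A self-contained sketch of that pattern
# (names mirror the diff, but this is an illustration, not the actual module):
import enum
from typing import Any, NamedTuple

class Command(enum.Enum):
    STEP = 1
    RESET = 4
    CLOSE = 5

class Request(NamedTuple):
    cmd: Command
    payload: Any

class Response(NamedTuple):
    cmd: Command
    worker_id: int
    payload: Any

req = Request(Command.RESET, {"gravity": -9.8})
resp = Response(req.cmd, worker_id=0, payload="reset done")
assert resp.cmd is Command.RESET  # enum identity replaces fragile string comparisons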

80
ml-agents/mlagents/trainers/tests/simple_test_envs.py


VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY

class Simple1DEnvironment(BaseEnv):
class SimpleEnvironment(BaseEnv):
"""
Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).

num_vector=1,
vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
action_size=1,
):
super().__init__()
self.discrete = use_discrete

self.vec_obs_size = vec_obs_size
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.group_spec = AgentGroupSpec(
self._make_obs_spec(), action_type, (2,) if use_discrete else 1
self._make_obs_spec(),
action_type,
tuple(2 for _ in range(action_size)) if use_discrete else action_size,
self.action_size = action_size
self.position: Dict[str, float] = {}
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
self.random = random.Random(str(self.group_spec))
self.goal: Dict[str, int] = {}
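# Illustrative check of the branch-size change above: a discrete agent with action_size=2
# gets two branches of two actions each, while the continuous case passes the integer through.
print(tuple(2 for _ in range(2)))  # (2, 2)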

return self.step_result[name]
def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta
self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
self.step_count[name] += 1
# All dimensions must reach +/-1.0 to be done
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
return done
def _generate_mask(self):
act = self.action[name][0][0]
delta = 1 if act else -1
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
delta = self.action[name][0][0]
delta = clamp(delta, -self.step_size, self.step_size)
self.position[name] += delta
self.position[name] = clamp(self.position[name], -1, 1)
self.step_count[name] += 1
done = self.position[name] >= 1.0 or self.position[name] <= -1.0
return done
action_mask = None
return action_mask
reward = SUCCESS_REWARD * self.position[name] * self.goal[name]
reward = 0.0
for _pos in self.positions[name]:
reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
self.positions[name]
)
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.step_count[name] = 0
self.final_rewards[name].append(self.rewards[name])
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def _make_batched_step(
self, name: str, done: bool, reward: float

self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
def _generate_mask(self):
if self.discrete:
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
else:
action_mask = None
return action_mask
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.position[name] = 0.0
self.step_count[name] = 0
self.final_rewards[name].append(self.rewards[name])
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def reset(self) -> None: # type: ignore
for name in self.names:
self._reset_agent(name)

pass
class Memory1DEnvironment(Simple1DEnvironment):
class MemoryEnvironment(SimpleEnvironment):
def __init__(self, brain_names, use_discrete, step_size=0.2):
super().__init__(brain_names, use_discrete, step_size=step_size)
# Number of steps to reveal the goal for. Lower is harder. Should be

)
class Record1DEnvironment(Simple1DEnvironment):
class RecordEnvironment(SimpleEnvironment):
def __init__(
self,
brain_names,

43
ml-agents/mlagents/trainers/tests/test_agent_processor.py


)
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.stats import StatsReporter, StatsSummary
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
def create_mock_brain():

assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
# check that steps with immediate dones don't add to dicts
processor.add_experiences(mock_done_step, 0, ActionInfo.empty())
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0
assert len(processor.episode_rewards.keys()) == 0
assert len(processor.last_step_result.keys()) == 0
def test_end_episode():

queue_traj = queue.get_nowait()
assert isinstance(queue_traj, Trajectory)
assert queue.empty()
def test_agent_manager_stats():
policy = mock.Mock()
stats_reporter = StatsReporter("FakeCategory")
writer = mock.Mock()
stats_reporter.add_writer(writer)
manager = AgentManager(policy, "MyBehavior", stats_reporter)
all_env_stats = [
{
"averaged": (1.0, StatsAggregationMethod.AVERAGE),
"most_recent": (2.0, StatsAggregationMethod.MOST_RECENT),
},
{
"averaged": (3.0, StatsAggregationMethod.AVERAGE),
"most_recent": (4.0, StatsAggregationMethod.MOST_RECENT),
},
]
for env_stats in all_env_stats:
manager.record_environment_stats(env_stats, worker_id=0)
expected_stats = {
"averaged": StatsSummary(mean=2.0, std=mock.ANY, num=2),
"most_recent": StatsSummary(mean=4.0, std=0.0, num=1),
}
stats_reporter.write_stats(123)
writer.write_stats.assert_any_call("FakeCategory", expected_stats, 123)
# clean up our Mock from the global list
StatsReporter.writers.remove(writer)

10
ml-agents/mlagents/trainers/tests/test_distributions.py


def test_gaussian_distribution():
with tf.Graph().as_default():
logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
distribution = GaussianDistribution(
logits,
act_size=VECTOR_ACTION_SPACE,

assert out.shape[1] == VECTOR_ACTION_SPACE[0]
output = sess.run([distribution.total_log_probs])
assert output[0].shape[0] == 1
# Test entropy is correct
log_std_tensor = tf.get_default_graph().get_tensor_by_name(
"log_std/BiasAdd:0"
)
feed_dict = {log_std_tensor: [[1.0, 1.0]]}
entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
# Entropy with log_std of 1.0 should be 2.42
assert pytest.approx(entropy[0], 0.01) == 2.42
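# Sanity check of the 2.42 value asserted above (a sketch, not part of the test suite):
# the entropy of a univariate Gaussian is 0.5 * ln(2*pi*e) + ln(sigma), so with log_std = 1.0 per dimension:
import math
print(round(0.5 * math.log(2 * math.pi * math.e) + 1.0, 2))  # 2.42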
def test_tanh_distribution():

36
ml-agents/mlagents/trainers/tests/test_ghost.py


import yaml
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
trainer.add_policy(brain_params_team0.brain_name, policy)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
trainer.add_policy(parsed_behavior_id1, policy)
trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.subscribe_trajectory_queue(trajectory_queue1)

vector_action_space_type=0,
)
brain_name = BehaviorIdentifiers.from_name_behavior_id(
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
).brain_name
)
brain_name = parsed_behavior_id0.brain_name
brain_params_team1 = BrainParameters(
brain_name="test_brain?team=1",

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
trainer.add_policy(brain_params_team0.brain_name, policy)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
trainer.add_policy(parsed_behavior_id1, policy)
policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.publish_policy_queue(policy_queue1)

48
ml-agents/mlagents/trainers/tests/test_learn.py


return parse_command_line(args)
@patch("mlagents.trainers.learn.handle_existing_directories")
@patch("mlagents.trainers.learn.TrainerFactory")
@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")

subproc_env_mock,
sampler_manager_mock,
trainer_factory_mock,
handle_dir_mock,
):
mock_env = MagicMock()
mock_env.external_brain_names = []

"ppo",
50000,
None,
False,
True,
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py
@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")
@patch("mlagents.trainers.learn.create_environment_factory")
@patch("mlagents.trainers.learn.load_config")
def test_docker_target_path(
load_config, create_environment_factory, subproc_env_mock, sampler_manager_mock
):
mock_env = MagicMock()
mock_env.external_brain_names = []
mock_env.academy_name = "TestAcademyName"
create_environment_factory.return_value = mock_env
trainer_config_mock = MagicMock()
load_config.return_value = trainer_config_mock
options_with_docker_target = basic_options({"--docker-target-name": "dockertarget"})
mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, options_with_docker_target)
mock_init.assert_called_once()
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo"
assert mock_init.call_args[0][2] == "/dockertarget/summaries"
handle_dir_mock.assert_called_once_with(
"./models/ppo", "./summaries", False, False
)
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py

env_path="/foo/bar",
docker_target_name=None,
no_graphics=True,
seed=None,
start_port=8000,

assert opt.sampler_config is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.load_model is False
assert opt.resume is False
assert opt.inference is False
assert opt.train_model is False
assert opt.docker_target_name is None
assert opt.no_graphics is False
assert opt.debug is False
assert opt.env_args is None

"--sampler=./mysample",
"--keep-checkpoints=42",
"--lesson=3",
"--load",
"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",

"--docker-target-name=mydockertarget",
"--no-graphics",
"--debug",
]

assert opt.sampler_config == {}
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.load_model is True
assert opt.train_model is True
assert opt.docker_target_name == "mydockertarget"
assert opt.inference is True
assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data="{}")

4
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


import json
import yaml
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str

@pytest.mark.parametrize("curriculum_brain_name", [BRAIN_NAME, "WrongBrainName"])
def test_simple_metacurriculum(curriculum_brain_name):
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=False)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=False)
curriculum_config = json.loads(dummy_curriculum_json_str)
mc = MetaCurriculum({curriculum_brain_name: curriculum_config})
trainer_config = yaml.safe_load(TRAINER_CONFIG)

129
ml-agents/mlagents/trainers/tests/test_simple_rl.py


from typing import Dict, Any
from mlagents.trainers.tests.simple_test_envs import (
Simple1DEnvironment,
Memory1DEnvironment,
Record1DEnvironment,
SimpleEnvironment,
MemoryEnvironment,
RecordEnvironment,
)
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory

lambd: 0.95
learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 2000
max_steps: 3000
memory_size: 16
normalize: false
num_epoch: 3

# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()

trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.99,
success_threshold=0.9,
env_manager=None,
):
# Create controller and begin training.

if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.5
)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)

def test_visual_ppo(num_visual, use_discrete):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
"max_steps": 3000,
"max_steps": 5000,
"learning_rate": 1e-3,
_check_environment_trains(env, config)
_check_environment_trains(env, config, success_threshold=0.9)
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
override_vals = {"buffer_init_steps": 2000, "max_steps": 4000}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {"batch_size": 32, "use_recurrent": True, "max_steps": 2000}
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {
"batch_size": 64,
"use_recurrent": True,
"max_steps": 3000,
"learning_rate": 1e-3,
"buffer_init_steps": 500,
}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)

env = Simple1DEnvironment(
env = SimpleEnvironment(
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the ghosted policy is never swapped with a competent policy.

"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},

processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that is not learning, once both have reached
# max steps, should be executing the initial, untrained policy.
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 0.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.99
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
env = Record1DEnvironment(
env = RecordEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,

@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
override_vals = {
"max_steps": 500,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},

@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = Simple1DEnvironment(
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,

30
ml-agents/mlagents/trainers/tests/test_stats.py


import tempfile
import unittest
import csv
import time
from mlagents.trainers.stats import (
StatsReporter,

# Test write_stats
category = "category1"
with tempfile.TemporaryDirectory(prefix="unittest-") as base_dir:
tb_writer = TensorboardWriter(base_dir)
tb_writer = TensorboardWriter(base_dir, clear_past_data=False)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)

assert mock_filewriter.return_value.add_summary.call_count > 1
def test_tensorboard_writer_clear(tmp_path):
tb_writer = TensorboardWriter(tmp_path, clear_past_data=False)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
# TB has some sort of timeout before making a new file
time.sleep(1.0)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) > 0
# See if creating a new one doesn't delete it
tb_writer = TensorboardWriter(tmp_path, clear_past_data=False)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) > 1
time.sleep(1.0)
# See if creating a new one deletes old ones
tb_writer = TensorboardWriter(tmp_path, clear_past_data=True)
tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
assert len(os.listdir(os.path.join(tmp_path, "category1"))) == 1
def test_csv_writer():
# Test write_stats
category = "category1"

category = "category1"
console_writer = ConsoleWriter()
console_writer.add_property(category, StatsPropertyType.SELF_PLAY, True)
console_writer.add_property(category, StatsPropertyType.SELF_PLAY_TEAM, 1)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
console_writer.write_stats(
category,

"Self-play/ELO": statssummary1,
"Self-play/Mean Opponent ELO": statssummary1,
"Self-play/Std Opponent ELO": statssummary1,
},
10,
)

)
self.assertIn(
"category1 Team 1: ELO: 1.000. Mean Opponent ELO: 1.000. Std Opponent ELO: 1.000.",
cm.output[1],
)

55
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


SubprocessEnvManager,
EnvironmentResponse,
StepResponse,
EnvironmentCommand,
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,

def create_worker_mock(worker_id, step_queue, env_factor, engine_c):
return MockEnvWorker(worker_id, EnvironmentResponse("reset", worker_id, worker_id))
return MockEnvWorker(
worker_id, EnvironmentResponse(EnvironmentCommand.RESET, worker_id, worker_id)
)
class SubprocessEnvManagerTest(unittest.TestCase):

)
params = {"test": "params"}
manager._reset_env(params)
manager.env_workers[0].send.assert_called_with("reset", (params))
manager.env_workers[0].send.assert_called_with(
EnvironmentCommand.RESET, (params)
)
@mock.patch(
"mlagents.trainers.subprocess_env_manager.SubprocessEnvManager.create_worker"

params = {"test": "params"}
res = manager._reset_env(params)
for i, env in enumerate(manager.env_workers):
env.send.assert_called_with("reset", (params))
env.send.assert_called_with(EnvironmentCommand.RESET, (params))
env.recv.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

)
manager.step_queue = Mock()
manager.step_queue.get_nowait.side_effect = [
EnvironmentResponse("step", 0, StepResponse(0, None)),
EnvironmentResponse("step", 1, StepResponse(1, None)),
EnvironmentResponse(EnvironmentCommand.STEP, 0, StepResponse(0, None, {})),
EnvironmentResponse(EnvironmentCommand.STEP, 1, StepResponse(1, None, {})),
EmptyQueue(),
]
step_mock = Mock()

res = manager._step()
for i, env in enumerate(manager.env_workers):
if i < 2:
env.send.assert_called_with("step", step_mock)
env.send.assert_called_with(EnvironmentCommand.STEP, step_mock)
manager.step_queue.get_nowait.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

env_manager.set_agent_manager(brain_name, agent_manager_mock)
step_info_dict = {brain_name: Mock()}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict)
env_stats = {
"averaged": (1.0, StatsAggregationMethod.AVERAGE),
"most_recent": (2.0, StatsAggregationMethod.MOST_RECENT),
}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict, env_stats)
step_mock.return_value = [step_info]
env_manager.advance()

assert agent_manager_mock.policy == mock_policy
def simple_env_factory(worker_id, config):
env = Simple1DEnvironment(["1D"], use_discrete=True)
return env
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
return env
env_manager = SubprocessEnvManager(
simple_env_factory, EngineConfig.default_config(), num_envs
)

val > 0.7 for val in StatsReporter.writers[0].get_last_rewards().values()
)
env_manager.close()
@pytest.mark.parametrize("num_envs", [1, 4])
def test_subprocess_env_raises_errors(num_envs):
def failing_env_factory(worker_id, config):
import time
# Sleep momentarily to allow time for the EnvManager to be waiting for the
# subprocess response. We won't be able to capture failures from the subprocess
# that cause it to close the pipe before we can send the first message.
time.sleep(0.1)
raise UnityEnvironmentException()
env_manager = SubprocessEnvManager(
failing_env_factory, EngineConfig.default_config(), num_envs
)
with pytest.raises(UnityEnvironmentException):
env_manager.reset()
env_manager.close()

22
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import pytest
import yaml
import io
import os
from unittest.mock import patch
from mlagents.trainers import trainer_util

with pytest.raises(TrainerConfigError):
fp = io.StringIO(file_contents)
_load_config(fp)
def test_existing_directories(tmp_path):
model_path = os.path.join(tmp_path, "runid")
# Unused summary path
summary_path = os.path.join(tmp_path, "runid")
# Test fresh new unused path - should do nothing.
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
# Test resume with fresh path - should throw an exception.
with pytest.raises(UnityTrainerException):
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
# make a directory
os.mkdir(model_path)
# Test try to train w/o force - should complain
with pytest.raises(UnityTrainerException):
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
# Test try to train w/ resume - should work
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
# Test try to train w/ force - should work
trainer_util.handle_existing_directories(model_path, summary_path, False, True)

10
ml-agents/mlagents/trainers/trainer/trainer.py


# # Unity ML-Agents Toolkit
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.model_serialization import export_policy_model, SerializationSettings
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter

from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class Trainer(abc.ABC):

pass
@abc.abstractmethod
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
"""

25
ml-agents/mlagents/trainers/trainer_controller.py


import os
import sys
import logging
import threading
from typing import Dict, Optional, Set, List
from collections import defaultdict

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager
from mlagents_envs.exception import (
UnityEnvironmentException,

self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir
self.logger = logging.getLogger("mlagents.trainers")
self.logger = get_logger(__name__)
self.run_id = run_id
self.save_freq = save_freq
self.train_model = train

self, env_manager: EnvManager, name_behavior_id: str
) -> None:
brain_name = BehaviorIdentifiers.from_name_behavior_id(
name_behavior_id
).brain_name
parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
brain_name = parsed_behavior_id.brain_name
try:
trainer = self.trainers[brain_name]
except KeyError:

policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(name_behavior_id, policy)
trainer.add_policy(parsed_behavior_id, policy)
agent_manager = AgentManager(
policy,

# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model()
except (KeyboardInterrupt, UnityCommunicationException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityEnvironmentException,
) as ex:
pass
if isinstance(ex, KeyboardInterrupt):
pass
else:
# If the environment failed, we want to make sure to raise
# the exception so we exit the process with an return code of 1.
raise ex
if self.train_model:
self._export_graph()

41
ml-agents/mlagents/trainers/trainer_util.py


import os
import yaml
from typing import Any, Dict, TextIO
import logging
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.trainer import Trainer

from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
logger = logging.getLogger("mlagents.trainers")
logger = get_logger(__name__)
class TrainerFactory:

self.seed = seed
self.meta_curriculum = meta_curriculum
self.multi_gpu = multi_gpu
self.ghost_controller = GhostController()
def generate(self, brain_name: str) -> Trainer:
return initialize_trainer(

self.keep_checkpoints,
self.train_model,
self.load_model,
self.ghost_controller,
self.seed,
self.meta_curriculum,
self.multi_gpu,

keep_checkpoints: int,
train_model: bool,
load_model: bool,
ghost_controller: GhostController,
seed: int,
meta_curriculum: MetaCurriculum = None,
multi_gpu: bool = False,

:param keep_checkpoints: How many model checkpoints to keep
:param train_model: Whether to train the model (vs. run inference)
:param load_model: Whether to load the model or randomly initialize
:param ghost_controller: The object that coordinates ghost trainers
:param seed: The random seed to use
:param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
:return:

trainer = GhostTrainer(
trainer,
brain_name,
ghost_controller,
min_lesson_length,
trainer_parameters,
train_model,

"Error parsing yaml file. Please check for formatting errors. "
"A tool such as http://www.yamllint.com/ can be helpful with this."
) from e
def handle_existing_directories(
model_path: str, summary_path: str, resume: bool, force: bool
) -> None:
"""
Validates that if the run-id's model directory exists, we do not overwrite it unless --force is specified.
Throws an exception if the run-id exists but neither --resume nor --force was passed, and
if --resume is specified but the run-id was not found.
:param model_path: The model path specified.
:param summary_path: The summary path to be used.
:param resume: Whether or not the --resume flag was passed.
:param force: Whether or not the --force flag was passed.
"""
model_path_exists = os.path.isdir(model_path)
if model_path_exists:
if not resume and not force:
raise UnityTrainerException(
"Previous data from this run-id was found. "
"Either specify a new run-id, use --resume to resume this run, "
"or use the --force parameter to overwrite existing data."
)
else:
if resume:
raise UnityTrainerException(
"Previous data from this run-id was not found. "
"Train a new run by removing the --resume flag."
)
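# A hedged usage sketch of the validation above (paths are hypothetical):
handle_existing_directories("./models/run1", "./summaries", resume=False, force=False)  # fresh run-id: passes
# Once ./models/run1 exists, re-running without resume or force raises UnityTrainerException,
# and passing resume=True for a run-id that was never created also raises.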

2
ml-agents/setup.py


# Test-only dependencies should go in test_requirements.txt, not here.
"grpcio>=1.11.0",
"h5py>=2.9.0",
"jupyter",
"matplotlib",
"mlagents_envs=={}".format(VERSION),
"numpy>=1.13.3,<2.0",
"Pillow>=4.2.1",

67
ml-agents/tests/yamato/training_int_tests.py


import argparse
import time
from .yamato_utils import (
get_base_path,

checkout_csharp_version,
undo_git_checkout,
def main():
nn_file_expected = "./models/ppo/3DBall.nn"
def run_training(python_version, csharp_version):
latest = "latest"
run_id = int(time.time() * 1000.0)
print(
f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
nn_file_expected = f"./models/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
# Should never happen - make sure nothing is left over from an old test.
print("Artifacts from previous build found!")

print(f"Running in base path {base_path}")
build_returncode = run_standalone_build(base_path)
if build_returncode != 0:
print("Standalone build FAILED!")
sys.exit(build_returncode)
# Only build the standalone player if we're overriding the C# version
# Otherwise we'll use the one built earlier in the pipeline.
if csharp_version is not None:
# We can't rely on the old C# code recognizing the commandline argument to set the output
# So rename testPlayer (containing the most recent build) to something else temporarily
full_player_path = os.path.join("Project", "testPlayer.app")
temp_player_path = os.path.join("Project", "temp_testPlayer.app")
final_player_path = os.path.join("Project", f"testPlayer_{csharp_version}.app")
os.rename(full_player_path, temp_player_path)
checkout_csharp_version(csharp_version)
build_returncode = run_standalone_build(base_path)
if build_returncode != 0:
print("Standalone build FAILED!")
sys.exit(build_returncode)
# Now rename the newly-built executable, and restore the old one
os.rename(full_player_path, final_player_path)
os.rename(temp_player_path, full_player_path)
standalone_player_path = f"testPlayer_{csharp_version}"
else:
standalone_player_path = "testPlayer"
init_venv()
venv_path = init_venv(python_version)
# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.

buffer_size=10,
)
# TODO pass scene name and exe destination to build
# TODO make sure we fail if the exe isn't found - see MLA-559
mla_learn_cmd = "mlagents-learn override.yaml --train --env=Project/testPlayer --no-graphics --env-args -logFile -" # noqa
res = subprocess.run(f"source venv/bin/activate; {mla_learn_cmd}", shell=True)
mla_learn_cmd = (
f"mlagents-learn override.yaml --train --env=Project/{standalone_player_path} "
f"--run-id={run_id} --no-graphics --env-args -logFile -"
) # noqa
res = subprocess.run(
f"source {venv_path}/bin/activate; {mla_learn_cmd}", shell=True
)
if res.returncode != 0 or not os.path.exists(nn_file_expected):
print("mlagents-learn run FAILED!")

sys.exit(0)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--python", default=None)
parser.add_argument("--csharp", default=None)
args = parser.parse_args()
try:
run_training(args.python, args.csharp)
finally:
# Cleanup - this gets executed even if we hit sys.exit()
undo_git_checkout()
if __name__ == "__main__":

69
ml-agents/tests/yamato/yamato_utils.py


import os
import subprocess
import yaml
from typing import List, Optional
def get_unity_executable_path():

return os.getcwd()
def run_standalone_build(base_path: str, verbose: bool = False) -> int:
def run_standalone_build(
base_path: str, verbose: bool = False, output_path: str = None
) -> int:
Run BuildStandalonePlayerOSX test to produce a player at Project/testPlayer
:param base_path:
:return:
Run BuildStandalonePlayerOSX test to produce a player. The location defaults to Project/testPlayer.
"""
unity_exe = get_unity_executable_path()
print(f"Running BuildStandalonePlayerOSX via {unity_exe}")

]
if verbose:
test_args += ["-logfile", "-"]
if output_path is not None:
test_args += ["--mlagents-build-output-path", output_path]
print(f"{' '.join(test_args)} ...")
timeout = 30 * 60 # 30 minutes, just in case

def init_venv():
def init_venv(
mlagents_python_version: str = None, extra_packages: Optional[List[str]] = None
) -> str:
"""
Set up the virtual environment, and return the venv path.
:param mlagents_python_version: The version of the mlagents Python package to install.
If None, a local (editable) install is done; otherwise the package is installed from PyPI.
:return:
"""
# Use a different venv path for different versions
venv_path = "venv"
if mlagents_python_version:
venv_path += "_" + mlagents_python_version
subprocess.check_call("python -m venv venv", shell=True)
subprocess.check_call(f"python -m venv {venv_path}", shell=True)
"-e ./ml-agents-envs",
"-e ./ml-agents",
if mlagents_python_version:
# install from pypi
pip_commands += [
f"mlagents=={mlagents_python_version}",
f"gym-unity=={mlagents_python_version}",
]
else:
# Local install
pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents", "-e ./gym-unity"]
if extra_packages:
pip_commands += extra_packages
f"source venv/bin/activate; python -m pip install -q {cmd}", shell=True
f"source {venv_path}/bin/activate; python -m pip install -q {cmd}",
shell=True,
return venv_path
def checkout_csharp_version(csharp_version):
"""
Checks out the specific git revision (usually a tag) for the C# package and Project.
If csharp_version is None, no changes are made.
:param csharp_version:
:return:
"""
if csharp_version is None:
return
csharp_dirs = ["com.unity.ml-agents", "Project"]
for csharp_dir in csharp_dirs:
subprocess.check_call(
f"git checkout {csharp_version} -- {csharp_dir}", shell=True
)
def undo_git_checkout():
"""
Clean up the git working directory.
"""
subprocess.check_call("git reset HEAD .", shell=True)
subprocess.check_call("git checkout -- .", shell=True)
def override_config_file(src_path, dest_path, **kwargs):
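
init_venv now derives the venv directory from the requested version ("venv" for a local editable install, "venv_<version>" for a PyPI install) and returns that path so callers can activate the matching environment. A small usage sketch, assuming the helpers above are importable as yamato_utils and that a 0.15.0 release exists on PyPI:

import subprocess
from yamato_utils import init_venv  # assumed import path, for illustration only

venv_path = init_venv("0.15.0")  # creates "venv_0.15.0" and installs mlagents==0.15.0
subprocess.check_call(
    f"source {venv_path}/bin/activate; mlagents-learn --help", shell=True
)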

1
setup.cfg


I200,
banned-modules = tensorflow = use mlagents.tf_utils instead (it handles tf2 compat).
logging = use mlagents_envs.logging_util instead
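
The banned-modules entries make flake8 reject direct imports of tensorflow and logging in favour of the project wrappers named in the messages. A sketch of the imports those messages point to; the exact symbols (tf, get_logger) are assumptions based on the wrapper names, not verified here:

from mlagents.tf_utils import tf            # instead of "import tensorflow"
from mlagents_envs import logging_util      # instead of "import logging"

logger = logging_util.get_logger(__name__)  # assumed helper on the wrapper module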

32
.yamato/gym-interface-test.yml


test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_gym_interface_{{ editor.version }}:
name: Test Mac Gym Interface {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.setup_venv
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_gym.py
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/gym-interface-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}

32
.yamato/python-ll-api-test.yml


test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_mac_ll_api_{{ editor.version }}:
name: Test Mac LL-API {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.setup_venv
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/python-ll-api-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}
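
Both jobs follow the same recipe: depend on the macOS standalone build, install pyyaml, create the venv via the setup_venv module, and then run a smoke-test script against the built player. A rough local equivalent, using the script paths from the job definitions and assuming the venv already exists at ./venv:

import subprocess

for script in (
    "ml-agents/tests/yamato/scripts/run_gym.py",
    "ml-agents/tests/yamato/scripts/run_llapi.py",
):
    subprocess.check_call(f"./venv/bin/python {script}", shell=True)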

234
com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs


using System;
using System.Collections.Generic;
using UnityEngine;
using System.IO;
namespace MLAgents.SideChannels
{
public static class SideChannelUtils
{
private static Dictionary<Guid, SideChannel> RegisteredChannels = new Dictionary<Guid, SideChannel>();
private struct CachedSideChannelMessage
{
public Guid ChannelId;
public byte[] Message;
}
private static Queue<CachedSideChannelMessage> m_CachedMessages = new Queue<CachedSideChannelMessage>();
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
public static void RegisterSideChannel(SideChannel sideChannel)
{
var channelId = sideChannel.ChannelId;
if (RegisteredChannels.ContainsKey(channelId))
{
throw new UnityAgentsException(string.Format(
"A side channel with type index {0} is already registered. You cannot register multiple " +
"side channels of the same id.", channelId));
}
// Process any messages that we've already received for this channel ID.
var numMessages = m_CachedMessages.Count;
for (int i = 0; i < numMessages; i++)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (channelId == cachedMessage.ChannelId)
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannel.OnMessageReceived(incomingMsg);
}
}
else
{
m_CachedMessages.Enqueue(cachedMessage);
}
}
RegisteredChannels.Add(channelId, sideChannel);
}
/// <summary>
/// Unregisters a side channel from the communicator.
/// </summary>
/// <param name="sideChannel"> The side channel to be unregistered.</param>
public static void UnregisterSideChannel(SideChannel sideChannel)
{
if (RegisteredChannels.ContainsKey(sideChannel.ChannelId))
{
RegisteredChannels.Remove(sideChannel.ChannelId);
}
}
/// <summary>
/// Unregisters all the side channels from the communicator.
/// </summary>
public static void UnregisterAllSideChannels()
{
RegisteredChannels = new Dictionary<Guid, SideChannel>();
}
/// <summary>
/// Returns the SideChannel of Type T if one is registered, or null if none is registered.
/// If there are multiple SideChannels of the same type registered, the returned instance is arbitrary.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <returns></returns>
public static T GetSideChannel<T>() where T: SideChannel
{
foreach (var sc in RegisteredChannels.Values)
{
if (sc.GetType() == typeof(T))
{
return (T) sc;
}
}
return null;
}
/// <summary>
/// Returns all SideChannels of Type T that are registered. Use <see cref="GetSideChannel{T}()"/> if possible,
/// as that does not make any memory allocations.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <returns></returns>
public static List<T> GetSideChannels<T>() where T: SideChannel
{
var output = new List<T>();
foreach (var sc in RegisteredChannels.Values)
{
if (sc.GetType() == typeof(T))
{
output.Add((T) sc);
}
}
return output;
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a single byte array.
/// </summary>
/// <returns></returns>
internal static byte[] GetSideChannelMessage()
{
return GetSideChannelMessage(RegisteredChannels);
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a single byte array.
/// </summary>
/// <param name="sideChannels"> A dictionary of channel type to channel.</param>
/// <returns></returns>
internal static byte[] GetSideChannelMessage(Dictionary<Guid, SideChannel> sideChannels)
{
using (var memStream = new MemoryStream())
{
using (var binaryWriter = new BinaryWriter(memStream))
{
foreach (var sideChannel in sideChannels.Values)
{
var messageList = sideChannel.MessageQueue;
foreach (var message in messageList)
{
binaryWriter.Write(sideChannel.ChannelId.ToByteArray());
binaryWriter.Write(message.Length);
binaryWriter.Write(message);
}
sideChannel.MessageQueue.Clear();
}
return memStream.ToArray();
}
}
}
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="dataReceived">The byte array of data received from Python.</param>
internal static void ProcessSideChannelData(byte[] dataReceived)
{
ProcessSideChannelData(RegisteredChannels, dataReceived);
}
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="sideChannels">A dictionary of channel type to channel.</param>
/// <param name="dataReceived">The byte array of data received from Python.</param>
internal static void ProcessSideChannelData(Dictionary<Guid, SideChannel> sideChannels, byte[] dataReceived)
{
while (m_CachedMessages.Count != 0)
{
var cachedMessage = m_CachedMessages.Dequeue();
if (sideChannels.ContainsKey(cachedMessage.ChannelId))
{
using (var incomingMsg = new IncomingMessage(cachedMessage.Message))
{
sideChannels[cachedMessage.ChannelId].OnMessageReceived(incomingMsg);
}
}
else
{
Debug.Log(string.Format(
"Unknown side channel data received. Channel Id is: {0}",
cachedMessage.ChannelId));
}
}
if (dataReceived.Length == 0)
{
return;
}
using (var memStream = new MemoryStream(dataReceived))
{
using (var binaryReader = new BinaryReader(memStream))
{
while (memStream.Position < memStream.Length)
{
Guid channelId = Guid.Empty;
byte[] message = null;
try
{
channelId = new Guid(binaryReader.ReadBytes(16));
var messageLength = binaryReader.ReadInt32();
message = binaryReader.ReadBytes(messageLength);
}
catch (Exception ex)
{
throw new UnityAgentsException(
"There was a problem reading a message in a SideChannel. Please make sure the " +
"version of MLAgents in Unity is compatible with the Python version. Original error : "
+ ex.Message);
}
if (sideChannels.ContainsKey(channelId))
{
using (var incomingMsg = new IncomingMessage(message))
{
sideChannels[channelId].OnMessageReceived(incomingMsg);
}
}
else
{
// Don't recognize this ID, but cache it in case the SideChannel that can handle
// it is registered before the next call to ProcessSideChannelData.
m_CachedMessages.Enqueue(new CachedSideChannelMessage
{
ChannelId = channelId,
Message = message
});
}
}
}
}
}
}
}
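
GetSideChannelMessage and ProcessSideChannelData define the wire format shared with Python: messages are concatenated, each consisting of the 16-byte channel GUID (in .NET Guid.ToByteArray order), a little-endian Int32 payload length, and the payload bytes. A stdlib-only Python sketch of that framing, for illustration only; it is not the mlagents_envs implementation:

import io
import struct
import uuid

def pack_messages(messages):
    # messages: iterable of (uuid.UUID, bytes) pairs
    buf = io.BytesIO()
    for channel_id, payload in messages:
        buf.write(channel_id.bytes_le)              # matches .NET Guid.ToByteArray()
        buf.write(struct.pack("<i", len(payload)))  # little-endian Int32, like BinaryWriter
        buf.write(payload)
    return buf.getvalue()

def unpack_messages(data):
    reader = io.BytesIO(data)
    while True:
        raw_id = reader.read(16)
        if len(raw_id) < 16:
            break
        (length,) = struct.unpack("<i", reader.read(4))
        yield uuid.UUID(bytes_le=raw_id), reader.read(length)

# round-trip check
cid = uuid.uuid4()
blob = pack_messages([(cid, b"hello")])
assert list(unpack_messages(blob)) == [(cid, b"hello")]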

11
com.unity.ml-agents/Runtime/SideChannels/SideChannelUtils.cs.meta


fileFormatVersion: 2
guid: 2506dff31271f49298fbff21e13fa8b6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

Some files were not shown because too many files were changed in this diff.
