
Merge pull request #3038 from Unity-Technologies/develop

Merge develop to master
/develop
GitHub, 5 years ago
Current commit
35c995e9
130 files changed, with 1384 insertions and 1423 deletions
  1. .circleci/config.yml (4)
  2. CONTRIBUTING.md (11)
  3. README.md (3)
  4. UnitySDK/Assets/ML-Agents/Editor/Tests/DemonstrationTests.cs (1)
  5. UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs (1)
  6. UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs (8)
  7. UnitySDK/Assets/ML-Agents/Editor/Tests/TimerTest.cs (9)
  8. UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs (7)
  9. UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (8)
  10. UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs (8)
  11. UnitySDK/Assets/ML-Agents/Examples/Basic/Scripts/BasicAgent.cs (2)
  12. UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs (6)
  13. UnitySDK/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs (5)
  14. UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAcademy.cs (12)
  15. UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (4)
  16. UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs (14)
  17. UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs (13)
  18. UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs (3)
  19. UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (9)
  20. UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs (4)
  21. UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs (2)
  22. UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs (5)
  23. UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs (8)
  24. UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs (6)
  25. UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (14)
  26. UnitySDK/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs (11)
  27. UnitySDK/Assets/ML-Agents/Scripts/Academy.cs (197)
  28. UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlInitializationOutput.cs (52)
  29. UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlInput.cs (119)
  30. UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlOutput.cs (44)
  31. UnitySDK/Assets/ML-Agents/Scripts/Grpc/GrpcExtensions.cs (23)
  32. UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs (126)
  33. UnitySDK/Assets/ML-Agents/Scripts/ICommunicator.cs (34)
  34. UnitySDK/Assets/ML-Agents/Scripts/Timer.cs (70)
  35. UnitySDK/UnitySDK.sln.DotSettings (1)
  36. docs/Basic-Guide.md (38)
  37. docs/Getting-Started-with-Balance-Ball.md (23)
  38. docs/Installation-Windows.md (6)
  39. docs/Installation.md (4)
  40. docs/Learning-Environment-Design-Academy.md (12)
  41. docs/Learning-Environment-Design.md (6)
  42. docs/Learning-Environment-Examples.md (28)
  43. docs/Learning-Environment-Executable.md (18)
  44. docs/Migrating.md (13)
  45. docs/Python-API.md (81)
  46. docs/Training-Curriculum-Learning.md (5)
  47. docs/Training-Generalized-Reinforcement-Learning-Agents.md (4)
  48. docs/Training-ML-Agents.md (17)
  49. docs/Training-on-Amazon-Web-Service.md (2)
  50. docs/Using-Virtual-Environment.md (24)
  51. gym-unity/gym_unity/tests/test_gym.py (4)
  52. ml-agents-envs/mlagents/envs/base_unity_environment.py (14)
  53. ml-agents-envs/mlagents/envs/brain.py (21)
  54. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.py (17)
  55. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.pyi (14)
  56. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.py (37)
  57. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.pyi (18)
  58. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.py (19)
  59. ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.pyi (6)
  60. ml-agents-envs/mlagents/envs/env_manager.py (11)
  61. ml-agents-envs/mlagents/envs/environment.py (124)
  62. ml-agents-envs/mlagents/envs/simple_env_manager.py (29)
  63. ml-agents-envs/mlagents/envs/subprocess_env_manager.py (71)
  64. ml-agents-envs/mlagents/envs/tests/test_brain.py (6)
  65. ml-agents-envs/mlagents/envs/tests/test_subprocess_env_manager.py (35)
  66. ml-agents/mlagents/trainers/bc/policy.py (7)
  67. ml-agents/mlagents/trainers/bc/trainer.py (17)
  68. ml-agents/mlagents/trainers/buffer.py (470)
  69. ml-agents/mlagents/trainers/components/bc/module.py (15)
  70. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (7)
  71. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2)
  72. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (5)
  73. ml-agents/mlagents/trainers/curriculum.py (9)
  74. ml-agents/mlagents/trainers/demo_loader.py (100)
  75. ml-agents/mlagents/trainers/learn.py (70)
  76. ml-agents/mlagents/trainers/meta_curriculum.py (8)
  77. ml-agents/mlagents/trainers/ppo/policy.py (7)
  78. ml-agents/mlagents/trainers/ppo/trainer.py (61)
  79. ml-agents/mlagents/trainers/rl_trainer.py (44)
  80. ml-agents/mlagents/trainers/sac/trainer.py (47)
  81. ml-agents/mlagents/trainers/tests/__init__.py (49)
  82. ml-agents/mlagents/trainers/tests/mock_brain.py (48)
  83. ml-agents/mlagents/trainers/tests/test_bc.py (12)
  84. ml-agents/mlagents/trainers/tests/test_buffer.py (71)
  85. ml-agents/mlagents/trainers/tests/test_curriculum.py (8)
  86. ml-agents/mlagents/trainers/tests/test_demo_loader.py (4)
  87. ml-agents/mlagents/trainers/tests/test_learn.py (4)
  88. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (9)
  89. ml-agents/mlagents/trainers/tests/test_policy.py (6)
  90. ml-agents/mlagents/trainers/tests/test_ppo.py (51)
  91. ml-agents/mlagents/trainers/tests/test_reward_signals.py (4)
  92. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (22)
  93. ml-agents/mlagents/trainers/tests/test_sac.py (63)
  94. ml-agents/mlagents/trainers/tests/test_simple_rl.py (4)
  95. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2)
  96. ml-agents/mlagents/trainers/tf_policy.py (16)
  97. ml-agents/mlagents/trainers/trainer_controller.py (4)
  98. protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_initialization_output.proto (3)
  99. protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_input.proto (6)
  100. protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_output.proto (1)

4
.circleci/config.yml


executor: << parameters.executor >>
working_directory: ~/repo
# Run additional numpy checks on unit tests
environment:
TEST_ENFORCE_NUMPY_FLOAT32: 1
steps:
- checkout

11
CONTRIBUTING.md


issues with the `contributions welcome` label.
## Git Branches
The master branch corresponds to the most recent version of the project.
Note that this may be newer than the [latest release](https://github.com/Unity-Technologies/ml-agents/releases/tag/latest_release).
Starting with v0.3, we adopted the
[Gitflow Workflow](http://nvie.com/posts/a-successful-git-branching-model/).
Consequently, the `master` branch corresponds to the latest release of
the project, while the `develop` branch corresponds to the most recent stable
version of the project.
Thus, when adding to the project, **please branch off `develop`**
and make sure that your Pull Request (PR) contains the following:
When contributing to the project, please make sure that your Pull Request (PR) contains the following:
* Detailed description of the changes performed
* Corresponding changes to documentation, unit tests and sample environments (if

3
README.md


[![docs badge](https://img.shields.io/badge/docs-reference-blue.svg)](docs/Readme.md)
[![license badge](https://img.shields.io/badge/license-Apache--2.0-green.svg)](LICENSE)
([latest release](https://github.com/Unity-Technologies/ml-agents/releases/tag/latest_release))
([all releases](https://github.com/Unity-Technologies/ml-agents/releases))
**The Unity Machine Learning Agents Toolkit** (ML-Agents) is an open-source
Unity plugin that enables games and simulations to serve as environments for
training intelligent agents. Agents can be trained using reinforcement learning,

1
UnitySDK/Assets/ML-Agents/Editor/Tests/DemonstrationTests.cs


var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var academyInitializeMethod = typeof(Academy).GetMethod("InitializeEnvironment",
BindingFlags.Instance | BindingFlags.NonPublic);

1
UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs


var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var goA = new GameObject("goA");
var bpA = goA.AddComponent<BehaviorParameters>();

8
UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs


var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
Assert.AreEqual(0, aca.initializeAcademyCalls);
Assert.AreEqual(0, aca.GetStepCount());
Assert.AreEqual(0, aca.GetEpisodeCount());

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
Assert.AreEqual(false, agent1.IsDone());
Assert.AreEqual(false, agent2.IsDone());

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var academyInitializeMethod = typeof(Academy).GetMethod("InitializeEnvironment",
BindingFlags.Instance | BindingFlags.NonPublic);
academyInitializeMethod?.Invoke(aca, new object[] { });

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var agentEnableMethod = typeof(Agent).GetMethod(

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var academyInitializeMethod = typeof(Academy).GetMethod(
"InitializeEnvironment", BindingFlags.Instance | BindingFlags.NonPublic);
academyInitializeMethod?.Invoke(aca, new object[] { });

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var agentEnableMethod = typeof(Agent).GetMethod(

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var agentEnableMethod = typeof(Agent).GetMethod(

var acaGo = new GameObject("TestAcademy");
acaGo.AddComponent<TestAcademy>();
var aca = acaGo.GetComponent<TestAcademy>();
aca.resetParameters = new ResetParameters();
var agentEnableMethod = typeof(Agent).GetMethod(

9
UnitySDK/Assets/ML-Agents/Editor/Tests/TimerTest.cs


using NUnit.Framework;
using UnityEditor.Graphs;
using UnityEngine;
namespace MLAgents.Tests

{
TimerStack myTimer = TimerStack.Instance;
myTimer.Reset();
using (myTimer.Scoped("foo"))
{
for (int i = 0; i < 5; i++)

myTimer.SetGauge("my_gauge", (float)i);
}
}
}

Assert.AreEqual(rootChildren["foo"].NumCalls, 1);
var gauge = myTimer.RootNode.Gauges["my_gauge"];
Assert.NotNull(gauge);
Assert.AreEqual(5, gauge.count);
Assert.AreEqual(0, gauge.minValue);
Assert.AreEqual(4, gauge.maxValue);
Assert.AreEqual(4, gauge.value);
var fooChildren = rootChildren["foo"].Children;
Assert.That(fooChildren, Contains.Key("bar"));

7
UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAcademy.cs


public class Ball3DAcademy : Academy
{
public override void AcademyReset()
public override void InitializeAcademy()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
public override void AcademyStep()
{
}
}
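The Ball3DAcademy hunk above is the template this PR applies to every example Academy: the per-reset read of resetParameters in AcademyReset is replaced by a callback registered once in InitializeAcademy, so the value is reapplied whenever Python changes it. A minimal sketch of the new pattern (the class name is hypothetical; only FloatProperties.RegisterCallback and the gravity callback come from the diff):

using UnityEngine;
using MLAgents;

// Illustrative academy showing the callback-based reset-parameter pattern.
public class ExampleAcademy : Academy
{
    public override void InitializeAcademy()
    {
        // Fires whenever the Python side updates the "gravity" float property,
        // replacing the old resetParameters["gravity"] lookup in AcademyReset().
        FloatProperties.RegisterCallback(
            "gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
    }
}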

8
UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


[Header("Specific to Ball3D")]
public GameObject ball;
Rigidbody m_BallRb;
ResetParameters m_ResetParams;
IFloatProperties m_ResetParams;
m_ResetParams = academy.resetParameters;
m_ResetParams = academy.FloatProperties;
SetResetParameters();
}

public void SetBall()
{
//Set the attributes of the ball by fetching the information from the academy
m_BallRb.mass = m_ResetParams["mass"];
var scale = m_ResetParams["scale"];
m_BallRb.mass = m_ResetParams.GetPropertyWithDefault("mass", 1.0f);
var scale = m_ResetParams.GetPropertyWithDefault("scale", 1.0f);
ball.transform.localScale = new Vector3(scale, scale, scale);
}
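On the agent side, the same migration swaps the resetParameters["key"] dictionary lookup for IFloatProperties.GetPropertyWithDefault(key, default), so a key that Python never set falls back to a default instead of failing. A condensed sketch of the pattern used by Ball3DAgent, Ball3DHardAgent, and the other example agents below (field names and defaults mirror the diff; the class itself is illustrative):

using UnityEngine;
using MLAgents;

// Illustrative agent fragment; only IFloatProperties and GetPropertyWithDefault come from the diff.
public class ExampleBallAgent : Agent
{
    public GameObject ball;
    Rigidbody m_BallRb;
    IFloatProperties m_ResetParams;

    public override void InitializeAgent()
    {
        m_BallRb = ball.GetComponent<Rigidbody>();
        // Academy.FloatProperties replaces Academy.resetParameters.
        m_ResetParams = FindObjectOfType<Academy>().FloatProperties;
        SetBall();
    }

    public void SetBall()
    {
        // Missing keys fall back to the supplied defaults instead of throwing.
        m_BallRb.mass = m_ResetParams.GetPropertyWithDefault("mass", 1.0f);
        var scale = m_ResetParams.GetPropertyWithDefault("scale", 1.0f);
        ball.transform.localScale = new Vector3(scale, scale, scale);
    }
}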

8
UnitySDK/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs


[Header("Specific to Ball3DHard")]
public GameObject ball;
Rigidbody m_BallRb;
ResetParameters m_ResetParams;
IFloatProperties m_ResetParams;
m_ResetParams = academy.resetParameters;
m_ResetParams = academy.FloatProperties;
SetResetParameters();
}

public void SetBall()
{
//Set the attributes of the ball by fetching the information from the academy
m_BallRb.mass = m_ResetParams["mass"];
var scale = m_ResetParams["scale"];
m_BallRb.mass = m_ResetParams.GetPropertyWithDefault("mass", 1.0f);
var scale = m_ResetParams.GetPropertyWithDefault("scale", 1.0f);
ball.transform.localScale = new Vector3(scale, scale, scale);
}

2
UnitySDK/Assets/ML-Agents/Examples/Basic/Scripts/BasicAgent.cs


void WaitTimeInference()
{
if (!m_Academy.GetIsInference())
if (!m_Academy.IsCommunicatorOn)
{
RequestDecision();
}

6
UnitySDK/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs


int m_NumberJumps = 20;
int m_JumpLeft = 20;
ResetParameters m_ResetParams;
IFloatProperties m_ResetParams;
public override void InitializeAgent()
{

var academy = FindObjectOfType<Academy>();
m_ResetParams = academy.resetParameters;
m_ResetParams = academy.FloatProperties;
SetResetParameters();
}

public void SetTargetScale()
{
var targetScale = m_ResetParams["target_scale"];
var targetScale = m_ResetParams.GetPropertyWithDefault("target_scale", 1.0f);
target.transform.localScale = new Vector3(targetScale, targetScale, targetScale);
}

5
UnitySDK/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


public void SetLaserLengths()
{
m_LaserLength = m_MyAcademy.resetParameters.TryGetValue("laser_length", out m_LaserLength) ? m_LaserLength : 1.0f;
m_LaserLength = m_MyAcademy.FloatProperties.GetPropertyWithDefault("laser_length", 1.0f);
float agentScale;
agentScale = m_MyAcademy.resetParameters.TryGetValue("agent_scale", out agentScale) ? agentScale : 1.0f;
float agentScale = m_MyAcademy.FloatProperties.GetPropertyWithDefault("agent_scale", 1.0f);
gameObject.transform.localScale = new Vector3(agentScale, agentScale, agentScale);
}

12
UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAcademy.cs


{
public Camera MainCamera;
public override void AcademyReset()
public override void InitializeAcademy()
MainCamera.transform.position = new Vector3(-((int)resetParameters["gridSize"] - 1) / 2f,
(int)resetParameters["gridSize"] * 1.25f,
-((int)resetParameters["gridSize"] - 1) / 2f);
MainCamera.orthographicSize = ((int)resetParameters["gridSize"] + 5f) / 2f;
FloatProperties.RegisterCallback("gridSize", f =>
{
MainCamera.transform.position = new Vector3(-(f - 1) / 2f, f * 1.25f, -(f - 1) / 2f);
MainCamera.orthographicSize = (f + 5f) / 2f;
});
}
}

4
UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs


// Prevents the agent from picking an action that would make it collide with a wall
var positionX = (int)transform.position.x;
var positionZ = (int)transform.position.z;
var maxPosition = (int)m_Academy.resetParameters["gridSize"] - 1;
var maxPosition = (int)m_Academy.FloatProperties.GetPropertyWithDefault("gridSize", 5f) - 1;
if (positionX == 0)
{

renderCamera.Render();
}
if (!m_Academy.GetIsInference())
if (!m_Academy.IsCommunicatorOn)
{
RequestDecision();
}

14
UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs


public GameObject trueAgent;
ResetParameters m_ResetParameters;
IFloatProperties m_ResetParameters;
Camera m_AgentCam;

public void Awake()
{
m_ResetParameters = FindObjectOfType<Academy>().resetParameters;
m_ResetParameters = FindObjectOfType<Academy>().FloatProperties;
m_Objects = new[] { goalPref, pitPref };

public void SetEnvironment()
{
transform.position = m_InitialPosition * (m_ResetParameters["gridSize"] + 1);
transform.position = m_InitialPosition * (m_ResetParameters.GetPropertyWithDefault("gridSize", 5f) + 1);
for (var i = 0; i < (int)m_ResetParameters["numObstacles"]; i++)
for (var i = 0; i < (int)m_ResetParameters.GetPropertyWithDefault("numObstacles", 1); i++)
for (var i = 0; i < (int)m_ResetParameters["numGoals"]; i++)
for (var i = 0; i < (int)m_ResetParameters.GetPropertyWithDefault("numGoals", 1f); i++)
var gridSize = (int)m_ResetParameters["gridSize"];
var gridSize = (int)m_ResetParameters.GetPropertyWithDefault("gridSize", 5f);
m_Plane.transform.localScale = new Vector3(gridSize / 10.0f, 1f, gridSize / 10.0f);
m_Plane.transform.localPosition = new Vector3((gridSize - 1) / 2f, -0.5f, (gridSize - 1) / 2f);
m_Sn.transform.localScale = new Vector3(1, 1, gridSize + 2);

public void AreaReset()
{
var gridSize = (int)m_ResetParameters["gridSize"];
var gridSize = (int)m_ResetParameters.GetPropertyWithDefault("gridSize", 5f);
foreach (var actor in actorObjs)
{
DestroyImmediate(actor);

13
UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs


public void SetGroundMaterialFriction()
{
var resetParams = m_Academy.resetParameters;
var resetParams = m_Academy.FloatProperties;
groundCollider.material.dynamicFriction = resetParams["dynamic_friction"];
groundCollider.material.staticFriction = resetParams["static_friction"];
groundCollider.material.dynamicFriction = resetParams.GetPropertyWithDefault("dynamic_friction", 0);
groundCollider.material.staticFriction = resetParams.GetPropertyWithDefault("static_friction", 0);
var resetParams = m_Academy.resetParameters;
var resetParams = m_Academy.FloatProperties;
var scale = resetParams.GetPropertyWithDefault("block_scale", 2);
m_BlockRb.transform.localScale = new Vector3(resetParams["block_scale"], 0.75f, resetParams["block_scale"]);
m_BlockRb.transform.localScale = new Vector3(scale, 0.75f, scale);
m_BlockRb.drag = resetParams["block_drag"];
m_BlockRb.drag = resetParams.GetPropertyWithDefault("block_drag", 0.5f);
}
public void SetResetParameters()

3
UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAcademy.cs


{
public override void AcademyReset()
{
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public override void AcademyStep()

9
UnitySDK/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs


public void SetResetParameters()
{
m_GoalSize = m_MyAcademy.resetParameters["goal_size"];
m_GoalSpeed = Random.Range(-1f, 1f) * m_MyAcademy.resetParameters["goal_speed"];
m_Deviation = m_MyAcademy.resetParameters["deviation"];
m_DeviationFreq = m_MyAcademy.resetParameters["deviation_freq"];
var fp = m_MyAcademy.FloatProperties;
m_GoalSize = fp.GetPropertyWithDefault("goal_size", 5);
m_GoalSpeed = Random.Range(-1f, 1f) * fp.GetPropertyWithDefault("goal_speed", 1);
m_Deviation = fp.GetPropertyWithDefault("deviation", 0);
m_DeviationFreq = fp.GetPropertyWithDefault("deviation_freq", 0);
}
}

4
UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerAcademy.cs


Physics.gravity *= gravityMultiplier; //for soccer a multiplier of 3 looks good
}
public override void AcademyReset()
public override void InitializeAcademy()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public override void AcademyStep()

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs


ballRb.velocity = Vector3.zero;
ballRb.angularVelocity = Vector3.zero;
var ballScale = m_Academy.resetParameters["ball_scale"];
var ballScale = m_Academy.FloatProperties.GetPropertyWithDefault("ball_scale", 0.015f);
ballRb.transform.localScale = new Vector3(ballScale, ballScale, ballScale);
}
}

5
UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs


public class TennisAcademy : Academy
{
public override void AcademyReset()
public override void InitializeAcademy()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public override void AcademyStep()

8
UnitySDK/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


Rigidbody m_AgentRb;
Rigidbody m_BallRb;
float m_InvertMult;
ResetParameters m_ResetParams;
IFloatProperties m_ResetParams;
// Looks for the scoreboard based on the name of the gameObjects.
// Do not modify the names of the Score GameObjects

var canvas = GameObject.Find(k_CanvasName);
GameObject scoreBoard;
var academy = FindObjectOfType<Academy>();
m_ResetParams = academy.resetParameters;
m_ResetParams = academy.FloatProperties;
if (invertX)
{
scoreBoard = canvas.transform.Find(k_ScoreBoardBName).gameObject;

public void SetRacket()
{
angle = m_ResetParams["angle"];
angle = m_ResetParams.GetPropertyWithDefault("angle", 55);
gameObject.transform.eulerAngles = new Vector3(
gameObject.transform.eulerAngles.x,
gameObject.transform.eulerAngles.y,

public void SetBall()
{
scale = m_ResetParams["scale"];
scale = m_ResetParams.GetPropertyWithDefault("scale", 1);
ball.transform.localScale = new Vector3(scale, scale, scale);
}

6
UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAcademy.cs


Physics.defaultSolverVelocityIterations = 12;
Time.fixedDeltaTime = 0.01333f; // ~75 fps; the Unity default is 0.02 (50 fps)
Time.maximumDeltaTime = .15f; // Default is .33
}
public override void AcademyReset()
{
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
FloatProperties.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}
public override void AcademyStep()

14
UnitySDK/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs


public class WalkerAgent : Agent
{
[Header("Specific to Walker")][Header("Target To Walk Towards")][Space(10)]
[Header("Specific to Walker")]
[Header("Target To Walk Towards")]
[Space(10)]
public Transform target;
Vector3 m_DirToTarget;

Rigidbody m_ChestRb;
Rigidbody m_SpineRb;
ResetParameters m_ResetParams;
IFloatProperties m_ResetParams;
public override void InitializeAgent()
{

m_SpineRb = spine.GetComponent<Rigidbody>();
var academy = FindObjectOfType<WalkerAcademy>();
m_ResetParams = academy.resetParameters;
m_ResetParams = academy.FloatProperties;
SetResetParameters();
}

public void SetTorsoMass()
{
m_ChestRb.mass = m_ResetParams["chest_mass"];
m_SpineRb.mass = m_ResetParams["spine_mass"];
m_HipsRb.mass = m_ResetParams["hip_mass"];
m_ChestRb.mass = m_ResetParams.GetPropertyWithDefault("chest_mass", 8);
m_SpineRb.mass = m_ResetParams.GetPropertyWithDefault("spine_mass", 10);
m_HipsRb.mass = m_ResetParams.GetPropertyWithDefault("hip_mass", 15);
}
public void SetResetParameters()

11
UnitySDK/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs


{
localScale = new Vector3(
localScale.x,
m_Academy.resetParameters["no_wall_height"],
m_Academy.FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
localScale.z);
wall.transform.localScale = localScale;
GiveModel("SmallWallJump", noWallBrain);

localScale = new Vector3(
localScale.x,
m_Academy.resetParameters["small_wall_height"],
m_Academy.FloatProperties.GetPropertyWithDefault("small_wall_height", 4),
localScale.z);
wall.transform.localScale = localScale;
GiveModel("SmallWallJump", smallWallBrain);

var height =
m_Academy.resetParameters["big_wall_min_height"] +
Random.value * (m_Academy.resetParameters["big_wall_max_height"] -
m_Academy.resetParameters["big_wall_min_height"]);
var min = m_Academy.FloatProperties.GetPropertyWithDefault("big_wall_min_height", 8);
var max = m_Academy.FloatProperties.GetPropertyWithDefault("big_wall_max_height", 8);
var height = min + Random.value * (max - min);
localScale = new Vector3(
localScale.x,
height,

197
UnitySDK/Assets/ML-Agents/Scripts/Academy.cs


namespace MLAgents
{
/// <summary>
/// Wraps the environment-level parameters that are provided within the
/// Editor. These parameters can be provided for training and inference
/// modes separately and represent screen resolution, rendering quality and
/// frame rate.
/// </summary>
[System.Serializable]
public class EnvironmentConfiguration
{
[Tooltip("Width of the environment window in pixels.")]
public int width;
[Tooltip("Height of the environment window in pixels.")]
public int height;
[Tooltip("Rendering quality of environment. (Higher is better quality.)")]
[Range(0, 5)]
public int qualityLevel;
[Tooltip("Speed at which environment is run. (Higher is faster.)")]
[Range(1f, 100f)]
public float timeScale;
[Tooltip("Frames per second (FPS) engine attempts to maintain.")]
public int targetFrameRate;
/// Initializes a new instance of the
/// <see cref="EnvironmentConfiguration"/> class.
/// <param name="width">Width of environment window (pixels).</param>
/// <param name="height">Height of environment window (pixels).</param>
/// <param name="qualityLevel">
/// Rendering quality of environment. Ranges from 0 to 5, with higher
/// values corresponding to better quality.
/// </param>
/// <param name="timeScale">
/// Speed at which environment is run. Ranges from 1 to 100, with higher
/// values representing faster speed.
/// </param>
/// <param name="targetFrameRate">
/// Target frame rate (per second) that the engine tries to maintain.
/// </param>
public EnvironmentConfiguration(
int width, int height, int qualityLevel,
float timeScale, int targetFrameRate)
{
this.width = width;
this.height = height;
this.qualityLevel = qualityLevel;
this.timeScale = timeScale;
this.targetFrameRate = targetFrameRate;
}
}
/// <summary>
/// An Academy is where Agent objects go to train their behaviors.

/// Used to restore original value when deriving Academy modifies it
float m_OriginalMaximumDeltaTime;
// Fields provided in the Inspector
public IFloatProperties FloatProperties;
[FormerlySerializedAs("trainingConfiguration")]
[SerializeField]
[Tooltip("The engine-level settings which correspond to rendering " +
"quality and engine speed during Training.")]
EnvironmentConfiguration m_TrainingConfiguration =
new EnvironmentConfiguration(80, 80, 1, 100.0f, -1);
[FormerlySerializedAs("inferenceConfiguration")]
[SerializeField]
[Tooltip("The engine-level settings which correspond to rendering " +
"quality and engine speed during Inference.")]
EnvironmentConfiguration m_InferenceConfiguration =
new EnvironmentConfiguration(1280, 720, 5, 1.0f, 60);
/// <summary>
/// Contains a mapping from parameter names to float values. They are
/// used in <see cref="AcademyReset"/> and <see cref="AcademyStep"/>
/// to modify elements in the environment at reset time.
/// </summary>
/// <remarks>
/// Default reset parameters are specified in the academy Editor, and can
/// be modified when training by passing a config
/// dictionary at reset.
/// </remarks>
[SerializeField]
[Tooltip("List of custom parameters that can be changed in the " +
"environment when it resets.")]
public ResetParameters resetParameters;
public CommunicatorObjects.CustomResetParametersProto customResetParameters;
// Fields not provided in the Inspector.

get { return Communicator != null; }
}
/// If true, the Academy will use inference settings. This field is
/// initialized in <see cref="Awake"/> depending on the presence
/// or absence of a communicator. Furthermore, it can be modified during
/// training via <see cref="SetIsInference"/>.
bool m_IsInference = true;
/// The number of episodes completed by the environment. Incremented
/// each time the environment is reset.
int m_EpisodeCount;

/// The total number of steps completed during the whole simulation. Incremented
/// each time a step is taken in the environment.
int m_TotalStepCount;
/// Flag that indicates whether the inference/training mode of the
/// environment was switched by the training process. This impacts the
/// engine settings at the next environment step.
bool m_ModeSwitched;
/// Pointer to the communicator currently in use by the Academy.
public ICommunicator Communicator;

m_OriginalFixedDeltaTime = Time.fixedDeltaTime;
m_OriginalMaximumDeltaTime = Time.maximumDeltaTime;
var floatProperties = new FloatPropertiesChannel();
FloatProperties = floatProperties;
// Try to launch the communicator by using the arguments passed at launch
try

if (Communicator != null)
{
Communicator.RegisterSideChannel(new EngineConfigurationChannel());
Communicator.RegisterSideChannel(floatProperties);
// We try to exchange the first message with Python. If this fails, it means
// no Python Process is ready to train the environment. In this case, the
// environment must use Inference.

{
version = k_ApiVersion,
name = gameObject.name,
environmentResetParameters = new EnvironmentResetParameters
{
resetParameters = resetParameters,
customResetParameters = customResetParameters
}
});
Random.InitState(unityRLInitParameters.seed);
}

{
Communicator.QuitCommandReceived += OnQuitCommandReceived;
Communicator.ResetCommandReceived += OnResetCommand;
Communicator.RLInputReceived += OnRLInputReceived;
}
}

SetIsInference(!IsCommunicatorOn);
DecideAction += () => { };
DestroyAction += () => { };

AgentAct += () => { };
AgentForceReset += () => { };
ConfigureEnvironment();
}
static void OnQuitCommandReceived()

Application.Quit();
}
void OnResetCommand(EnvironmentResetParameters newResetParameters)
void OnResetCommand()
UpdateResetParameters(newResetParameters);
void OnRLInputReceived(UnityRLInputParameters inputParams)
{
m_IsInference = !inputParams.isTraining;
}
void UpdateResetParameters(EnvironmentResetParameters newResetParameters)
{
if (newResetParameters.resetParameters != null)
{
foreach (var kv in newResetParameters.resetParameters)
{
resetParameters[kv.Key] = kv.Value;
}
}
customResetParameters = newResetParameters.customResetParameters;
}
/// <summary>
/// Configures the environment settings depending on the training/inference
/// mode and the corresponding parameters passed in the Editor.
/// </summary>
void ConfigureEnvironment()
{
if (m_IsInference)
{
ConfigureEnvironmentHelper(m_InferenceConfiguration);
Monitor.SetActive(true);
}
else
{
ConfigureEnvironmentHelper(m_TrainingConfiguration);
Monitor.SetActive(false);
}
}
/// <summary>
/// Helper method for initializing the environment based on the provided
/// configuration.
/// </summary>
/// <param name="config">
/// Environment configuration (specified in the Editor).
/// </param>
static void ConfigureEnvironmentHelper(EnvironmentConfiguration config)
{
Screen.SetResolution(config.width, config.height, false);
QualitySettings.SetQualityLevel(config.qualityLevel, true);
Time.timeScale = config.timeScale;
Time.captureFramerate = 60;
Application.targetFrameRate = config.targetFrameRate;
}
/// <summary>
/// Initializes the academy and environment. Called during the waking-up
/// phase of the environment before any of the scene objects/agents have

{
}
/// <summary>
/// Returns the <see cref="m_IsInference"/> flag.
/// </summary>
/// <returns>
/// <c>true</c>, if current mode is inference, <c>false</c> if training.
/// </returns>
public bool GetIsInference()
{
return m_IsInference;
}
/// <summary>
/// Sets the <see cref="m_IsInference"/> flag to the provided value. If
/// the new flag differs from the current flag value, this signals that
/// the environment configuration needs to be updated.
/// </summary>
/// <param name="isInference">
/// Environment mode, if true then inference, otherwise training.
/// </param>
public void SetIsInference(bool isInference)
{
if (m_IsInference != isInference)
{
m_IsInference = isInference;
// This signals to the academy that at the next environment step
// the engine configurations need updating to the respective mode
// (i.e. training vs inference) configuration.
m_ModeSwitched = true;
}
}
/// <summary>
/// Returns the current episode counter.

/// </summary>
void EnvironmentStep()
{
if (m_ModeSwitched)
{
ConfigureEnvironment();
m_ModeSwitched = false;
}
if (!m_FirstAcademyReset)
{
ForcedFullReset();
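In the Academy.cs hunk above, the Academy now exposes FloatProperties, an IFloatProperties instance backed by the FloatPropertiesChannel side channel it registers with the communicator, and the example scenes read their tunable values through it. The sketch below is purely illustrative (the real FloatPropertiesChannel source is not part of this diff); it only shows the data structure the new API implies, a string-to-float map plus per-key callbacks:

using System;
using System.Collections.Generic;

// Illustrative stand-in for FloatPropertiesChannel, not the actual implementation.
public class FloatPropertiesSketch
{
    readonly Dictionary<string, float> m_Values = new Dictionary<string, float>();
    readonly Dictionary<string, Action<float>> m_Callbacks = new Dictionary<string, Action<float>>();

    public float GetPropertyWithDefault(string key, float defaultValue)
    {
        float value;
        return m_Values.TryGetValue(key, out value) ? value : defaultValue;
    }

    public void RegisterCallback(string key, Action<float> action)
    {
        m_Callbacks[key] = action;
    }

    // Would be driven by messages arriving from Python over the side channel.
    public void Set(string key, float value)
    {
        m_Values[key] = value;
        Action<float> action;
        if (m_Callbacks.TryGetValue(key, out action))
        {
            action(value);
        }
    }
}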

52
UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlInitializationOutput.cs


"CkdtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X3Js",
"X2luaXRpYWxpemF0aW9uX291dHB1dC5wcm90bxIUY29tbXVuaWNhdG9yX29i",
"amVjdHMaOW1sYWdlbnRzL2VudnMvY29tbXVuaWNhdG9yX29iamVjdHMvYnJh",
"aW5fcGFyYW1ldGVycy5wcm90bxo/bWxhZ2VudHMvZW52cy9jb21tdW5pY2F0",
"b3Jfb2JqZWN0cy9lbnZpcm9ubWVudF9wYXJhbWV0ZXJzLnByb3RvIusBCiBV",
"bml0eVJMSW5pdGlhbGl6YXRpb25PdXRwdXRQcm90bxIMCgRuYW1lGAEgASgJ",
"Eg8KB3ZlcnNpb24YAiABKAkSEAoIbG9nX3BhdGgYAyABKAkSRAoQYnJhaW5f",
"cGFyYW1ldGVycxgFIAMoCzIqLmNvbW11bmljYXRvcl9vYmplY3RzLkJyYWlu",
"UGFyYW1ldGVyc1Byb3RvElAKFmVudmlyb25tZW50X3BhcmFtZXRlcnMYBiAB",
"KAsyMC5jb21tdW5pY2F0b3Jfb2JqZWN0cy5FbnZpcm9ubWVudFBhcmFtZXRl",
"cnNQcm90b0IfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJv",
"dG8z"));
"aW5fcGFyYW1ldGVycy5wcm90byKfAQogVW5pdHlSTEluaXRpYWxpemF0aW9u",
"T3V0cHV0UHJvdG8SDAoEbmFtZRgBIAEoCRIPCgd2ZXJzaW9uGAIgASgJEhAK",
"CGxvZ19wYXRoGAMgASgJEkQKEGJyYWluX3BhcmFtZXRlcnMYBSADKAsyKi5j",
"b21tdW5pY2F0b3Jfb2JqZWN0cy5CcmFpblBhcmFtZXRlcnNQcm90b0oECAYQ",
"B0IfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.BrainParametersReflection.Descriptor, global::MLAgents.CommunicatorObjects.EnvironmentParametersReflection.Descriptor, },
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.BrainParametersReflection.Descriptor, },
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInitializationOutputProto), global::MLAgents.CommunicatorObjects.UnityRLInitializationOutputProto.Parser, new[]{ "Name", "Version", "LogPath", "BrainParameters", "EnvironmentParameters" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInitializationOutputProto), global::MLAgents.CommunicatorObjects.UnityRLInitializationOutputProto.Parser, new[]{ "Name", "Version", "LogPath", "BrainParameters" }, null, null, null)
}));
}
#endregion

version_ = other.version_;
logPath_ = other.logPath_;
brainParameters_ = other.brainParameters_.Clone();
EnvironmentParameters = other.environmentParameters_ != null ? other.EnvironmentParameters.Clone() : null;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

get { return brainParameters_; }
}
/// <summary>Field number for the "environment_parameters" field.</summary>
public const int EnvironmentParametersFieldNumber = 6;
private global::MLAgents.CommunicatorObjects.EnvironmentParametersProto environmentParameters_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public global::MLAgents.CommunicatorObjects.EnvironmentParametersProto EnvironmentParameters {
get { return environmentParameters_; }
set {
environmentParameters_ = value;
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as UnityRLInitializationOutputProto);

if (Version != other.Version) return false;
if (LogPath != other.LogPath) return false;
if(!brainParameters_.Equals(other.brainParameters_)) return false;
if (!object.Equals(EnvironmentParameters, other.EnvironmentParameters)) return false;
return Equals(_unknownFields, other._unknownFields);
}

if (Version.Length != 0) hash ^= Version.GetHashCode();
if (LogPath.Length != 0) hash ^= LogPath.GetHashCode();
hash ^= brainParameters_.GetHashCode();
if (environmentParameters_ != null) hash ^= EnvironmentParameters.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

output.WriteString(LogPath);
}
brainParameters_.WriteTo(output, _repeated_brainParameters_codec);
if (environmentParameters_ != null) {
output.WriteRawTag(50);
output.WriteMessage(EnvironmentParameters);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

size += 1 + pb::CodedOutputStream.ComputeStringSize(LogPath);
}
size += brainParameters_.CalculateSize(_repeated_brainParameters_codec);
if (environmentParameters_ != null) {
size += 1 + pb::CodedOutputStream.ComputeMessageSize(EnvironmentParameters);
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

LogPath = other.LogPath;
}
brainParameters_.Add(other.brainParameters_);
if (other.environmentParameters_ != null) {
if (environmentParameters_ == null) {
environmentParameters_ = new global::MLAgents.CommunicatorObjects.EnvironmentParametersProto();
}
EnvironmentParameters.MergeFrom(other.EnvironmentParameters);
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

}
case 42: {
brainParameters_.AddEntriesFrom(input, _repeated_brainParameters_codec);
break;
}
case 50: {
if (environmentParameters_ == null) {
environmentParameters_ = new global::MLAgents.CommunicatorObjects.EnvironmentParametersProto();
}
input.ReadMessage(environmentParameters_);
break;
}
}

119
UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlInput.cs


"CjdtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X3Js",
"X2lucHV0LnByb3RvEhRjb21tdW5pY2F0b3Jfb2JqZWN0cxo1bWxhZ2VudHMv",
"ZW52cy9jb21tdW5pY2F0b3Jfb2JqZWN0cy9hZ2VudF9hY3Rpb24ucHJvdG8a",
"P21sYWdlbnRzL2VudnMvY29tbXVuaWNhdG9yX29iamVjdHMvZW52aXJvbm1l",
"bnRfcGFyYW1ldGVycy5wcm90bxowbWxhZ2VudHMvZW52cy9jb21tdW5pY2F0",
"b3Jfb2JqZWN0cy9jb21tYW5kLnByb3RvIsMDChFVbml0eVJMSW5wdXRQcm90",
"bxJQCg1hZ2VudF9hY3Rpb25zGAEgAygLMjkuY29tbXVuaWNhdG9yX29iamVj",
"dHMuVW5pdHlSTElucHV0UHJvdG8uQWdlbnRBY3Rpb25zRW50cnkSUAoWZW52",
"aXJvbm1lbnRfcGFyYW1ldGVycxgCIAEoCzIwLmNvbW11bmljYXRvcl9vYmpl",
"Y3RzLkVudmlyb25tZW50UGFyYW1ldGVyc1Byb3RvEhMKC2lzX3RyYWluaW5n",
"GAMgASgIEjMKB2NvbW1hbmQYBCABKA4yIi5jb21tdW5pY2F0b3Jfb2JqZWN0",
"cy5Db21tYW5kUHJvdG8aTQoUTGlzdEFnZW50QWN0aW9uUHJvdG8SNQoFdmFs",
"dWUYASADKAsyJi5jb21tdW5pY2F0b3Jfb2JqZWN0cy5BZ2VudEFjdGlvblBy",
"b3RvGnEKEUFnZW50QWN0aW9uc0VudHJ5EgsKA2tleRgBIAEoCRJLCgV2YWx1",
"ZRgCIAEoCzI8LmNvbW11bmljYXRvcl9vYmplY3RzLlVuaXR5UkxJbnB1dFBy",
"b3RvLkxpc3RBZ2VudEFjdGlvblByb3RvOgI4AUIfqgIcTUxBZ2VudHMuQ29t",
"bXVuaWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
"MG1sYWdlbnRzL2VudnMvY29tbXVuaWNhdG9yX29iamVjdHMvY29tbWFuZC5w",
"cm90byL+AgoRVW5pdHlSTElucHV0UHJvdG8SUAoNYWdlbnRfYWN0aW9ucxgB",
"IAMoCzI5LmNvbW11bmljYXRvcl9vYmplY3RzLlVuaXR5UkxJbnB1dFByb3Rv",
"LkFnZW50QWN0aW9uc0VudHJ5EjMKB2NvbW1hbmQYBCABKA4yIi5jb21tdW5p",
"Y2F0b3Jfb2JqZWN0cy5Db21tYW5kUHJvdG8SFAoMc2lkZV9jaGFubmVsGAUg",
"ASgMGk0KFExpc3RBZ2VudEFjdGlvblByb3RvEjUKBXZhbHVlGAEgAygLMiYu",
"Y29tbXVuaWNhdG9yX29iamVjdHMuQWdlbnRBY3Rpb25Qcm90bxpxChFBZ2Vu",
"dEFjdGlvbnNFbnRyeRILCgNrZXkYASABKAkSSwoFdmFsdWUYAiABKAsyPC5j",
"b21tdW5pY2F0b3Jfb2JqZWN0cy5Vbml0eVJMSW5wdXRQcm90by5MaXN0QWdl",
"bnRBY3Rpb25Qcm90bzoCOAFKBAgCEANKBAgDEARCH6oCHE1MQWdlbnRzLkNv",
"bW11bmljYXRvck9iamVjdHNiBnByb3RvMw=="));
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.AgentActionReflection.Descriptor, global::MLAgents.CommunicatorObjects.EnvironmentParametersReflection.Descriptor, global::MLAgents.CommunicatorObjects.CommandReflection.Descriptor, },
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.AgentActionReflection.Descriptor, global::MLAgents.CommunicatorObjects.CommandReflection.Descriptor, },
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInputProto), global::MLAgents.CommunicatorObjects.UnityRLInputProto.Parser, new[]{ "AgentActions", "EnvironmentParameters", "IsTraining", "Command" }, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInputProto.Types.ListAgentActionProto), global::MLAgents.CommunicatorObjects.UnityRLInputProto.Types.ListAgentActionProto.Parser, new[]{ "Value" }, null, null, null),
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInputProto), global::MLAgents.CommunicatorObjects.UnityRLInputProto.Parser, new[]{ "AgentActions", "Command", "SideChannel" }, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLInputProto.Types.ListAgentActionProto), global::MLAgents.CommunicatorObjects.UnityRLInputProto.Types.ListAgentActionProto.Parser, new[]{ "Value" }, null, null, null),
null, })
}));
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public UnityRLInputProto(UnityRLInputProto other) : this() {
agentActions_ = other.agentActions_.Clone();
EnvironmentParameters = other.environmentParameters_ != null ? other.EnvironmentParameters.Clone() : null;
isTraining_ = other.isTraining_;
sideChannel_ = other.sideChannel_;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

get { return agentActions_; }
}
/// <summary>Field number for the "environment_parameters" field.</summary>
public const int EnvironmentParametersFieldNumber = 2;
private global::MLAgents.CommunicatorObjects.EnvironmentParametersProto environmentParameters_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public global::MLAgents.CommunicatorObjects.EnvironmentParametersProto EnvironmentParameters {
get { return environmentParameters_; }
set {
environmentParameters_ = value;
}
}
/// <summary>Field number for the "is_training" field.</summary>
public const int IsTrainingFieldNumber = 3;
private bool isTraining_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public bool IsTraining {
get { return isTraining_; }
set {
isTraining_ = value;
}
}
/// <summary>Field number for the "command" field.</summary>
public const int CommandFieldNumber = 4;
private global::MLAgents.CommunicatorObjects.CommandProto command_ = 0;

}
}
/// <summary>Field number for the "side_channel" field.</summary>
public const int SideChannelFieldNumber = 5;
private pb::ByteString sideChannel_ = pb::ByteString.Empty;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pb::ByteString SideChannel {
get { return sideChannel_; }
set {
sideChannel_ = pb::ProtoPreconditions.CheckNotNull(value, "value");
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as UnityRLInputProto);

return true;
}
if (!AgentActions.Equals(other.AgentActions)) return false;
if (!object.Equals(EnvironmentParameters, other.EnvironmentParameters)) return false;
if (IsTraining != other.IsTraining) return false;
if (SideChannel != other.SideChannel) return false;
return Equals(_unknownFields, other._unknownFields);
}

hash ^= AgentActions.GetHashCode();
if (environmentParameters_ != null) hash ^= EnvironmentParameters.GetHashCode();
if (IsTraining != false) hash ^= IsTraining.GetHashCode();
if (SideChannel.Length != 0) hash ^= SideChannel.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void WriteTo(pb::CodedOutputStream output) {
agentActions_.WriteTo(output, _map_agentActions_codec);
if (environmentParameters_ != null) {
output.WriteRawTag(18);
output.WriteMessage(EnvironmentParameters);
}
if (IsTraining != false) {
output.WriteRawTag(24);
output.WriteBool(IsTraining);
}
}
if (SideChannel.Length != 0) {
output.WriteRawTag(42);
output.WriteBytes(SideChannel);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);

public int CalculateSize() {
int size = 0;
size += agentActions_.CalculateSize(_map_agentActions_codec);
if (environmentParameters_ != null) {
size += 1 + pb::CodedOutputStream.ComputeMessageSize(EnvironmentParameters);
}
if (IsTraining != false) {
size += 1 + 1;
}
if (SideChannel.Length != 0) {
size += 1 + pb::CodedOutputStream.ComputeBytesSize(SideChannel);
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

return;
}
agentActions_.Add(other.agentActions_);
if (other.environmentParameters_ != null) {
if (environmentParameters_ == null) {
environmentParameters_ = new global::MLAgents.CommunicatorObjects.EnvironmentParametersProto();
}
EnvironmentParameters.MergeFrom(other.EnvironmentParameters);
}
if (other.IsTraining != false) {
IsTraining = other.IsTraining;
}
}
if (other.SideChannel.Length != 0) {
SideChannel = other.SideChannel;
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

agentActions_.AddEntriesFrom(input, _map_agentActions_codec);
break;
}
case 18: {
if (environmentParameters_ == null) {
environmentParameters_ = new global::MLAgents.CommunicatorObjects.EnvironmentParametersProto();
}
input.ReadMessage(environmentParameters_);
break;
}
case 24: {
IsTraining = input.ReadBool();
case 32: {
command_ = (global::MLAgents.CommunicatorObjects.CommandProto) input.ReadEnum();
case 32: {
command_ = (global::MLAgents.CommunicatorObjects.CommandProto) input.ReadEnum();
case 42: {
SideChannel = input.ReadBytes();
break;
}
}

44
UnitySDK/Assets/ML-Agents/Scripts/Grpc/CommunicatorObjects/UnityRlOutput.cs


string.Concat(
"CjhtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X3Js",
"X291dHB1dC5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMaM21sYWdlbnRz",
"L2VudnMvY29tbXVuaWNhdG9yX29iamVjdHMvYWdlbnRfaW5mby5wcm90byKj",
"L2VudnMvY29tbXVuaWNhdG9yX29iamVjdHMvYWdlbnRfaW5mby5wcm90byK5",
"bmZvc0VudHJ5GkkKEkxpc3RBZ2VudEluZm9Qcm90bxIzCgV2YWx1ZRgBIAMo",
"CzIkLmNvbW11bmljYXRvcl9vYmplY3RzLkFnZW50SW5mb1Byb3RvGm4KD0Fn",
"ZW50SW5mb3NFbnRyeRILCgNrZXkYASABKAkSSgoFdmFsdWUYAiABKAsyOy5j",
"b21tdW5pY2F0b3Jfb2JqZWN0cy5Vbml0eVJMT3V0cHV0UHJvdG8uTGlzdEFn",
"ZW50SW5mb1Byb3RvOgI4AUoECAEQAkIfqgIcTUxBZ2VudHMuQ29tbXVuaWNh",
"dG9yT2JqZWN0c2IGcHJvdG8z"));
"bmZvc0VudHJ5EhQKDHNpZGVfY2hhbm5lbBgDIAEoDBpJChJMaXN0QWdlbnRJ",
"bmZvUHJvdG8SMwoFdmFsdWUYASADKAsyJC5jb21tdW5pY2F0b3Jfb2JqZWN0",
"cy5BZ2VudEluZm9Qcm90bxpuCg9BZ2VudEluZm9zRW50cnkSCwoDa2V5GAEg",
"ASgJEkoKBXZhbHVlGAIgASgLMjsuY29tbXVuaWNhdG9yX29iamVjdHMuVW5p",
"dHlSTE91dHB1dFByb3RvLkxpc3RBZ2VudEluZm9Qcm90bzoCOAFKBAgBEAJC",
"H6oCHE1MQWdlbnRzLkNvbW11bmljYXRvck9iamVjdHNiBnByb3RvMw=="));
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLOutputProto), global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Parser, new[]{ "AgentInfos" }, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Types.ListAgentInfoProto), global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Types.ListAgentInfoProto.Parser, new[]{ "Value" }, null, null, null),
new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLOutputProto), global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Parser, new[]{ "AgentInfos", "SideChannel" }, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Types.ListAgentInfoProto), global::MLAgents.CommunicatorObjects.UnityRLOutputProto.Types.ListAgentInfoProto.Parser, new[]{ "Value" }, null, null, null),
null, })
}));
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public UnityRLOutputProto(UnityRLOutputProto other) : this() {
agentInfos_ = other.agentInfos_.Clone();
sideChannel_ = other.sideChannel_;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

get { return agentInfos_; }
}
/// <summary>Field number for the "side_channel" field.</summary>
public const int SideChannelFieldNumber = 3;
private pb::ByteString sideChannel_ = pb::ByteString.Empty;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pb::ByteString SideChannel {
get { return sideChannel_; }
set {
sideChannel_ = pb::ProtoPreconditions.CheckNotNull(value, "value");
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as UnityRLOutputProto);

return true;
}
if (!AgentInfos.Equals(other.AgentInfos)) return false;
if (SideChannel != other.SideChannel) return false;
return Equals(_unknownFields, other._unknownFields);
}

hash ^= AgentInfos.GetHashCode();
if (SideChannel.Length != 0) hash ^= SideChannel.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void WriteTo(pb::CodedOutputStream output) {
agentInfos_.WriteTo(output, _map_agentInfos_codec);
if (SideChannel.Length != 0) {
output.WriteRawTag(26);
output.WriteBytes(SideChannel);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

public int CalculateSize() {
int size = 0;
size += agentInfos_.CalculateSize(_map_agentInfos_codec);
if (SideChannel.Length != 0) {
size += 1 + pb::CodedOutputStream.ComputeBytesSize(SideChannel);
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

return;
}
agentInfos_.Add(other.agentInfos_);
if (other.SideChannel.Length != 0) {
SideChannel = other.SideChannel;
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

break;
case 18: {
agentInfos_.AddEntriesFrom(input, _map_agentInfos_codec);
break;
}
case 26: {
SideChannel = input.ReadBytes();
break;
}
}

23
UnitySDK/Assets/ML-Agents/Scripts/Grpc/GrpcExtensions.cs


return bp;
}
/// <summary>
/// Convert a MapField to ResetParameters.
/// </summary>
/// <param name="floatParams">The mapping of strings to floats from a protobuf MapField.</param>
/// <returns></returns>
public static ResetParameters ToResetParameters(this MapField<string, float> floatParams)
{
return new ResetParameters(floatParams);
}
/// <summary>
/// Convert an EnvironmentParametersProto protobuf object to an EnvironmentResetParameters struct.
/// </summary>
/// <param name="epp">The instance of the EnvironmentParametersProto object.</param>
/// <returns>A new EnvironmentResetParameters struct.</returns>
public static EnvironmentResetParameters ToEnvironmentResetParameters(this EnvironmentParametersProto epp)
{
return new EnvironmentResetParameters
{
resetParameters = epp.FloatParameters?.ToResetParameters(),
customResetParameters = epp.CustomResetParameters
};
}
public static UnityRLInitParameters ToUnityRLInitParameters(this UnityRLInitializationInputProto inputProto)
{

126
UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs


using System.Linq;
using UnityEngine;
using MLAgents.CommunicatorObjects;
using System.IO;
using Google.Protobuf;
namespace MLAgents
{

public event QuitCommandHandler QuitCommandReceived;
public event ResetCommandHandler ResetCommandReceived;
public event RLInputReceivedHandler RLInputReceived;
/// If true, the communication is active.
bool m_IsOpen;

/// The communicator parameters sent at construction
CommunicatorInitParameters m_CommunicatorInitParameters;
Dictionary<int, SideChannel> m_SideChannels = new Dictionary<int, SideChannel>();
/// <summary>
/// Initializes a new instance of the RPCCommunicator class.
/// </summary>

Version = initParameters.version
};
academyParameters.EnvironmentParameters = new EnvironmentParametersProto();
var resetParameters = initParameters.environmentResetParameters.resetParameters;
foreach (var key in resetParameters.Keys)
{
academyParameters.EnvironmentParameters.FloatParameters.Add(key, resetParameters[key]);
}
UnityInputProto input;
UnityInputProto initializationInput;
try

void UpdateEnvironmentWithInput(UnityRLInputProto rlInput)
{
SendRLInputReceivedEvent(rlInput.IsTraining);
SendCommandEvent(rlInput.Command, rlInput.EnvironmentParameters);
ProcessSideChannelData(m_SideChannels, rlInput.SideChannel.ToArray());
SendCommandEvent(rlInput.Command);
}
UnityInputProto Initialize(UnityOutputProto unityOutput,

#region Sending Events
void SendCommandEvent(CommandProto command, EnvironmentParametersProto environmentParametersProto)
void SendCommandEvent(CommandProto command)
{
switch (command)
{

}
case CommandProto.Reset:
{
ResetCommandReceived?.Invoke(environmentParametersProto.ToEnvironmentResetParameters());
ResetCommandReceived?.Invoke();
return;
}
default:

}
}
void SendRLInputReceivedEvent(bool isTraining)
{
RLInputReceived?.Invoke(new UnityRLInputParameters { isTraining = isTraining });
}
#endregion
#region Sending and retrieving data

{
message.RlInitializationOutput = tempUnityRlInitializationOutput;
}
byte[] messageAggregated = GetSideChannelMessage(m_SideChannels);
message.RlOutput.SideChannel = ByteString.CopyFrom(messageAggregated);
var input = Exchange(message);
UpdateSentBrainParameters(tempUnityRlInitializationOutput);

{
m_SentBrainKeys.Add(brainProto.BrainName);
m_UnsentBrainKeys.Remove(brainProto.BrainName);
}
}
#endregion
#region Handling side channels
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
public void RegisterSideChannel(SideChannel sideChannel)
{
if (m_SideChannels.ContainsKey(sideChannel.ChannelType()))
{
throw new UnityAgentsException(string.Format(
"A side channel with type index {0} is already registered. You cannot register multiple " +
"side channels of the same type.", sideChannel.ChannelType()));
}
m_SideChannels.Add(sideChannel.ChannelType(), sideChannel);
}
/// <summary>
/// Grabs the messages that the registered side channels will send to Python at the current step
/// into a single byte array.
/// </summary>
/// <param name="sideChannels"> A dictionary of channel type to channel.</param>
/// <returns></returns>
public static byte[] GetSideChannelMessage(Dictionary<int, SideChannel> sideChannels)
{
using (var memStream = new MemoryStream())
{
using (var binaryWriter = new BinaryWriter(memStream))
{
foreach (var sideChannel in sideChannels.Values)
{
var messageList = sideChannel.MessageQueue;
foreach (var message in messageList)
{
binaryWriter.Write(sideChannel.ChannelType());
binaryWriter.Write(message.Count());
binaryWriter.Write(message);
}
sideChannel.MessageQueue.Clear();
}
return memStream.ToArray();
}
}
}
/// <summary>
/// Separates the data received from Python into individual messages for each registered side channel.
/// </summary>
/// <param name="sideChannels">A dictionary of channel type to channel.</param>
/// <param name="dataReceived">The byte array of data received from Python.</param>
public static void ProcessSideChannelData(Dictionary<int, SideChannel> sideChannels, byte[] dataReceived)
{
if (dataReceived.Length == 0)
{
return;
}
using (var memStream = new MemoryStream(dataReceived))
{
using (var binaryReader = new BinaryReader(memStream))
{
while (memStream.Position < memStream.Length)
{
int channelType = 0;
byte[] message = null;
try
{
channelType = binaryReader.ReadInt32();
var messageLength = binaryReader.ReadInt32();
message = binaryReader.ReadBytes(messageLength);
}
catch (Exception ex)
{
throw new UnityAgentsException(
"There was a problem reading a message in a SideChannel. Please make sure the " +
"version of MLAgents in Unity is compatible with the Python version. Original error : "
+ ex.Message);
}
if (sideChannels.ContainsKey(channelType))
{
sideChannels[channelType].OnMessageReceived(message);
}
else
{
Debug.Log(string.Format(
"Unknown side channel data received. Channel type: {0}", channelType));
}
}
}
}
}
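GetSideChannelMessage and ProcessSideChannelData above define the wire format: each queued message is written as the channel type (Int32), the message length (Int32), and then the raw bytes, and the reader dispatches each payload to the channel registered under that type. A channel that plugs into this needs only a type id, an incoming handler, and an outgoing queue. The sketch below is an assumption-laden illustration (the channel id, the string payload, and the override modifiers are guesses; only ChannelType(), OnMessageReceived(byte[]), and MessageQueue appear in the diff):

using System.Collections.Generic;
using System.Text;
using UnityEngine;
using MLAgents;

// Hypothetical side channel used only to illustrate the framing above.
public class StringLogSideChannel : SideChannel
{
    public override int ChannelType()
    {
        // Must be unique among registered channels; RegisterSideChannel rejects duplicates.
        return 1000;
    }

    public override void OnMessageReceived(byte[] data)
    {
        // Called by ProcessSideChannelData for every payload framed with this channel type.
        Debug.Log(Encoding.ASCII.GetString(data));
    }

    public void SendString(string message)
    {
        // Queued bytes are drained and framed by GetSideChannelMessage on the next exchange.
        MessageQueue.Add(Encoding.ASCII.GetBytes(message));
    }
}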

34
UnitySDK/Assets/ML-Agents/Scripts/ICommunicator.cs


namespace MLAgents
{
public struct EnvironmentResetParameters
{
/// <summary>
/// Mapping of string : float which defines which parameters can be
/// reset from python.
/// </summary>
public ResetParameters resetParameters;
/// <summary>
/// The protobuf for custom reset parameters.
/// NOTE: This is the last remaining relic of the gRPC protocol
/// in our code. We need to decide how to handle this
/// moving forward.
/// </summary>
public CustomResetParametersProto customResetParameters;
}
public struct CommunicatorInitParameters
{
/// <summary>

/// The version of the Unity SDK.
/// </summary>
public string version;
/// <summary>
/// The set of environment parameters defined by the user that will be sent to the communicator.
/// </summary>
public EnvironmentResetParameters environmentResetParameters;
}
public struct UnityRLInitParameters
{

/// Delegate for handling reset parameter updates sent from the communicator.
/// </summary>
/// <param name="resetParams"></param>
public delegate void ResetCommandHandler(EnvironmentResetParameters resetParams);
public delegate void ResetCommandHandler();
/// <summary>
/// Delegate to handle UnityRLInputParameters updates from the communicator.

event ResetCommandHandler ResetCommandReceived;
/// <summary>
/// Unity RL Input was received by the communicator.
/// </summary>
event RLInputReceivedHandler RLInputReceived;
/// <summary>
/// Sends the academy parameters through the Communicator.
/// Is used by the academy to send the AcademyParameters to the communicator.
/// </summary>

/// <param name="key">A key to identify which actions to get</param>
/// <returns>A dictionary mapping each Agent to the AgentAction stored under the given key.</returns>
Dictionary<Agent, AgentAction> GetActions(string key);
/// <summary>
/// Registers a side channel to the communicator. The side channel will exchange
/// messages with its Python equivalent.
/// </summary>
/// <param name="sideChannel"> The side channel to be registered.</param>
void RegisterSideChannel(SideChannel sideChannel);
}
}

70
UnitySDK/Assets/ML-Agents/Scripts/Timer.cs


// Compile with: csc CRefTest.cs -doc:Results.xml
using System;
using UnityEngine;
using System.Collections.Generic;
using System.IO;

/// </summary>
[DataMember(Name = "children", Order = 999)]
Dictionary<string, TimerNode> m_Children;
/// <summary>
/// Gauge Nodes to measure arbitrary values.
/// </summary>
[DataMember(Name = "gauges", EmitDefaultValue = false)]
Dictionary<string, GaugeNode> m_Gauges;
/// <summary>
/// Custom sampler used to add timings to the profiler.

var currentTicks = m_TotalTicks;
if (m_TickStart != 0)
{
currentTicks += (System.DateTime.Now.Ticks - m_TickStart);
currentTicks += (DateTime.Now.Ticks - m_TickStart);
}
return currentTicks;

set {} // Serialization needs this, but unused.
}
public Dictionary<string, GaugeNode> Gauges
{
get { return m_Gauges; }
}
/// <summary>
/// Total seconds spent in this block, excluding its children.
/// </summary>

// have a sensible value for total time (the running time since reset).
// The root node doesn't have a sampler since that could interfere with the profiler.
m_NumCalls = 1;
m_TickStart = System.DateTime.Now.Ticks;
m_TickStart = DateTime.Now.Ticks;
m_Gauges = new Dictionary<string, GaugeNode>();
}
else
{

public void Begin()
{
m_Sampler?.Begin();
m_TickStart = System.DateTime.Now.Ticks;
m_TickStart = DateTime.Now.Ticks;
}
/// <summary>

{
var elapsed = System.DateTime.Now.Ticks - m_TickStart;
var elapsed = DateTime.Now.Ticks - m_TickStart;
m_TotalTicks += elapsed;
m_TickStart = 0;
m_NumCalls++;

}
/// <summary>
/// Tracks the most recent value of a metric. This is analogous to gauges in statsd.
/// </summary>
[DataContract]
public class GaugeNode
{
[DataMember]
public float value;
[DataMember( Name = "min")]
public float minValue;
[DataMember( Name = "max")]
public float maxValue;
[DataMember]
public uint count;
public GaugeNode(float value)
{
this.value = value;
minValue = value;
maxValue = value;
count = 1;
}
public void Update(float newValue)
{
minValue = Mathf.Min(minValue, newValue);
maxValue = Mathf.Max(maxValue, newValue);
value = newValue;
++count;
}
}
/// <summary>
/// A "stack" of timers that allows for lightweight hierarchical profiling of long-running processes.
/// <example>
/// Example usage:

/// This implements the Singleton pattern (solution 4) as described in
/// https://csharpindepth.com/articles/singleton
/// </remarks>
public class TimerStack : System.IDisposable
public class TimerStack : IDisposable
{
static readonly TimerStack k_Instance = new TimerStack();

public TimerNode RootNode
{
get { return m_RootNode; }
}
public void SetGauge(string name, float value)
{
if (!float.IsNaN(value))
{
GaugeNode gauge;
if (m_RootNode.Gauges.TryGetValue(name, out gauge))
{
gauge.Update(value);
}
else
{
m_RootNode.Gauges[name] = new GaugeNode(value);
}
}
}
void Push(string name)

1
UnitySDK/UnitySDK.sln.DotSettings


<s:Boolean x:Key="/Default/UserDictionary/Words/=protobuf/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Scaler/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=Scriptable/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=statsd/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=stddev/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=vals/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>

38
docs/Basic-Guide.md


page](Learning-Environment-Executable.md) for instructions on how to build and
use an executable.
```console
ml-agents$ mlagents-learn config/trainer_config.yaml --run-id=first-run --train
▄▄▄▓▓▓▓
╓▓▓▓▓▓▓█▓▓▓▓▓
,▄▄▄m▀▀▀' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌
▄▓▓▓▀' ▄▓▓▀ ▓▓▓ ▄▄ ▄▄ ,▄▄ ▄▄▄▄ ,▄▄ ▄▓▓▌▄ ▄▄▄ ,▄▄
▄▓▓▓▀ ▄▓▓▀ ▐▓▓▌ ▓▓▌ ▐▓▓ ▐▓▓▓▀▀▀▓▓▌ ▓▓▓ ▀▓▓▌▀ ^▓▓▌ ╒▓▓▌
▄▓▓▓▓▓▄▄▄▄▄▄▄▄▓▓▓ ▓▀ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▄ ▓▓▌
▀▓▓▓▓▀▀▀▀▀▀▀▀▀▀▓▓▄ ▓▓ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▐▓▓
^█▓▓▓ ▀▓▓▄ ▐▓▓▌ ▓▓▓▓▄▓▓▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▓▄ ▓▓▓▓`
'▀▓▓▓▄ ^▓▓▓ ▓▓▓ └▀▀▀▀ ▀▀ ^▀▀ `▀▀ `▀▀ '▀▀ ▐▓▓▌
▀▀▀▀▓▄▄▄ ▓▓▓▓▓▓, ▓▓▓▓▀
`▀█▓▓▓▓▓▓▓▓▓▌
¬`▀▀▀█▓
INFO:mlagents.learn:{'--curriculum': 'None',
'--docker-target-name': 'Empty',
'--env': 'None',
'--help': False,
'--keep-checkpoints': '5',
'--lesson': '0',
'--load': False,
'--no-graphics': False,
'--num-runs': '1',
'--run-id': 'first-run',
'--save-freq': '50000',
'--seed': '-1',
'--slow': False,
'--train': True,
'--worker-id': '0',
'<trainer-config-path>': 'config/trainer_config.yaml'}
INFO:mlagents.envs:Start training by pressing the Play button in the Unity Editor.
```
**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
Reset Parameters : {}
INFO:mlagents.envs:Connected new brain:
Unity brain name: 3DBallLearning

23
docs/Getting-Started-with-Balance-Ball.md


### Academy
The Academy object for the scene is placed on the Ball3DAcademy GameObject. When
you look at an Academy component in the inspector, you can see several
properties that control how the environment works.
The **Training Configuration** and **Inference Configuration** properties
set the graphics and timescale properties for the Unity application.
The Academy uses the **Training Configuration** during training and the
**Inference Configuration** when not training. (*Inference* means that the
Agent is using a trained model or heuristics or direct control — in other
words, whenever **not** training.)
Typically, you would set a low graphics quality and a timescale greater than `1.0` for the **Training
Configuration**, and a high graphics quality with a timescale of `1.0` for the
**Inference Configuration**.
**Note:** if you want to observe the environment during training, you can adjust
the **Training Configuration** settings to use a larger window and a timescale
closer to 1:1. Be sure to set these parameters back when training in earnest;
otherwise, training can take a very long time.
Another aspect of an environment is the Academy implementation. Since
The Academy object for the scene is placed on the Ball3DAcademy GameObject. Since
the base Academy class is abstract, you must always define a subclass. There are
three functions you can implement, though they are all optional:

To summarize, go to your command line, enter the `ml-agents` directory and type:
```sh
mlagents-learn config/trainer_config.yaml --run-id=<run-identifier> --train
mlagents-learn config/trainer_config.yaml --run-id=<run-identifier> --train --time-scale=100
```
When the message _"Start training by pressing the Play button in the Unity

environment first.
The `--train` flag tells the ML-Agents toolkit to run in training mode.
The `--time-scale=100` option sets the `Time.timeScale` value in Unity.
**Note**: You can train using an executable rather than the Editor. To do so,
follow the instructions in

6
docs/Installation-Windows.md


Conda environment by typing `activate ml-agents`)_:
```sh
git clone https://github.com/Unity-Technologies/ml-agents.git
git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
The `--branch latest_release` option will switch to the tag of the latest stable release.
Omitting that will get the `master` branch which is potentially unstable.
[here](https://github.com/Unity-Technologies/ml-agents/archive/master.zip).
[here](https://github.com/Unity-Technologies/ml-agents/archive/latest_release.zip).
The `UnitySDK` subdirectory contains the Unity Assets to add to your projects.
It also contains many [example environments](Learning-Environment-Examples.md)

4
docs/Installation.md


Once installed, you will want to clone the ML-Agents Toolkit GitHub repository.
```sh
git clone https://github.com/Unity-Technologies/ml-agents.git
git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
The `--branch latest_release` option will switch to the tag of the latest stable release.
Omitting that will get the `master` branch which is potentially unstable.
The `UnitySDK` subdirectory contains the Unity Assets to add to your projects.
It also contains many [example environments](Learning-Environment-Examples.md)

12
docs/Learning-Environment-Design-Academy.md


you want to add elements to the environment at random intervals, you can put the
logic for creating them in the `AcademyStep()` function.
## Academy Properties
![Academy Inspector](images/academy.png)
* `Configuration` - The engine-level settings which correspond to rendering
quality and engine speed.
* `Width` - Width of the environment window in pixels.
* `Height` - Height of the environment window in pixels.
* `Quality Level` - Rendering quality of environment. (Higher is better)
* `Time Scale` - Speed at which environment is run. (Higher is faster)
* `Target Frame Rate` - FPS engine attempts to maintain.
* `Reset Parameters` - List of custom parameters that can be changed in the
environment on reset.

6
docs/Learning-Environment-Design.md


efficient and practical to create a purpose-built training scene.
Both training and testing (or normal game) scenes must contain an Academy object
to control the agent decision making process. The Academy defines several
properties that can be set differently for a training scene versus a regular
scene. The Academy's **Configuration** properties control rendering and time
scale. You can set the **Training Configuration** to minimize the time Unity
spends rendering graphics in order to speed up training.
to control the agent decision making process.
When you create a training environment in Unity, you must set up the scene so
that it can be controlled by the external training process. Considerations
include:

28
docs/Learning-Environment-Examples.md


* Vector Action space: (Discrete) Two possible actions (Move left, move
right).
* Visual Observations: None
* Reset Parameters: None
* Float Properties: None
* Benchmark Mean Reward: 0.94
## [3DBall: 3D Balance Ball](https://youtu.be/dheeCO29-EI)

* Vector Action space: (Continuous) Size of 2, with one value corresponding to
X-rotation, and the other to Z-rotation.
* Visual Observations: None.
* Reset Parameters: Three
* Float Properties: Three
* scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
* Default: 1
* Recommended Minimum: 0.2

using the `Mask Actions` checkbox within the `trueAgent` GameObject).
The trained model file provided was generated with action masking turned on.
* Visual Observations: One corresponding to top-down view of GridWorld.
* Reset Parameters: Three, corresponding to grid size, number of obstacles, and
* Float Properties: Three, corresponding to grid size, number of obstacles, and
number of goals.
* Benchmark Mean Reward: 0.8

* Vector Action space: (Continuous) Size of 2, corresponding to movement
toward net or away from net, and jumping.
* Visual Observations: None
* Reset Parameters: Three
* Float Properties: Three
* angle: Angle of the racket from the vertical (Y) axis.
* Default: 55
* Recommended Minimum: 35

`VisualPushBlock` scene. __The visual observation version of
this environment does not train with the provided default
training parameters.__
* Reset Parameters: Four
* Float Properties: Four
* block_scale: Scale of the block along the x and z dimensions
* Default: 2
* Recommended Minimum: 0.5

* Side Motion (3 possible actions: Left, Right, No Action)
* Jump (2 possible actions: Jump, No Action)
* Visual Observations: None
* Reset Parameters: Four
* Float Properties: Four
* Benchmark Mean Reward (Big & Small Wall): 0.8
## [Reacher](https://youtu.be/2N9EoF6pQyE)

* Vector Action space: (Continuous) Size of 4, corresponding to torque
applicable to two joints.
* Visual Observations: None.
* Reset Parameters: Five
* Float Properties: Five
* goal_size: radius of the goal zone
* Default: 5
* Recommended Minimum: 1

* Vector Action space: (Continuous) Size of 20, corresponding to target
rotations for joints.
* Visual Observations: None
* Reset Parameters: None
* Float Properties: None
* Benchmark Mean Reward for `CrawlerStaticTarget`: 2000
* Benchmark Mean Reward for `CrawlerDynamicTarget`: 400

`VisualFoodCollector` scene. __The visual observation version of
this environment does not train with the provided default
training parameters.__
* Reset Parameters: Two
* Float Properties: Two
* laser_length: Length of the laser used by the agent
* Default: 1
* Recommended Minimum: 0.2

`VisualHallway` scene. __The visual observation version of
this environment does not train with the provided default
training parameters.__
* Reset Parameters: None
* Float Properties: None
* Benchmark Mean Reward: 0.7
* To speed up training, you can enable curiosity by adding `use_curiosity: true` in `config/trainer_config.yaml`

* Vector Action space: (Continuous) 3 corresponding to agent force applied for
the jump.
* Visual Observations: None
* Reset Parameters: Two
* Float Properties: Two
* target_scale: The scale of the green cube in the 3 dimensions
* Default: 150
* Recommended Minimum: 50

as well as rotation.
* Goalie: 4 actions corresponding to forward, backward, sideways movement.
* Visual Observations: None
* Reset Parameters: Two
* Float Properties: Two
* ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
* Default: 7.5
* Recommended minimum: 4

* Vector Action space: (Continuous) Size of 39, corresponding to target
rotations applicable to the joints.
* Visual Observations: None
* Reset Parameters: Four
* Float Properties: Four
* gravity: Magnitude of gravity
* Default: 9.81
* Recommended Minimum:

`VisualPyramids` scene. __The visual observation version of
this environment does not train with the provided default
training parameters.__
* Reset Parameters: None
* Float Properties: None
* Benchmark Mean Reward: 1.75

18
docs/Learning-Environment-Executable.md


`▀█▓▓▓▓▓▓▓▓▓▌
¬`▀▀▀█▓
INFO:mlagents.learn:{'--curriculum': 'None',
'--docker-target-name': 'Empty',
'--env': '3DBall',
'--help': False,
'--keep-checkpoints': '5',
'--lesson': '0',
'--load': False,
'--no-graphics': False,
'--num-runs': '1',
'--run-id': 'firstRun',
'--save-freq': '50000',
'--seed': '-1',
'--slow': False,
'--train': True,
'--worker-id': '0',
'<trainer-config-path>': 'config/trainer_config.yaml'}
```
**Note**: If you're using Anaconda, don't forget to activate the ml-agents

INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
Reset Parameters : {}
INFO:mlagents.envs:Connected new brain:
Unity brain name: Ball3DLearning

13
docs/Migrating.md


# Migrating
## Migrating from master to develop
### Important changes
* `CustomResetParameters` are now removed.
* `reset()` on the Low-Level Python API no longer takes a `train_mode` argument. To modify the performance/speed of the engine, you must use an `EngineConfigurationChannel` (see the example at the end of this section).
* `reset()` on the Low-Level Python API no longer takes a `config` argument. `UnityEnvironment` no longer has a `reset_parameters` field. To modify float properties in the environment, you must use a `FloatPropertiesChannel`. For more information, refer to the [Low Level Python API documentation](Python-API.md)
* The Academy no longer has a `Training Configuration` nor `Inference Configuration` field in the inspector. To modify the configuration from the Low-Level Python API, use an `EngineConfigurationChannel`. To modify it during training, use the new command line arguments `--width`, `--height`, `--quality-level`, `--time-scale` and `--target-frame-rate` in `mlagents-learn`.
* The Academy no longer has a `Default Reset Parameters` field in the inspector. The Academy class no longer has a `ResetParameters`. To access shared float properties with Python, use the new `FloatProperties` field on the Academy.
### Steps to Migrate
* If you had a custom `Training Configuration` in the Academy inspector, you will need to pass your custom configuration at every training run using the new command line arguments `--width`, `--height`, `--quality-level`, `--time-scale` and `--target-frame-rate`.
* If you were using `--slow` in `mlagents-learn`, you will need to pass your old `Inference Configuration` of the Academy inspector with the new command line arguments `--width`, `--height`, `--quality-level`, `--time-scale` and `--target-frame-rate` instead.
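
As an illustrative example (a minimal sketch, not part of this changeset; the port and configuration values are placeholders), the Low-Level Python API usage after this change looks like:

```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

# The engine settings that used to live in the Academy's Training/Inference
# Configuration are now sent through a side channel (or set with the new
# mlagents-learn command line arguments).
engine_channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port=5004, side_channels=[engine_channel])
engine_channel.set_configuration_parameters(time_scale=20.0, quality_level=1)

# reset() no longer takes train_mode or config.
env.reset()
```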
## Migrating from ML-Agents toolkit v0.11.0 to v0.12.0
### Important Changes

81
docs/Python-API.md


- **Print : `print(str(env))`**
Prints all parameters relevant to the loaded environment and the
Brains.
- **Reset : `env.reset(train_mode=True, config=None)`**
- **Reset : `env.reset()`**
- `train_mode` indicates whether to run the environment in train (`True`) or
test (`False`) mode.
- `config` is an optional dictionary of configuration flags specific to the
environment. For generic environments, `config` can be ignored. `config` is
a dictionary of strings to floats where the keys are the names of the
`resetParameters` and the values are their corresponding float values.
Define the reset parameters on the Academy Inspector window in the Unity
Editor.
- **Step : `env.step(action)`**
Sends a step signal to the environment using the actions. For each Brain:
- `action` can be one dimensional arrays or two dimensional arrays if you have

- **Close : `env.close()`**
Sends a shutdown signal to the environment and closes the communication
socket.
### Modifying the environment from Python
The Environment can be modified by using side channels to send data to the
environment. When creating the environment, pass a list of side channels as
`side_channels` argument to the constructor.
__Note__ : A side channel will only send/receive messages when `env.step` is
called.
#### EngineConfigurationChannel
An `EngineConfigurationChannel` allows you to modify the time scale and graphics quality of the Unity engine.
`EngineConfigurationChannel` has two methods:
* `set_configuration_parameters` with arguments
* width: Defines the width of the display. Default 80.
* height: Defines the height of the display. Default 80.
* quality_level: Defines the quality level of the simulation. Default 1.
* time_scale: Defines the multiplier for the deltatime in the simulation. If set to a higher value, time will pass faster in the simulation but the physics might break. Default 20.
* target_frame_rate: Instructs simulation to try to render at a specified frame rate. Default -1.
* `set_configuration` with argument config which is an `EngineConfig`
NamedTuple object.
For example:
```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port = 5004, side_channels = [channel])
channel.set_configuration_parameters(time_scale = 2.0)
i = env.reset()
...
```
#### FloatPropertiesChannel
A `FloatPropertiesChannel` will allow you to get and set float properties
in the environment. You can call `get_property` and `set_property` on the
side channel to read and write properties.
`FloatPropertiesChannel` has three methods:
* `set_property` Sets a property in the Unity Environment.
* key: The string identifier of the property.
* value: The float value of the property.
* `get_property` Gets a property in the Unity Environment. If the property was not found, will return None.
* key: The string identifier of the property.
* `list_properties` Returns a list of all the string identifiers of the properties.
```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel
channel = FloatPropertiesChannel()
env = UnityEnvironment(base_port = 5004, side_channels = [channel])
channel.set_property("parameter_1", 2.0)
i = env.reset()
...
```
Once a property has been modified in Python, you can access it in C# after the next call to `step` as follows:
```csharp
var academy = FindObjectOfType<Academy>();
var sharedProperties = academy.FloatProperties;
float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
```
## mlagents-learn

5
docs/Training-Curriculum-Learning.md


In order to define a curriculum, the first step is to decide which parameters of
the environment will vary. In the case of the Wall Jump environment, what varies
is the height of the wall. We define this as a `Reset Parameter` in the Academy
object of our scene, and by doing so it becomes adjustable via the Python API.
is the height of the wall. We define this as a `Shared Float Property` that
can be accessed in `Academy.FloatProperties`, and by doing so it becomes
adjustable via the Python API.
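
As a minimal sketch (the `wall_height` key below is a hypothetical, illustrative name, not the one used by the actual Wall Jump curriculum), such a property can be written from Python through a `FloatPropertiesChannel`:

```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel

properties_channel = FloatPropertiesChannel()
env = UnityEnvironment(base_port=5004, side_channels=[properties_channel])

# "wall_height" is a placeholder key; use whatever name the environment reads
# from Academy.FloatProperties.
properties_channel.set_property("wall_height", 4.0)
env.reset()
```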
Rather than adjusting it by hand, we will create a JSON file which
describes the structure of the curriculum. Within it, we can specify which
points in the training process our wall height will change, either based on the

4
docs/Training-Generalized-Reinforcement-Learning-Agents.md


## Introducing Generalization Using Reset Parameters
To enable variations in the environments, we implemented `Reset Parameters`. We
To enable variations in the environments, we implemented `Reset Parameters`.
`Reset Parameters` are `Academy.FloatProperties` that are used only when
resetting the environment. We
also included different sampling methods and the ability to create new kinds of
sampling methods for each `Reset Parameter`. In the 3D ball environment example displayed
in the figure above, the reset parameters are `gravity`, `ball_mass` and `ball_scale`.
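
A minimal sketch (assuming only the `FloatPropertiesChannel` methods documented in [Python-API.md](Python-API.md); values are exchanged when the environment steps or resets) of inspecting and overriding such a property from Python:

```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel

channel = FloatPropertiesChannel()
env = UnityEnvironment(base_port=5004, side_channels=[channel])
env.reset()

# list_properties() returns the known string identifiers; get_property()
# returns None if the property has not been found.
print(channel.list_properties())
print(channel.get_property("gravity"))

# Override the reset parameter before the next reset.
channel.set_property("gravity", 20.0)
env.reset()
```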

17
docs/Training-ML-Agents.md


will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs
given to each instance from 0 to `num_envs - 1`. Default is 5005. __Note:__ When
training using the Editor rather than an executable, the base port will be ignored.
* `--slow`: Specify this option to run the Unity environment at normal, game
speed. The `--slow` mode uses the **Time Scale** and **Target Frame Rate**
specified in the Academy's **Inference Configuration**. By default, training
runs using the speeds specified in your Academy's **Training Configuration**.
See
[Academy Properties](Learning-Environment-Design-Academy.md#academy-properties).
* `--train`: Specifies whether to train model or only run in inference mode.
When training, **always** use the `--train` option.
* `--load`: If set, the training code loads an already trained model to

* `--debug`: Specify this option to enable debug-level logging for some parts of the code.
* `--multi-gpu`: Setting this flag enables the use of multiple GPU's (if available) during training.
* `--cpu`: Forces training using CPU only.
* Engine Configuration :
* `--width` : The width of the executable window of the environment(s) in pixels
(ignored for editor training). (Default 84)
* `--height` : The height of the executable window of the environment(s) in pixels
(ignored for editor training). (Default 84)
* `--quality-level` : The quality level of the environment(s). Equivalent to
calling `QualitySettings.SetQualityLevel` in Unity. (Default 5)
* `--time-scale` : The time scale of the Unity environment(s). Equivalent to setting
`Time.timeScale` in Unity. (Default 20.0, maximum 100.0)
* `--target-frame-rate` : The target frame rate of the Unity environment(s).
Equivalent to setting `Application.targetFrameRate` in Unity. (Default: -1)
### Training Config File

2
docs/Training-on-Amazon-Web-Service.md


2. Clone the ML-Agents repo and install the required Python packages
```sh
git clone https://github.com/Unity-Technologies/ml-agents.git
git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
cd ml-agents/ml-agents/
pip3 install -e .
```

24
docs/Using-Virtual-Environment.md


to run ML-Agents on (either local laptop/desktop or remote server). Python 3.6 can be
installed from [here](https://www.python.org/downloads/).
## Python Version Requirement (Required)
This guide has been tested with Python 3.6 and 3.7. Python 3.8 is not supported at this time.
## Installing Pip (Required)

1. To create a new environment named `sample-env` execute `$ python3 -m venv ~/python-envs/sample-env`
1. To activate the environment execute `$ source ~/python-envs/sample-env/bin/activate`
1. Verify pip version is the same as in the __Installing Pip__ section. In case it is not the latest, upgrade to
the latest pip version using `pip3 install --upgrade pip`
the latest pip version using `$ pip3 install --upgrade pip`
1. Install ML-Agents package using `$ pip3 install mlagents`
1. To deactivate the environment execute `$ deactivate`

## Windows Setup
1. Create a folder where the virtual environments will reside `$ md python-envs`
1. To create a new environment named `sample-env` execute `$ python3 -m venv python-envs\sample-env`
1. To activate the environment execute `$ python-envs\sample-env\Scripts\activate`
1. Verify pip version is the same as in the __Installing Pip__ section. In case it is not the latest, upgrade to
the latest pip version using `pip3 install --upgrade pip`
1. Install ML-Agents package using `$ pip3 install mlagents`
1. To deactivate the environment execute `$ deactivate`
1. Create a folder where the virtual environments will reside `md python-envs`
1. To create a new environment named `sample-env` execute `python -m venv python-envs\sample-env`
1. To activate the environment execute `python-envs\sample-env\Scripts\activate`
1. Verify pip version is the same as in the __Installing Pip__ section. In case it is not the
latest, upgrade to the latest pip version using `pip install --upgrade pip`
1. Install ML-Agents package using `pip install mlagents`
1. To deactivate the environment execute `deactivate`
Note:
- Verify that you are using Python 3.6 or Python 3.7. Launch a command prompt using `cmd` and
execute `python --version` to verify the version.
- Python3 installation may require admin privileges on Windows.
- This guide is for Windows 10 using a 64-bit architecture only.

4
gym-unity/gym_unity/tests/test_gym.py


mock_braininfo = mock.Mock()
mock_braininfo.return_value.vector_observations = np.array([num_agents * [1, 2, 3]])
if number_visual_observations:
mock_braininfo.return_value.visual_observations = [[np.zeros(shape=(8, 8, 3))]]
mock_braininfo.return_value.visual_observations = [
[np.zeros(shape=(8, 8, 3), dtype=np.float32)]
]
mock_braininfo.return_value.rewards = num_agents * [1.0]
mock_braininfo.return_value.local_done = num_agents * [False]
mock_braininfo.return_value.agents = range(0, num_agents)

14
ml-agents-envs/mlagents/envs/base_unity_environment.py


from abc import ABC, abstractmethod
from typing import Dict, Optional, Any
from typing import Dict, Optional
from mlagents.envs.brain import AllBrainInfo, BrainParameters

pass
@abstractmethod
def reset(
self,
config: Optional[Dict] = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,
) -> AllBrainInfo:
def reset(self) -> AllBrainInfo:
pass
@property
@abstractmethod
def reset_parameters(self) -> Dict[str, float]:
pass
@abstractmethod

21
ml-agents-envs/mlagents/envs/brain.py


@staticmethod
def merge_memories(m1, m2, agents1, agents2):
if len(m1) == 0 and len(m2) != 0:
m1 = np.zeros((len(agents1), m2.shape[1]))
m1 = np.zeros((len(agents1), m2.shape[1]), dtype=np.float32)
m2 = np.zeros((len(agents2), m1.shape[1]))
m2 = np.zeros((len(agents2), m1.shape[1]), dtype=np.float32)
new_m1 = np.zeros((m1.shape[0], m2.shape[1]))
new_m1 = np.zeros((m1.shape[0], m2.shape[1]), dtype=np.float32)
new_m2 = np.zeros((m2.shape[0], m1.shape[1]))
new_m2 = np.zeros((m2.shape[0], m1.shape[1]), dtype=np.float32)
new_m2[0 : m2.shape[0], 0 : m2.shape[1]] = m2
return np.append(m1, new_m2, axis=0)
return np.append(m1, m2, axis=0)

return s
@staticmethod
@timed
def from_agent_proto(
worker_id: int,
agent_info_list: List[AgentInfoProto],

vis_obs = BrainInfo._process_visual_observations(brain_params, agent_info_list)
total_num_actions = sum(brain_params.vector_action_space_size)
mask_actions = np.ones((len(agent_info_list), total_num_actions))
mask_actions = np.ones(
(len(agent_info_list), total_num_actions), dtype=np.float32
)
for agent_index, agent_info in enumerate(agent_info_list):
if agent_info.action_mask is not None:
if len(agent_info.action_mask) == total_num_actions:

brain_params: BrainParameters, agent_info_list: List[AgentInfoProto]
) -> np.ndarray:
if len(agent_info_list) == 0:
vector_obs = np.zeros((0, brain_params.vector_observation_space_size))
vector_obs = np.zeros(
(0, brain_params.vector_observation_space_size), dtype=np.float32
)
else:
stacked_obs = []
has_nan = False

for vo in vec_obs:
# TODO consider itertools.chain here
proto_vector_obs.extend(vo.float_data.data)
np_obs = np.array(proto_vector_obs)
np_obs = np.array(proto_vector_obs, dtype=np.float32)
# Check for NaNs or infs in the observations
# If there's a NaN in the observations, the dot() result will be NaN

has_nan = has_nan or np.isnan(d)
has_inf = has_inf or not np.isfinite(d)
stacked_obs.append(np_obs)
vector_obs = np.array(stacked_obs)
vector_obs = np.array(stacked_obs, dtype=np.float32)
# If we have any NaNs or Infs, use np.nan_to_num to replace them with finite values
if has_nan or has_inf:

17
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.py


from mlagents.envs.communicator_objects import brain_parameters_pb2 as mlagents_dot_envs_dot_communicator__objects_dot_brain__parameters__pb2
from mlagents.envs.communicator_objects import environment_parameters_pb2 as mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2
DESCRIPTOR = _descriptor.FileDescriptor(

serialized_pb=_b('\nGmlagents/envs/communicator_objects/unity_rl_initialization_output.proto\x12\x14\x63ommunicator_objects\x1a\x39mlagents/envs/communicator_objects/brain_parameters.proto\x1a?mlagents/envs/communicator_objects/environment_parameters.proto\"\xeb\x01\n UnityRLInitializationOutputProto\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x10\n\x08log_path\x18\x03 \x01(\t\x12\x44\n\x10\x62rain_parameters\x18\x05 \x03(\x0b\x32*.communicator_objects.BrainParametersProto\x12P\n\x16\x65nvironment_parameters\x18\x06 \x01(\x0b\x32\x30.communicator_objects.EnvironmentParametersProtoB\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\nGmlagents/envs/communicator_objects/unity_rl_initialization_output.proto\x12\x14\x63ommunicator_objects\x1a\x39mlagents/envs/communicator_objects/brain_parameters.proto\"\x9f\x01\n UnityRLInitializationOutputProto\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x10\n\x08log_path\x18\x03 \x01(\t\x12\x44\n\x10\x62rain_parameters\x18\x05 \x03(\x0b\x32*.communicator_objects.BrainParametersProtoJ\x04\x08\x06\x10\x07\x42\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
dependencies=[mlagents_dot_envs_dot_communicator__objects_dot_brain__parameters__pb2.DESCRIPTOR,mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2.DESCRIPTOR,])
dependencies=[mlagents_dot_envs_dot_communicator__objects_dot_brain__parameters__pb2.DESCRIPTOR,])

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='environment_parameters', full_name='communicator_objects.UnityRLInitializationOutputProto.environment_parameters', index=4,
number=6, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

extension_ranges=[],
oneofs=[
],
serialized_start=222,
serialized_end=457,
serialized_start=157,
serialized_end=316,
_UNITYRLINITIALIZATIONOUTPUTPROTO.fields_by_name['environment_parameters'].message_type = mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2._ENVIRONMENTPARAMETERSPROTO
DESCRIPTOR.message_types_by_name['UnityRLInitializationOutputProto'] = _UNITYRLINITIALIZATIONOUTPUTPROTO
_sym_db.RegisterFileDescriptor(DESCRIPTOR)

14
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_initialization_output_pb2.pyi


BrainParametersProto as mlagents___envs___communicator_objects___brain_parameters_pb2___BrainParametersProto,
)
from mlagents.envs.communicator_objects.environment_parameters_pb2 import (
EnvironmentParametersProto as mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto,
)
from typing import (
Iterable as typing___Iterable,
Optional as typing___Optional,

@property
def brain_parameters(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents___envs___communicator_objects___brain_parameters_pb2___BrainParametersProto]: ...
@property
def environment_parameters(self) -> mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto: ...
def __init__(self,
*,
name : typing___Optional[typing___Text] = None,

environment_parameters : typing___Optional[mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> UnityRLInitializationOutputProto: ...

def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"brain_parameters",u"environment_parameters",u"log_path",u"name",u"version"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"brain_parameters",u"log_path",u"name",u"version"]) -> None: ...
def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters",b"environment_parameters"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"brain_parameters",b"brain_parameters",u"environment_parameters",b"environment_parameters",u"log_path",b"log_path",u"name",b"name",u"version",b"version"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"brain_parameters",b"brain_parameters",u"log_path",b"log_path",u"name",b"name",u"version",b"version"]) -> None: ...

37
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.py


from mlagents.envs.communicator_objects import agent_action_pb2 as mlagents_dot_envs_dot_communicator__objects_dot_agent__action__pb2
from mlagents.envs.communicator_objects import environment_parameters_pb2 as mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2
from mlagents.envs.communicator_objects import command_pb2 as mlagents_dot_envs_dot_communicator__objects_dot_command__pb2

syntax='proto3',
serialized_pb=_b('\n7mlagents/envs/communicator_objects/unity_rl_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents/envs/communicator_objects/agent_action.proto\x1a?mlagents/envs/communicator_objects/environment_parameters.proto\x1a\x30mlagents/envs/communicator_objects/command.proto\"\xc3\x03\n\x11UnityRLInputProto\x12P\n\ragent_actions\x18\x01 \x03(\x0b\x32\x39.communicator_objects.UnityRLInputProto.AgentActionsEntry\x12P\n\x16\x65nvironment_parameters\x18\x02 \x01(\x0b\x32\x30.communicator_objects.EnvironmentParametersProto\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x33\n\x07\x63ommand\x18\x04 \x01(\x0e\x32\".communicator_objects.CommandProto\x1aM\n\x14ListAgentActionProto\x12\x35\n\x05value\x18\x01 \x03(\x0b\x32&.communicator_objects.AgentActionProto\x1aq\n\x11\x41gentActionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12K\n\x05value\x18\x02 \x01(\x0b\x32<.communicator_objects.UnityRLInputProto.ListAgentActionProto:\x02\x38\x01\x42\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n7mlagents/envs/communicator_objects/unity_rl_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents/envs/communicator_objects/agent_action.proto\x1a\x30mlagents/envs/communicator_objects/command.proto\"\xfe\x02\n\x11UnityRLInputProto\x12P\n\ragent_actions\x18\x01 \x03(\x0b\x32\x39.communicator_objects.UnityRLInputProto.AgentActionsEntry\x12\x33\n\x07\x63ommand\x18\x04 \x01(\x0e\x32\".communicator_objects.CommandProto\x12\x14\n\x0cside_channel\x18\x05 \x01(\x0c\x1aM\n\x14ListAgentActionProto\x12\x35\n\x05value\x18\x01 \x03(\x0b\x32&.communicator_objects.AgentActionProto\x1aq\n\x11\x41gentActionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12K\n\x05value\x18\x02 \x01(\x0b\x32<.communicator_objects.UnityRLInputProto.ListAgentActionProto:\x02\x38\x01J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x42\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
dependencies=[mlagents_dot_envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2.DESCRIPTOR,mlagents_dot_envs_dot_communicator__objects_dot_command__pb2.DESCRIPTOR,])
dependencies=[mlagents_dot_envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,mlagents_dot_envs_dot_communicator__objects_dot_command__pb2.DESCRIPTOR,])

extension_ranges=[],
oneofs=[
],
serialized_start=511,
serialized_end=588,
serialized_start=365,
serialized_end=442,
)
_UNITYRLINPUTPROTO_AGENTACTIONSENTRY = _descriptor.Descriptor(

extension_ranges=[],
oneofs=[
],
serialized_start=590,
serialized_end=703,
serialized_start=444,
serialized_end=557,
)
_UNITYRLINPUTPROTO = _descriptor.Descriptor(

is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='environment_parameters', full_name='communicator_objects.UnityRLInputProto.environment_parameters', index=1,
number=2, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
name='command', full_name='communicator_objects.UnityRLInputProto.command', index=1,
number=4, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
name='is_training', full_name='communicator_objects.UnityRLInputProto.is_training', index=2,
number=3, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='command', full_name='communicator_objects.UnityRLInputProto.command', index=3,
number=4, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
name='side_channel', full_name='communicator_objects.UnityRLInputProto.side_channel', index=2,
number=5, type=12, cpp_type=9, label=1,
has_default_value=False, default_value=_b(""),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),

extension_ranges=[],
oneofs=[
],
serialized_start=252,
serialized_end=703,
serialized_start=187,
serialized_end=569,
)
_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO.fields_by_name['value'].message_type = mlagents_dot_envs_dot_communicator__objects_dot_agent__action__pb2._AGENTACTIONPROTO

_UNITYRLINPUTPROTO.fields_by_name['agent_actions'].message_type = _UNITYRLINPUTPROTO_AGENTACTIONSENTRY
_UNITYRLINPUTPROTO.fields_by_name['environment_parameters'].message_type = mlagents_dot_envs_dot_communicator__objects_dot_environment__parameters__pb2._ENVIRONMENTPARAMETERSPROTO
_UNITYRLINPUTPROTO.fields_by_name['command'].enum_type = mlagents_dot_envs_dot_communicator__objects_dot_command__pb2._COMMANDPROTO
DESCRIPTOR.message_types_by_name['UnityRLInputProto'] = _UNITYRLINPUTPROTO
_sym_db.RegisterFileDescriptor(DESCRIPTOR)

18
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_input_pb2.pyi


CommandProto as mlagents___envs___communicator_objects___command_pb2___CommandProto,
)
from mlagents.envs.communicator_objects.environment_parameters_pb2 import (
EnvironmentParametersProto as mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto,
)
from typing import (
Iterable as typing___Iterable,
Mapping as typing___Mapping,

def HasField(self, field_name: typing_extensions___Literal[u"value",b"value"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"key",b"key",u"value",b"value"]) -> None: ...
is_training = ... # type: builtin___bool
side_channel = ... # type: builtin___bytes
@property
def environment_parameters(self) -> mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto: ...
environment_parameters : typing___Optional[mlagents___envs___communicator_objects___environment_parameters_pb2___EnvironmentParametersProto] = None,
is_training : typing___Optional[builtin___bool] = None,
side_channel : typing___Optional[builtin___bytes] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> UnityRLInputProto: ...

def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agent_actions",u"command",u"environment_parameters",u"is_training"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agent_actions",u"command",u"side_channel"]) -> None: ...
def HasField(self, field_name: typing_extensions___Literal[u"environment_parameters",b"environment_parameters"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agent_actions",b"agent_actions",u"command",b"command",u"environment_parameters",b"environment_parameters",u"is_training",b"is_training"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agent_actions",b"agent_actions",u"command",b"command",u"side_channel",b"side_channel"]) -> None: ...

19
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.py


name='mlagents/envs/communicator_objects/unity_rl_output.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n8mlagents/envs/communicator_objects/unity_rl_output.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents/envs/communicator_objects/agent_info.proto\"\xa3\x02\n\x12UnityRLOutputProto\x12L\n\nagentInfos\x18\x02 \x03(\x0b\x32\x38.communicator_objects.UnityRLOutputProto.AgentInfosEntry\x1aI\n\x12ListAgentInfoProto\x12\x33\n\x05value\x18\x01 \x03(\x0b\x32$.communicator_objects.AgentInfoProto\x1an\n\x0f\x41gentInfosEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12J\n\x05value\x18\x02 \x01(\x0b\x32;.communicator_objects.UnityRLOutputProto.ListAgentInfoProto:\x02\x38\x01J\x04\x08\x01\x10\x02\x42\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n8mlagents/envs/communicator_objects/unity_rl_output.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents/envs/communicator_objects/agent_info.proto\"\xb9\x02\n\x12UnityRLOutputProto\x12L\n\nagentInfos\x18\x02 \x03(\x0b\x32\x38.communicator_objects.UnityRLOutputProto.AgentInfosEntry\x12\x14\n\x0cside_channel\x18\x03 \x01(\x0c\x1aI\n\x12ListAgentInfoProto\x12\x33\n\x05value\x18\x01 \x03(\x0b\x32$.communicator_objects.AgentInfoProto\x1an\n\x0f\x41gentInfosEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12J\n\x05value\x18\x02 \x01(\x0b\x32;.communicator_objects.UnityRLOutputProto.ListAgentInfoProto:\x02\x38\x01J\x04\x08\x01\x10\x02\x42\x1f\xaa\x02\x1cMLAgents.CommunicatorObjectsb\x06proto3')
,
dependencies=[mlagents_dot_envs_dot_communicator__objects_dot_agent__info__pb2.DESCRIPTOR,])

extension_ranges=[],
oneofs=[
],
serialized_start=236,
serialized_end=309,
serialized_start=258,
serialized_end=331,
)
_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY = _descriptor.Descriptor(

extension_ranges=[],
oneofs=[
],
serialized_start=311,
serialized_end=421,
serialized_start=333,
serialized_end=443,
)
_UNITYRLOUTPUTPROTO = _descriptor.Descriptor(

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='side_channel', full_name='communicator_objects.UnityRLOutputProto.side_channel', index=1,
number=3, type=12, cpp_type=9, label=1,
has_default_value=False, default_value=_b(""),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

oneofs=[
],
serialized_start=136,
serialized_end=427,
serialized_end=449,
)
_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO.fields_by_name['value'].message_type = mlagents_dot_envs_dot_communicator__objects_dot_agent__info__pb2._AGENTINFOPROTO

6
ml-agents-envs/mlagents/envs/communicator_objects/unity_rl_output_pb2.pyi


def HasField(self, field_name: typing_extensions___Literal[u"value",b"value"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"key",b"key",u"value",b"value"]) -> None: ...
side_channel = ... # type: builtin___bytes
@property
def agentInfos(self) -> typing___MutableMapping[typing___Text, UnityRLOutputProto.ListAgentInfoProto]: ...

agentInfos : typing___Optional[typing___Mapping[typing___Text, UnityRLOutputProto.ListAgentInfoProto]] = None,
side_channel : typing___Optional[builtin___bytes] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> UnityRLOutputProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"agentInfos"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agentInfos",u"side_channel"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agentInfos",b"agentInfos"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"agentInfos",b"agentInfos",u"side_channel",b"side_channel"]) -> None: ...

11
ml-agents-envs/mlagents/envs/env_manager.py


from abc import ABC, abstractmethod
from typing import Any, List, Dict, NamedTuple, Optional
from typing import List, Dict, NamedTuple, Optional
from mlagents.envs.brain import AllBrainInfo, BrainParameters
from mlagents.envs.policy import Policy
from mlagents.envs.action_info import ActionInfo

pass
@abstractmethod
def reset(
self,
config: Dict = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,
) -> List[EnvironmentStep]:
def reset(self, config: Dict = None) -> List[EnvironmentStep]:
pass
@property

@property
@abstractmethod
def reset_parameters(self) -> Dict[str, float]:
def get_properties(self) -> Dict[str, float]:
pass
@abstractmethod

124
ml-agents-envs/mlagents/envs/environment.py


import subprocess
from typing import Dict, List, Optional, Any
from mlagents.envs.side_channel.side_channel import SideChannel
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs.timers import timed, hierarchical_timer
from .brain import AllBrainInfo, BrainInfo, BrainParameters

from mlagents.envs.communicator_objects.unity_rl_input_pb2 import UnityRLInputProto
from mlagents.envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto
from mlagents.envs.communicator_objects.agent_action_pb2 import AgentActionProto
from mlagents.envs.communicator_objects.environment_parameters_pb2 import (
EnvironmentParametersProto,
)
from mlagents.envs.communicator_objects.unity_output_pb2 import UnityOutputProto
from mlagents.envs.communicator_objects.unity_rl_initialization_input_pb2 import (
UnityRLInitializationInputProto,

from .rpc_communicator import RpcCommunicator
from sys import platform
import signal
import struct
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mlagents.envs")

no_graphics: bool = False,
timeout_wait: int = 60,
args: Optional[List[str]] = None,
side_channels: Optional[List[SideChannel]] = None,
):
"""
Starts a new unity environment and establishes a connection with the environment.

:int timeout_wait: Time (in seconds) to wait for connection from environment.
:bool train_mode: Whether to run in training mode, speeding up the simulation, by default.
:list args: Additional Unity command line arguments
:list side_channels: Additional side channels for non-RL communication with Unity
"""
args = args or []
atexit.register(self._close)

self.timeout_wait: int = timeout_wait
self.communicator = self.get_communicator(worker_id, base_port, timeout_wait)
self.worker_id = worker_id
self.side_channels: Dict[int, SideChannel] = {}
if side_channels is not None:
for _sc in side_channels:
if _sc.channel_type in self.side_channels:
raise UnityEnvironmentException(
"There cannot be two side channels with the same channel type {0}.".format(
_sc.channel_type
)
)
self.side_channels[_sc.channel_type] = _sc
# If the environment name is None, a new environment will not be launched
# and the communicator will directly try to connect to an existing unity environment.

self._external_brain_names: List[str] = []
self._num_external_brains = 0
self._update_brain_parameters(aca_output)
self._resetParameters = dict(aca_params.environment_parameters.float_parameters)
logger.info(
"\n'{0}' started successfully!\n{1}".format(self._academy_name, str(self))
)

for brain_name in self.external_brain_names:
external_brains[brain_name] = self.brains[brain_name]
return external_brains
@property
def reset_parameters(self):
return self._resetParameters
def executable_launcher(self, file_name, docker_training, no_graphics, args):
cwd = os.getcwd()

)
def __str__(self):
reset_params_str = (
"\n\t\t".join(
[
str(k) + " -> " + str(self._resetParameters[k])
for k in self._resetParameters
]
)
if self._resetParameters
else "{}"
)
return f"""Unity Academy name: {self._academy_name}
Reset Parameters : {reset_params_str}"""
return """Unity Academy name: {0}""".format(self._academy_name)
def reset(
self,
config: Dict = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,
) -> AllBrainInfo:
def reset(self) -> AllBrainInfo:
if config is None:
config = self._resetParameters
elif config:
logger.info(
"Academy reset with parameters: {0}".format(
", ".join([str(x) + " -> " + str(config[x]) for x in config])
)
)
for k in config:
if (k in self._resetParameters) and (isinstance(config[k], (int, float))):
self._resetParameters[k] = config[k]
elif not isinstance(config[k], (int, float)):
raise UnityEnvironmentException(
"The value for parameter '{0}'' must be an Integer or a Float.".format(
k
)
)
else:
raise UnityEnvironmentException(
"The parameter '{0}' is not a valid parameter.".format(k)
)
outputs = self.communicator.exchange(
self._generate_reset_input(train_mode, config, custom_reset_parameters)
)
outputs = self.communicator.exchange(self._generate_reset_input())
if outputs is None:
raise UnityCommunicationException("Communicator has stopped.")
self._update_brain_parameters(outputs)

_data[brain_name] = BrainInfo.from_agent_proto(
self.worker_id, agent_info_list, self.brains[brain_name]
)
self._parse_side_channel_message(self.side_channels, output.side_channel)
@staticmethod
def _parse_side_channel_message(
side_channels: Dict[int, SideChannel], data: bytearray
) -> None:
offset = 0
while offset < len(data):
try:
channel_type, message_len = struct.unpack_from("<ii", data, offset)
offset = offset + 8
message_data = data[offset : offset + message_len]
offset = offset + message_len
except Exception:
raise UnityEnvironmentException(
"There was a problem reading a message in a SideChannel. "
"Please make sure the version of MLAgents in Unity is "
"compatible with the Python version."
)
if len(message_data) != message_len:
raise UnityEnvironmentException(
"The message received by the side channel {0} was "
"unexpectedly short. Make sure your Unity Environment "
"sending side channel data properly.".format(channel_type)
)
if channel_type in side_channels:
side_channels[channel_type].on_message_received(message_data)
else:
logger.warning(
"Unknown side channel data received. Channel type "
": {0}.".format(channel_type)
)
@staticmethod
def _generate_side_channel_data(side_channels: Dict[int, SideChannel]) -> bytearray:
result = bytearray()
for channel_type, channel in side_channels.items():
for message in channel.message_queue:
result += struct.pack("<ii", channel_type, len(message))
result += message
channel.message_queue = []
return result
def _update_brain_parameters(self, output: UnityOutputProto) -> None:
init_output = output.rl_initialization_output

action.value = float(value[b][i])
rl_in.agent_actions[b].value.extend([action])
rl_in.command = 0
rl_in.side_channel = bytes(self._generate_side_channel_data(self.side_channels))
def _generate_reset_input(
self, training: bool, config: Dict, custom_reset_parameters: Any
) -> UnityInputProto:
def _generate_reset_input(self) -> UnityInputProto:
rl_in.is_training = training
rl_in.environment_parameters.CopyFrom(EnvironmentParametersProto())
for key in config:
rl_in.environment_parameters.float_parameters[key] = config[key]
if custom_reset_parameters is not None:
rl_in.environment_parameters.custom_reset_parameters.CopyFrom(
custom_reset_parameters
)
rl_in.side_channel = bytes(self._generate_side_channel_data(self.side_channels))
return self.wrap_unity_input(rl_in)
def send_academy_parameters(

29
ml-agents-envs/mlagents/envs/simple_env_manager.py


from typing import Any, Dict, List
from typing import Dict, List
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs.env_manager import EnvManager, EnvironmentStep

from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel
class SimpleEnvManager(EnvManager):

"""
def __init__(self, env: BaseUnityEnvironment):
def __init__(
self, env: BaseUnityEnvironment, float_prop_channel: FloatPropertiesChannel
):
self.shared_float_properties = float_prop_channel
self.env = env
self.previous_step: EnvironmentStep = EnvironmentStep(None, {}, None)
self.previous_all_action_info: Dict[str, ActionInfo] = {}

return [step_info]
def reset(
self,
config: Dict[str, float] = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,
self, config: Dict[str, float] = None
all_brain_info = self.env.reset(
config=config,
train_mode=train_mode,
custom_reset_parameters=custom_reset_parameters,
)
if config is not None:
for k, v in config.items():
self.shared_float_properties.set_property(k, v)
all_brain_info = self.env.reset()
self.previous_step = EnvironmentStep(None, all_brain_info, None)
return [self.previous_step]

@property
def reset_parameters(self) -> Dict[str, float]:
return self.env.reset_parameters
def get_properties(self) -> Dict[str, float]:
reset_params = {}
for k in self.shared_float_properties.list_properties():
reset_params[k] = self.shared_float_properties.get_property(k)
return reset_params
def close(self):
self.env.close()

71
ml-agents-envs/mlagents/envs/subprocess_env_manager.py


)
from mlagents.envs.brain import AllBrainInfo, BrainParameters
from mlagents.envs.action_info import ActionInfo
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel
from mlagents.envs.side_channel.engine_configuration_channel import (
EngineConfigurationChannel,
EngineConfig,
)
from mlagents.envs.side_channel.side_channel import SideChannel
logger = logging.getLogger("mlagents.envs")

def worker(
parent_conn: Connection, step_queue: Queue, pickled_env_factory: str, worker_id: int
parent_conn: Connection,
step_queue: Queue,
pickled_env_factory: str,
worker_id: int,
engine_configuration: EngineConfig,
env_factory: Callable[[int], UnityEnvironment] = cloudpickle.loads(
pickled_env_factory
env_factory: Callable[
[int, List[SideChannel]], UnityEnvironment
] = cloudpickle.loads(pickled_env_factory)
shared_float_properties = FloatPropertiesChannel()
engine_configuration_channel = EngineConfigurationChannel()
engine_configuration_channel.set_configuration(engine_configuration)
env: BaseUnityEnvironment = env_factory(
worker_id, [shared_float_properties, engine_configuration_channel]
env: BaseUnityEnvironment = env_factory(worker_id)
def _send_response(cmd_name, payload):
parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload))

reset_timers()
elif cmd.name == "external_brains":
_send_response("external_brains", env.external_brains)
elif cmd.name == "reset_parameters":
_send_response("reset_parameters", env.reset_parameters)
elif cmd.name == "get_properties":
reset_params = {}
for k in shared_float_properties.list_properties():
reset_params[k] = shared_float_properties.get_property(k)
_send_response("get_properties", reset_params)
all_brain_info = env.reset(
cmd.payload[0], cmd.payload[1], cmd.payload[2]
)
for k, v in cmd.payload.items():
shared_float_properties.set_property(k, v)
all_brain_info = env.reset()
_send_response("reset", all_brain_info)
elif cmd.name == "close":
break

class SubprocessEnvManager(EnvManager):
def __init__(
self, env_factory: Callable[[int], BaseUnityEnvironment], n_env: int = 1
self,
env_factory: Callable[[int, List[SideChannel]], BaseUnityEnvironment],
engine_configuration: EngineConfig,
n_env: int = 1,
):
super().__init__()
self.env_workers: List[UnityEnvWorker] = []

self.create_worker(worker_idx, self.step_queue, env_factory)
self.create_worker(
worker_idx, self.step_queue, env_factory, engine_configuration
)
)
@staticmethod

env_factory: Callable[[int], BaseUnityEnvironment],
env_factory: Callable[[int, List[SideChannel]], BaseUnityEnvironment],
engine_configuration: EngineConfig,
) -> UnityEnvWorker:
parent_conn, child_conn = Pipe()

child_process = Process(
target=worker, args=(child_conn, step_queue, pickled_env_factory, worker_id)
target=worker,
args=(
child_conn,
step_queue,
pickled_env_factory,
worker_id,
engine_configuration,
),
)
child_process.start()
return UnityEnvWorker(child_process, worker_id, parent_conn)

step_infos = self._postprocess_steps(worker_steps)
return step_infos
def reset(
self,
config: Optional[Dict] = None,
train_mode: bool = True,
custom_reset_parameters: Any = None,
) -> List[EnvironmentStep]:
def reset(self, config: Optional[Dict] = None) -> List[EnvironmentStep]:
while any(ew.waiting for ew in self.env_workers):
if not self.step_queue.empty():
step = self.step_queue.get_nowait()

ew.send("reset", (config, train_mode, custom_reset_parameters))
ew.send("reset", config)
# Next (synchronously) collect the reset observations from each worker in sequence
for ew in self.env_workers:
ew.previous_step = EnvironmentStep(None, ew.recv().payload, None)

return self.env_workers[0].recv().payload
@property
def reset_parameters(self) -> Dict[str, float]:
self.env_workers[0].send("reset_parameters")
def get_properties(self) -> Dict[str, float]:
self.env_workers[0].send("get_properties")
return self.env_workers[0].recv().payload
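Sketch of how a caller builds the manager after this change. The env factory below is hypothetical (file_name is a placeholder); EngineConfig.default_config() is the helper used by the tests further down.

from typing import List

from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.side_channel import SideChannel
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfig
from mlagents.envs.subprocess_env_manager import SubprocessEnvManager

def make_env(worker_id: int, side_channels: List[SideChannel]) -> UnityEnvironment:
    # The worker adds its own FloatPropertiesChannel and EngineConfigurationChannel
    # to side_channels before calling this factory.
    return UnityEnvironment(
        file_name="3DBall", worker_id=worker_id, side_channels=side_channels
    )

env_manager = SubprocessEnvManager(make_env, EngineConfig.default_config(), n_env=2)
env_manager.reset(config={"gravity": -9.81})
env_manager.close()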
def close(self) -> None:

6  ml-agents-envs/mlagents/envs/tests/test_brain.py


from typing import List
import logging
import numpy as np
import sys
from unittest import mock
from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto

agent_info_proto = _make_agent_info_proto([1.0, float("inf"), 0.0])
brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
# inf should get set to float_max
expected = [1.0, sys.float_info.max, 0.0]
# inf should get set to float32_max
float32_max = np.finfo(np.float32).max
expected = [1.0, float32_max, 0.0]
assert (brain_info.vector_observations == expected).all()
mock_nan_to_num.assert_called()
# We don't warn on inf, just NaN
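The behaviour the updated test asserts, as a standalone snippet: np.nan_to_num clamps inf to the largest finite value of the array's dtype, which is float32 here.

import numpy as np

obs = np.array([1.0, float("inf"), 0.0], dtype=np.float32)
clamped = np.nan_to_num(obs)
assert clamped[1] == np.finfo(np.float32).max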

35  ml-agents-envs/mlagents/envs/tests/test_subprocess_env_manager.py


StepResponse,
)
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfig
def mock_env_factory(worker_id):

class SubprocessEnvManagerTest(unittest.TestCase):
def test_environments_are_created(self):
SubprocessEnvManager.create_worker = MagicMock()
env = SubprocessEnvManager(mock_env_factory, 2)
env = SubprocessEnvManager(mock_env_factory, EngineConfig.default_config(), 2)
mock.call(0, env.step_queue, mock_env_factory),
mock.call(1, env.step_queue, mock_env_factory),
mock.call(
0, env.step_queue, mock_env_factory, EngineConfig.default_config()
),
mock.call(
1, env.step_queue, mock_env_factory, EngineConfig.default_config()
),
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker(
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory, engine_c: MockEnvWorker(
manager = SubprocessEnvManager(mock_env_factory, 1)
manager = SubprocessEnvManager(
mock_env_factory, EngineConfig.default_config(), 1
)
manager.reset(params, False)
manager.env_workers[0].send.assert_called_with("reset", (params, False, None))
manager.reset(params)
manager.env_workers[0].send.assert_called_with("reset", (params))
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker(
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory, engine_c: MockEnvWorker(
manager = SubprocessEnvManager(mock_env_factory, 4)
manager = SubprocessEnvManager(
mock_env_factory, EngineConfig.default_config(), 4
)
env.send.assert_called_with("reset", (params, True, None))
env.send.assert_called_with("reset", (params))
env.recv.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

def test_step_takes_steps_for_all_non_waiting_envs(self):
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory: MockEnvWorker(
SubprocessEnvManager.create_worker = lambda em, worker_id, step_queue, env_factory, engine_c: MockEnvWorker(
manager = SubprocessEnvManager(mock_env_factory, 3)
manager = SubprocessEnvManager(
mock_env_factory, EngineConfig.default_config(), 3
)
manager.step_queue = Mock()
manager.step_queue.get_nowait.side_effect = [
EnvironmentResponse("step", 0, StepResponse(0, None)),

7  ml-agents/mlagents/trainers/bc/policy.py


else:
feed_dict[self.model.true_action] = mini_batch["actions"]
feed_dict[self.model.action_masks] = np.ones(
(num_sequences, sum(self.brain.vector_action_space_size))
(num_sequences, sum(self.brain.vector_action_space_size)),
dtype=np.float32,
)
if self.use_vec_obs:
feed_dict[self.model.vector_in] = mini_batch["vector_obs"]

if self.use_recurrent:
feed_dict[self.model.memory_in] = np.zeros([num_sequences, self.m_size])
feed_dict[self.model.memory_in] = np.zeros(
[num_sequences, self.m_size], dtype=np.float32
)
run_out = self._execute_model(feed_dict, self.update_dict)
return run_out

17  ml-agents/mlagents/trainers/bc/trainer.py


from mlagents.envs.brain import BrainInfo
from mlagents.envs.action_info import ActionInfoOutputs
from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.trainer import Trainer
logger = logging.getLogger("mlagents.trainers")

self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
self.demonstration_buffer = Buffer()
self.evaluation_buffer = Buffer()
self.demonstration_buffer = AgentBuffer()
self.evaluation_buffer = ProcessingBuffer()
def add_experiences(
self,

Returns whether or not the trainer has enough elements to run update_model()
:return: A boolean corresponding to whether or not update_model() can be run
"""
return (
len(self.demonstration_buffer.update_buffer["actions"]) > self.n_sequences
)
return self.demonstration_buffer.num_experiences > self.n_sequences
self.demonstration_buffer.update_buffer.shuffle(self.policy.sequence_length)
self.demonstration_buffer.shuffle(self.policy.sequence_length)
len(self.demonstration_buffer.update_buffer["actions"]) // batch_size,
self.demonstration_buffer.num_experiences // batch_size,
update_buffer = self.demonstration_buffer.update_buffer
update_buffer = self.demonstration_buffer
mini_batch = update_buffer.make_mini_batch(i, i + batch_size)
run_out = self.policy.update(mini_batch, self.n_sequences)
loss = run_out["policy_loss"]

470  ml-agents/mlagents/trainers/buffer.py


import numpy as np
import h5py
from typing import List, BinaryIO
from mlagents.envs.exception import UnityException

pass
class Buffer(dict):
class AgentBuffer(dict):
Buffer contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
Buffer also contains an update_buffer that corresponds to the buffer used when updating the model.
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has its own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
class AgentBuffer(dict):
class AgentBufferField(list):
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has its own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
"""
def __init__(self):
self.padding_value = 0
super(Buffer.AgentBuffer.AgentBufferField, self).__init__()
def __str__(self):
return str(np.array(self).shape)
def append(self, element, padding_value=0):
"""
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
"""
super(Buffer.AgentBuffer.AgentBufferField, self).append(element)
self.padding_value = padding_value
def extend(self, data):
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data))
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
self[:] = []
self[:] = list(np.array(data))
def get_batch(self, batch_size=None, training_length=1, sequential=True):
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (
len(self) // training_length + 1 * (leftover != 0)
):
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1]) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
)
else:
return np.array(
self[len(self) - batch_size * training_length :],
dtype=np.float32,
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
def reset_field(self):
"""
Resets the AgentBufferField
"""
self[:] = []
self.last_brain_info = None
self.last_take_action_outputs = None
super(Buffer.AgentBuffer, self).__init__()
self.padding_value = 0
super().__init__()
return ", ".join(
["'{0}' : {1}".format(k, str(self[k])) for k in self.keys()]
)
return str(np.array(self).shape)
def reset_agent(self):
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
Resets the AgentBuffer
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
for k in self.keys():
self[k].reset_field()
self.last_brain_info = None
self.last_take_action_outputs = None
super().append(element)
self.padding_value = padding_value
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBufferField()
return super(Buffer.AgentBuffer, self).__getitem__(key)
def check_length(self, key_list):
def extend(self, data: np.ndarray) -> None:
Some methods will require that some fields have the same length.
check_length will return true if the fields in key_list
have the same length.
:param key_list: The fields whose lengths will be compared
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
if len(key_list) < 2:
return True
length = None
for key in key_list:
if key not in self.keys():
return False
if (length is not None) and (length != len(self[key])):
return False
length = len(self[key])
return True
self += list(np.array(data))
def shuffle(self, sequence_length, key_list=None):
def set(self, data):
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.
Sets the list of np.array to the input data
:param data: The np.array list to be set.
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException(
"Unable to shuffle if the fields are not of same length"
)
s = np.arange(len(self[key_list[0]]) // sequence_length)
np.random.shuffle(s)
for key in key_list:
tmp = []
for i in s:
tmp += self[key][i * sequence_length : (i + 1) * sequence_length]
self[key][:] = tmp
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
def make_mini_batch(self, start, end):
"""
Creates a mini-batch from buffer.
:param start: Starting index of buffer.
:param end: Ending index of buffer.
:return: Dict of mini batch.
"""
mini_batch = {}
for key in self:
mini_batch[key] = self[key][start:end]
return mini_batch
def sample_mini_batch(self, batch_size, sequence_length=1):
"""
Creates a mini-batch from a random start and end.
:param batch_size: number of elements to withdraw.
:param sequence_length: Length of sequences to sample.
Number of sequences to sample will be batch_size/sequence_length.
def get_batch(
self,
batch_size: int = None,
training_length: int = 1,
sequential: bool = True,
) -> np.ndarray:
num_seq_to_sample = batch_size // sequence_length
mini_batch = Buffer.AgentBuffer()
buff_len = len(next(iter(self.values())))
num_sequences_in_buffer = buff_len // sequence_length
start_idxes = (
np.random.randint(num_sequences_in_buffer, size=num_seq_to_sample)
* sequence_length
) # Sample random sequence starts
for i in start_idxes:
for key in self:
mini_batch[key].extend(self[key][i : i + sequence_length])
return mini_batch
def save_to_file(self, file_object):
"""
Saves the AgentBuffer to a file-like object.
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
with h5py.File(file_object) as write_file:
for key, data in self.items():
write_file.create_dataset(
key, data=data, dtype="f", compression="gzip"
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
)
else:
return np.array(
self[len(self) - batch_size * training_length :],
dtype=np.float32,
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
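A runnable illustration of the get_batch docstring above, using the new AgentBuffer directly (the "obs" field name is arbitrary):

import numpy as np
from mlagents.trainers.buffer import AgentBuffer

buf = AgentBuffer()
for x in [1.0, 2.0, 3.0, 4.0, 5.0]:                     # conceptually [a, b, c, d, e]
    buf["obs"].append(np.array([x], dtype=np.float32))

# Non-overlapping sequences of length 2; one leading pad element is added:
print(buf["obs"].get_batch(training_length=2, sequential=True).squeeze())
# [0. 1. 2. 3. 4. 5.]  -> read as the sequences [0,a], [b,c], [d,e]

# Overlapping sequences of length 2:
print(buf["obs"].get_batch(training_length=2, sequential=False).squeeze())
# [1. 2. 2. 3. 3. 4. 4. 5.]  -> [a,b], [b,c], [c,d], [d,e]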
def load_from_file(self, file_object):
def reset_field(self) -> None:
Loads the AgentBuffer from a file-like object.
Resets the AgentBufferField
with h5py.File(file_object) as read_file:
for key in list(read_file.keys()):
self[key] = Buffer.AgentBuffer.AgentBufferField()
# extend() will convert the numpy array's first dimension into a list
self[key].extend(read_file[key][()])
self[:] = []
self.update_buffer = self.AgentBuffer()
super(Buffer, self).__init__()
self.last_brain_info = None
self.last_take_action_outputs = None
super().__init__()
return "update buffer :\n\t{0}\nlocal_buffers :\n{1}".format(
str(self.update_buffer),
"\n".join(
["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()]
),
)
return ", ".join(["'{0}' : {1}".format(k, str(self[k])) for k in self.keys()])
def reset_agent(self) -> None:
"""
Resets the AgentBuffer
"""
for k in self.keys():
self[k].reset_field()
self.last_brain_info = None
self.last_take_action_outputs = None
self[key] = self.AgentBuffer()
return super(Buffer, self).__getitem__(key)
self[key] = self.AgentBufferField()
return super().__getitem__(key)
def check_length(self, key_list: List[str]) -> bool:
"""
Some methods will require that some fields have the same length.
check_length will return true if the fields in key_list
have the same length.
:param key_list: The fields whose lengths will be compared
"""
if len(key_list) < 2:
return True
length = None
for key in key_list:
if key not in self.keys():
return False
if (length is not None) and (length != len(self[key])):
return False
length = len(self[key])
return True
def shuffle(self, sequence_length: int, key_list: List[str] = None) -> None:
"""
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.
"""
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException(
"Unable to shuffle if the fields are not of same length"
)
s = np.arange(len(self[key_list[0]]) // sequence_length)
np.random.shuffle(s)
for key in key_list:
tmp: List[np.ndarray] = []
for i in s:
tmp += self[key][i * sequence_length : (i + 1) * sequence_length]
self[key][:] = tmp
def make_mini_batch(self, start: int, end: int) -> "AgentBuffer":
"""
Creates a mini-batch from buffer.
:param start: Starting index of buffer.
:param end: Ending index of buffer.
:return: Dict of mini batch.
"""
mini_batch = AgentBuffer()
for key in self:
mini_batch[key] = self[key][start:end]
return mini_batch
def sample_mini_batch(
self, batch_size: int, sequence_length: int = 1
) -> "AgentBuffer":
"""
Creates a mini-batch from a random start and end.
:param batch_size: number of elements to withdraw.
:param sequence_length: Length of sequences to sample.
Number of sequences to sample will be batch_size/sequence_length.
"""
num_seq_to_sample = batch_size // sequence_length
mini_batch = AgentBuffer()
buff_len = self.num_experiences
num_sequences_in_buffer = buff_len // sequence_length
start_idxes = (
np.random.randint(num_sequences_in_buffer, size=num_seq_to_sample)
* sequence_length
) # Sample random sequence starts
for i in start_idxes:
for key in self:
mini_batch[key].extend(self[key][i : i + sequence_length])
return mini_batch
def save_to_file(self, file_object: BinaryIO) -> None:
"""
Saves the AgentBuffer to a file-like object.
"""
with h5py.File(file_object) as write_file:
for key, data in self.items():
write_file.create_dataset(key, data=data, dtype="f", compression="gzip")
def reset_update_buffer(self):
def load_from_file(self, file_object: BinaryIO) -> None:
Resets the update buffer
Loads the AgentBuffer from a file-like object.
self.update_buffer.reset_agent()
with h5py.File(file_object) as read_file:
for key in list(read_file.keys()):
self[key] = AgentBuffer.AgentBufferField()
# extend() will convert the numpy array's first dimension into a list
self[key].extend(read_file[key][()])
def truncate_update_buffer(self, max_length, sequence_length=1):
def truncate(self, max_length: int, sequence_length: int = 1) -> None:
Truncates the update buffer to a certain length.
Truncates the buffer to a certain length.
current_length = len(next(iter(self.update_buffer.values())))
current_length = self.num_experiences
for _key in self.update_buffer.keys():
self.update_buffer[_key] = self.update_buffer[_key][
current_length - max_length :
]
for _key in self.keys():
self[_key] = self[_key][current_length - max_length :]
def reset_local_buffers(self):
"""
Resets all the local buffers
@property
def num_experiences(self) -> int:
agent_ids = list(self.keys())
for k in agent_ids:
self[k].reset_agent()
def append_update_buffer(
self, agent_id, key_list=None, batch_size=None, training_length=None
):
"""
Appends the buffer of an agent to the update buffer.
:param agent_id: The id of the agent which data will be appended
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
if key_list is None:
key_list = self[agent_id].keys()
if not self[agent_id].check_length(key_list):
raise BufferException(
"The length of the fields {0} for agent {1} where not of same length".format(
key_list, agent_id
)
)
for field_key in key_list:
self.update_buffer[field_key].extend(
self[agent_id][field_key].get_batch(
batch_size=batch_size, training_length=training_length
)
)
The number of agent experiences in the AgentBuffer, i.e. the length of the buffer.
def append_all_agent_batch_to_update_buffer(
self, key_list=None, batch_size=None, training_length=None
):
An experience consists of one element across all of the fields of this AgentBuffer.
Note that these all have to be the same length, otherwise shuffle and append_to_update_buffer
will fail.
Appends the buffer of all agents to the update buffer.
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
for agent_id in self.keys():
self.append_update_buffer(agent_id, key_list, batch_size, training_length)
if self.values():
return len(next(iter(self.values())))
else:
return 0
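Sketch of the flat AgentBuffer that now serves as the trainers' update buffer, exercising num_experiences, shuffle and make_mini_batch as defined above (field names are arbitrary):

import numpy as np
from mlagents.trainers.buffer import AgentBuffer

update_buffer = AgentBuffer()
for i in range(8):
    update_buffer["actions"].append(np.array([i], dtype=np.float32))
    update_buffer["rewards"].append(np.array([1.0], dtype=np.float32))

assert update_buffer.num_experiences == 8     # length shared by all fields
update_buffer.shuffle(sequence_length=2)      # shuffles whole sequences, fields stay aligned
batch = update_buffer.make_mini_batch(start=0, end=4)
assert batch.keys() == update_buffer.keys()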

15  ml-agents/mlagents/trainers/components/bc/module.py


self.batch_size = batch_size if batch_size else default_batch_size
self.num_epoch = num_epoch if num_epoch else default_num_epoch
self.n_sequences = max(
min(
self.batch_size, len(self.demonstration_buffer.update_buffer["actions"])
)
min(self.batch_size, self.demonstration_buffer.num_experiences)
// policy.sequence_length,
1,
)

batch_losses = []
possible_demo_batches = (
len(self.demonstration_buffer.update_buffer["actions"]) // self.n_sequences
self.demonstration_buffer.num_experiences // self.n_sequences
)
possible_batches = possible_demo_batches

for _ in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle(
self.demonstration_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
if max_batches == 0:

for i in range(num_batches // self.policy.sequence_length):
demo_update_buffer = self.demonstration_buffer.update_buffer
demo_update_buffer = self.demonstration_buffer
start = i * self.n_sequences * self.policy.sequence_length
end = (i + 1) * self.n_sequences * self.policy.sequence_length
mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)

(
self.n_sequences * self.policy.sequence_length,
sum(self.policy.model.brain.vector_action_space_size),
)
),
dtype=np.float32,
)
if self.policy.model.brain.vector_observation_space_size > 0:
feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]

]
if self.use_recurrent:
feed_dict[self.policy.model.memory_in] = np.zeros(
[self.n_sequences, self.policy.m_size]
[self.n_sequences, self.policy.m_size], dtype=np.float32
)
if not self.policy.model.brain.vector_action_space_type == "continuous":
feed_dict[self.policy.model.prev_action] = mini_batch_demo[

7  ml-agents/mlagents/trainers/components/reward_signals/__init__.py


:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
return RewardSignalResult(
self.strength * np.zeros(len(current_info.agents)),
np.zeros(len(current_info.agents)),
self.strength * np.zeros(len(current_info.agents), dtype=np.float32),
np.zeros(len(current_info.agents), dtype=np.float32),
)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:

"""
mini_batch_len = len(next(iter(mini_batch.values())))
return RewardSignalResult(
self.strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len)
self.strength * np.zeros(mini_batch_len, dtype=np.float32),
np.zeros(mini_batch_len, dtype=np.float32),
)
def prepare_update(

2  ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


:param next_info: The BrainInfo from the next timestep.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
unscaled_reward = np.array(next_info.rewards)
unscaled_reward = np.array(next_info.rewards, dtype=np.float32)
scaled_reward = self.strength * unscaled_reward
return RewardSignalResult(scaled_reward, unscaled_reward)

5  ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


:return: Feed_dict for update process.
"""
max_num_experiences = min(
len(mini_batch["actions"]),
len(self.demonstration_buffer.update_buffer["actions"]),
len(mini_batch["actions"]), self.demonstration_buffer.num_experiences
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch.items():

mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
mini_batch_demo = self.demonstration_buffer.sample_mini_batch(
len(mini_batch["actions"]), 1
)

9  ml-agents/mlagents/trainers/curriculum.py


class Curriculum(object):
def __init__(self, location, default_reset_parameters):
def __init__(self, location):
:param default_reset_parameters: Set of reset parameters for
environment.
"""
self.max_lesson_num = 0
self.measure = None

parameters = self.data["parameters"]
for key in parameters:
if key not in default_reset_parameters:
raise CurriculumConfigError(
"The parameter {0} in Curriculum {1} is not present in "
"the Environment".format(key, location)
)
if len(parameters[key]) != self.max_lesson_num + 1:
raise CurriculumConfigError(
"The parameter {0} in Curriculum {1} must have {2} values "

100  ml-agents/mlagents/trainers/demo_loader.py


import os
from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,

DemonstrationMetaProto,
)
from mlagents.envs.timers import timed, hierarchical_timer
from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore

@timed
) -> Buffer:
) -> AgentBuffer:
demo_buffer = Buffer()
demo_process_buffer = ProcessingBuffer()
demo_buffer = AgentBuffer()
for idx, experience in enumerate(pair_infos):
if idx > len(pair_infos) - 2:
break

next_brain_info = BrainInfo.from_agent_proto(
0, [next_pair_info.agent_info], brain_params
)
previous_action = np.array(pair_infos[idx].action_info.vector_actions) * 0
previous_action = (
np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
)
previous_action = np.array(pair_infos[idx - 1].action_info.vector_actions)
demo_buffer[0].last_brain_info = current_brain_info
demo_buffer[0]["done"].append(next_brain_info.local_done[0])
demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])
previous_action = np.array(
pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
)
demo_process_buffer[0].last_brain_info = current_brain_info
demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
demo_buffer[0]["visual_obs%d" % i].append(
demo_process_buffer[0]["visual_obs%d" % i].append(
demo_buffer[0]["vector_obs"].append(
demo_process_buffer[0]["vector_obs"].append(
demo_buffer[0]["actions"].append(current_pair_info.action_info.vector_actions)
demo_buffer[0]["prev_action"].append(previous_action)
demo_process_buffer[0]["actions"].append(
current_pair_info.action_info.vector_actions
)
demo_process_buffer[0]["prev_action"].append(previous_action)
demo_buffer.append_update_buffer(
0, batch_size=None, training_length=sequence_length
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
demo_buffer.reset_local_buffers()
demo_buffer.append_update_buffer(
0, batch_size=None, training_length=sequence_length
demo_process_buffer.reset_local_buffers()
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
@timed
) -> Tuple[BrainParameters, Buffer]:
) -> Tuple[BrainParameters, AgentBuffer]:
"""
Loads demonstration file and uses it to fill training buffer.
:param file_path: Location of demonstration file (.demo).

return brain_params, demo_buffer
@timed
def load_demonstration(
file_path: str
) -> Tuple[BrainParameters, List[AgentInfoActionPairProto], int]:

info_action_pairs = []
total_expected = 0
for _file_path in file_paths:
data = open(_file_path, "rb").read()
next_pos, pos, obs_decoded = 0, 0, 0
while pos < len(data):
next_pos, pos = _DecodeVarint32(data, pos)
if obs_decoded == 0:
meta_data_proto = DemonstrationMetaProto()
meta_data_proto.ParseFromString(data[pos : pos + next_pos])
total_expected += meta_data_proto.number_steps
pos = INITIAL_POS
if obs_decoded == 1:
brain_param_proto = BrainParametersProto()
brain_param_proto.ParseFromString(data[pos : pos + next_pos])
pos += next_pos
if obs_decoded > 1:
agent_info_action = AgentInfoActionPairProto()
agent_info_action.ParseFromString(data[pos : pos + next_pos])
if brain_params is None:
brain_params = BrainParameters.from_proto(
brain_param_proto, agent_info_action.agent_info
)
info_action_pairs.append(agent_info_action)
if len(info_action_pairs) == total_expected:
break
pos += next_pos
obs_decoded += 1
with open(_file_path, "rb") as fp:
with hierarchical_timer("read_file"):
data = fp.read()
next_pos, pos, obs_decoded = 0, 0, 0
while pos < len(data):
next_pos, pos = _DecodeVarint32(data, pos)
if obs_decoded == 0:
meta_data_proto = DemonstrationMetaProto()
meta_data_proto.ParseFromString(data[pos : pos + next_pos])
total_expected += meta_data_proto.number_steps
pos = INITIAL_POS
if obs_decoded == 1:
brain_param_proto = BrainParametersProto()
brain_param_proto.ParseFromString(data[pos : pos + next_pos])
pos += next_pos
if obs_decoded > 1:
agent_info_action = AgentInfoActionPairProto()
agent_info_action.ParseFromString(data[pos : pos + next_pos])
if brain_params is None:
brain_params = BrainParameters.from_proto(
brain_param_proto, agent_info_action.agent_info
)
info_action_pairs.append(agent_info_action)
if len(info_action_pairs) == total_expected:
break
pos += next_pos
obs_decoded += 1
return brain_params, info_action_pairs, total_expected
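Hypothetical usage of the demonstration loader after this change. Only the return type (BrainParameters, AgentBuffer) is shown in the diff; the helper name demo_to_buffer and the .demo path are assumptions.

from mlagents.trainers.demo_loader import demo_to_buffer  # name assumed

brain_params, demo_buffer = demo_to_buffer("Demos/Expert3DBall.demo", sequence_length=1)
print(brain_params.brain_name, demo_buffer.num_experiences)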

70  ml-agents/mlagents/trainers/learn.py


from mlagents.envs.exception import SamplerException
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs.subprocess_env_manager import SubprocessEnvManager
from mlagents.envs.side_channel.side_channel import SideChannel
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfig
class CommandLineOptions(NamedTuple):

num_envs: int
curriculum_folder: Optional[str]
lesson: int
slow: bool
no_graphics: bool
multi_gpu: bool # ?
trainer_config_path: str

cpu: bool
@property
def fast_simulation(self) -> bool:
return not self.slow
width: int
height: int
quality_level: int
time_scale: float
target_frame_rate: int
@staticmethod
def from_argparse(args: Any) -> "CommandLineOptions":

"--seed", default=-1, type=int, help="Random seed used for training"
)
parser.add_argument(
"--slow", action="store_true", help="Whether to run the game at training speed"
)
parser.add_argument(
"--train",
default=False,
dest="train_model",

parser.add_argument("--version", action="version", version=get_version_string())
eng_conf = parser.add_argument_group(title="Engine Configuration")
eng_conf.add_argument(
"--width",
default=84,
type=int,
help="The width of the executable window of the environment(s)",
)
eng_conf.add_argument(
"--height",
default=84,
type=int,
help="The height of the executable window of the environment(s)",
)
eng_conf.add_argument(
"--quality-level",
default=5,
type=int,
help="The quality level of the environment(s)",
)
eng_conf.add_argument(
"--time-scale",
default=20,
type=float,
help="The time scale of the Unity environment(s)",
)
eng_conf.add_argument(
"--target-frame-rate",
default=-1,
type=int,
help="The target frame rate of the Unity environment(s)",
)
args = parser.parse_args(argv)
return CommandLineOptions.from_argparse(args)

port,
options.env_args,
)
env = SubprocessEnvManager(env_factory, options.num_envs)
engine_config = EngineConfig(
options.width,
options.height,
options.quality_level,
options.time_scale,
options.target_frame_rate,
)
env = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
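For reference, a minimal sketch of how the new flags map onto EngineConfig (field order taken from the constructor call above; values are the CLI defaults, other mlagents-learn arguments elided):

from mlagents.envs.side_channel.engine_configuration_channel import EngineConfig

# width, height, quality_level, time_scale, target_frame_rate
engine_config = EngineConfig(84, 84, 5, 20.0, -1)

# Equivalent command-line form (flag names from the parser above):
#   mlagents-learn trainer_config.yaml --width=84 --height=84 --quality-level=5 \
#       --time-scale=20 --target-frame-rate=-1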
options.sampler_file_path, env.reset_parameters, run_seed
options.sampler_file_path, run_seed
)
trainer_factory = TrainerFactory(
trainer_config,

maybe_meta_curriculum,
options.train_model,
run_seed,
options.fast_simulation,
sampler_manager,
resampling_interval,
)

tc.start_learning(env)
def create_sampler_manager(sampler_file_path, env_reset_params, run_seed=None):
def create_sampler_manager(sampler_file_path, run_seed=None):
sampler_config = None
resample_interval = None
if sampler_file_path is not None:

return None
else:
meta_curriculum = MetaCurriculum(curriculum_folder, env.reset_parameters)
meta_curriculum = MetaCurriculum(curriculum_folder)
# TODO: Should be able to start learning at different lesson numbers
# for each curriculum.
meta_curriculum.set_all_curriculums_to_lesson_num(lesson)

seed: Optional[int],
start_port: int,
env_args: Optional[List[str]],
) -> Callable[[int], BaseUnityEnvironment]:
) -> Callable[[int, List[SideChannel]], BaseUnityEnvironment]:
if env_path is not None:
# Strip out executable extensions if passed
env_path = (

seed_count = 10000
seed_pool = [np.random.randint(0, seed_count) for _ in range(seed_count)]
def create_unity_environment(worker_id: int) -> UnityEnvironment:
def create_unity_environment(
worker_id: int, side_channels: List[SideChannel]
) -> UnityEnvironment:
env_seed = seed
if not env_seed:
env_seed = seed_pool[worker_id % len(seed_pool)]

no_graphics=no_graphics,
base_port=start_port,
args=env_args,
side_channels=side_channels,
)
return create_unity_environment

8  ml-agents/mlagents/trainers/meta_curriculum.py


"""Contains the MetaCurriculum class."""
import os
from typing import Any, Dict, Set
from typing import Dict, Set
from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.exception import MetaCurriculumError

particular brain in the environment.
"""
def __init__(
self, curriculum_folder: str, default_reset_parameters: Dict[str, Any]
):
def __init__(self, curriculum_folder: str):
"""Initializes a MetaCurriculum object.
Args:

curriculum_filepath = os.path.join(
curriculum_folder, curriculum_filename
)
curriculum = Curriculum(curriculum_filepath, default_reset_parameters)
curriculum = Curriculum(curriculum_filepath)
config_keys: Set[str] = set(curriculum.get_config().keys())
# Check if any two curriculums use the same reset params.

7  ml-agents/mlagents/trainers/ppo/policy.py


]
if self.use_vec_obs:
feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
agent_id = brain_info.agents[idx]
feed_dict[self.model.memory_in] = self.retrieve_memories([idx])
feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
feed_dict[self.model.prev_action] = self.retrieve_previous_action([idx])
feed_dict[self.model.prev_action] = self.retrieve_previous_action(
[agent_id]
)
value_estimates = self.sess.run(self.model.value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}

61  ml-agents/mlagents/trainers/ppo/trainer.py


if self.is_training:
self.policy.update_normalization(next_info.vector_observations)
for l in range(len(next_info.agents)):
agent_actions = self.training_buffer[next_info.agents[l]]["actions"]
agent_actions = self.processing_buffer[next_info.agents[l]]["actions"]
if (
next_info.local_done[l]
or len(agent_actions) > self.trainer_parameters["time_horizon"]

bootstrapping_info = self.training_buffer[agent_id].last_brain_info
bootstrapping_info = self.processing_buffer[
agent_id
].last_brain_info
idx = bootstrapping_info.agents.index(agent_id)
else:
bootstrapping_info = next_info

for name in self.policy.reward_signals:
bootstrap_value = value_next[name]
local_rewards = self.training_buffer[agent_id][
local_rewards = self.processing_buffer[agent_id][
local_value_estimates = self.training_buffer[agent_id][
local_value_estimates = self.processing_buffer[agent_id][
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(

)
local_return = local_advantage + local_value_estimates
# This is later used as the target for the different value estimates
self.training_buffer[agent_id]["{}_returns".format(name)].set(
self.processing_buffer[agent_id]["{}_returns".format(name)].set(
self.training_buffer[agent_id]["{}_advantage".format(name)].set(
self.processing_buffer[agent_id]["{}_advantage".format(name)].set(
global_advantages = list(np.mean(np.array(tmp_advantages), axis=0))
global_returns = list(np.mean(np.array(tmp_returns), axis=0))
self.training_buffer[agent_id]["advantages"].set(global_advantages)
self.training_buffer[agent_id]["discounted_returns"].set(global_returns)
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
global_returns = list(
np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)
)
self.processing_buffer[agent_id]["advantages"].set(global_advantages)
self.processing_buffer[agent_id]["discounted_returns"].set(
global_returns
)
self.training_buffer.append_update_buffer(
self.processing_buffer.append_to_update_buffer(
self.update_buffer,
self.training_buffer[agent_id].reset_agent()
self.processing_buffer[agent_id].reset_agent()
if next_info.local_done[l]:
self.stats["Environment/Episode Length"].append(
self.episode_steps.get(agent_id, 0)

actions = take_action_outputs["action"]
if self.policy.use_continuous_act:
actions_pre = take_action_outputs["pre_action"]
self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
self.processing_buffer[agent_id]["actions_pre"].append(
actions_pre[agent_idx]
)
self.training_buffer[agent_id]["random_normal_epsilon"].append(
self.processing_buffer[agent_id]["random_normal_epsilon"].append(
self.training_buffer[agent_id]["actions"].append(actions[agent_idx])
self.training_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])
self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
self.processing_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])
def add_rewards_outputs(
self,

"""
for name, reward_result in rewards_out.reward_signals.items():
# 0 because we use the scaled reward to train the agent
self.training_buffer[agent_id]["{}_rewards".format(name)].append(
self.processing_buffer[agent_id]["{}_rewards".format(name)].append(
self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
self.processing_buffer[agent_id]["{}_value_estimates".format(name)].append(
values[name][agent_idx][0]
)

:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = len(self.training_buffer.update_buffer["actions"])
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.trainer_parameters["buffer_size"]
def update_policy(self):

"""
buffer_length = len(self.training_buffer.update_buffer["actions"])
buffer_length = self.update_buffer.num_experiences
self.trainer_metrics.start_policy_update_timer(
number_experiences=buffer_length,
mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),

int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
)
advantages = self.training_buffer.update_buffer["advantages"].get_batch()
self.training_buffer.update_buffer["advantages"].set(
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
self.training_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
buffer = self.training_buffer.update_buffer
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for l in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.policy.update(

44  ml-agents/mlagents/trainers/rl_trainer.py


from mlagents.envs.brain import BrainInfo
from mlagents.envs.action_info import ActionInfoOutputs
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult

# used for reporting only. We always want to report the environment reward to Tensorboard, regardless
# of what reward signals are actually present.
self.collected_rewards = {"environment": {}}
self.training_buffer = Buffer()
self.processing_buffer = ProcessingBuffer()
self.update_buffer = AgentBuffer()
self.episode_steps = {}
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:

agents = []
action_masks = []
for agent_id in next_info.agents:
agent_brain_info = self.training_buffer[agent_id].last_brain_info
agent_brain_info = self.processing_buffer[agent_id].last_brain_info
if agent_brain_info is None:
agent_brain_info = next_info
agent_index = agent_brain_info.agents.index(agent_id)

)
for agent_id in curr_info.agents:
self.training_buffer[agent_id].last_brain_info = curr_info
self.training_buffer[
self.processing_buffer[agent_id].last_brain_info = curr_info
self.processing_buffer[
agent_id
].last_take_action_outputs = take_action_outputs

curr_to_use, take_action_outputs["action"], next_info
)
# Store the environment reward
tmp_environment = np.array(next_info.rewards)
tmp_environment = np.array(next_info.rewards, dtype=np.float32)
rewards_out = AllRewardsOutput(
reward_signals=tmp_reward_signal_outs, environment=tmp_environment

stored_info = self.training_buffer[agent_id].last_brain_info
stored_take_action_outputs = self.training_buffer[
stored_info = self.processing_buffer[agent_id].last_brain_info
stored_take_action_outputs = self.processing_buffer[
agent_id
].last_take_action_outputs
if stored_info is not None:

for i, _ in enumerate(stored_info.visual_observations):
self.training_buffer[agent_id]["visual_obs%d" % i].append(
self.processing_buffer[agent_id]["visual_obs%d" % i].append(
self.training_buffer[agent_id]["next_visual_obs%d" % i].append(
next_info.visual_observations[i][next_idx]
)
self.processing_buffer[agent_id][
"next_visual_obs%d" % i
].append(next_info.visual_observations[i][next_idx])
self.training_buffer[agent_id]["vector_obs"].append(
self.processing_buffer[agent_id]["vector_obs"].append(
self.training_buffer[agent_id]["next_vector_in"].append(
self.processing_buffer[agent_id]["next_vector_in"].append(
self.training_buffer[agent_id]["memory"].append(
self.processing_buffer[agent_id]["memory"].append(
self.training_buffer[agent_id]["masks"].append(1.0)
self.training_buffer[agent_id]["done"].append(
self.processing_buffer[agent_id]["masks"].append(1.0)
self.processing_buffer[agent_id]["done"].append(
next_info.local_done[next_idx]
)
# Add the outputs of the last eval

self.training_buffer[agent_id]["action_mask"].append(
self.processing_buffer[agent_id]["action_mask"].append(
self.training_buffer[agent_id]["prev_action"].append(
self.processing_buffer[agent_id]["prev_action"].append(
self.policy.retrieve_previous_action([agent_id])[0, :]
)

A signal that the Episode has ended. The buffer must be reset.
Get only called when the academy resets.
"""
self.training_buffer.reset_local_buffers()
self.processing_buffer.reset_local_buffers()
for agent_id in self.episode_steps:
self.episode_steps[agent_id] = 0
for rewards in self.collected_rewards.values():

Clear the buffers that have been built up during inference. If
we're not training, this should be called instead of update_policy.
"""
self.training_buffer.reset_update_buffer()
self.update_buffer.reset_agent()
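Sketch of the ProcessingBuffer -> AgentBuffer hand-off the trainers now perform (agent ids and the field name are arbitrary):

import numpy as np
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.buffer import AgentBuffer

processing_buffer = ProcessingBuffer()
update_buffer = AgentBuffer()
for agent_id in (0, 1):
    for step in range(4):
        processing_buffer[agent_id]["actions"].append(
            np.array([step], dtype=np.float32)
        )
for agent_id in (0, 1):
    processing_buffer.append_to_update_buffer(
        update_buffer, agent_id, batch_size=None, training_length=1
    )
assert update_buffer.num_experiences == 8
processing_buffer.reset_local_buffers()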
def add_policy_outputs(
self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int

47  ml-agents/mlagents/trainers/sac/trainer.py


)
LOGGER.debug(
"Loaded update buffer with {} sequences".format(
len(self.training_buffer.update_buffer["actions"])
self.update_buffer.num_experiences
)
)

filename = os.path.join(self.policy.model_path, "last_replay_buffer.hdf5")
LOGGER.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:
self.training_buffer.update_buffer.save_to_file(file_object)
self.update_buffer.save_to_file(file_object)
def load_replay_buffer(self) -> None:
"""

LOGGER.info("Loading Experience Replay Buffer from {}".format(filename))
with open(filename, "rb+") as file_object:
self.training_buffer.update_buffer.load_from_file(file_object)
self.update_buffer.load_from_file(file_object)
len(self.training_buffer.update_buffer["actions"])
self.update_buffer.num_experiences
)
)

Takes the output of the last action and stores it into the training buffer.
"""
actions = take_action_outputs["action"]
self.training_buffer[agent_id]["actions"].append(actions[agent_idx])
self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
def add_rewards_outputs(
self,

"""
Takes the value output of the last action and stores it into the training buffer.
"""
self.training_buffer[agent_id]["environment_rewards"].append(
self.processing_buffer[agent_id]["environment_rewards"].append(
rewards_out.environment[agent_next_idx]
)

if self.is_training:
self.policy.update_normalization(next_info.vector_observations)
for l in range(len(next_info.agents)):
agent_actions = self.training_buffer[next_info.agents[l]]["actions"]
agent_actions = self.processing_buffer[next_info.agents[l]]["actions"]
if (
next_info.local_done[l]
or len(agent_actions) >= self.trainer_parameters["time_horizon"]

# Bootstrap using last brain info. Set last element to duplicate obs and remove dones.
if next_info.max_reached[l]:
bootstrapping_info = self.training_buffer[agent_id].last_brain_info
bootstrapping_info = self.processing_buffer[
agent_id
].last_brain_info
self.training_buffer[agent_id]["next_visual_obs%d" % i][
self.processing_buffer[agent_id]["next_visual_obs%d" % i][
self.training_buffer[agent_id]["next_vector_in"][
self.processing_buffer[agent_id]["next_vector_in"][
self.training_buffer[agent_id]["done"][-1] = False
self.processing_buffer[agent_id]["done"][-1] = False
self.training_buffer.append_update_buffer(
self.processing_buffer.append_to_update_buffer(
self.update_buffer,
self.training_buffer[agent_id].reset_agent()
self.processing_buffer[agent_id].reset_agent()
if next_info.local_done[l]:
self.stats["Environment/Episode Length"].append(
self.episode_steps.get(agent_id, 0)

:return: A boolean corresponding to whether or not update_model() can be run
"""
return (
len(self.training_buffer.update_buffer["actions"])
>= self.trainer_parameters["batch_size"]
self.update_buffer.num_experiences >= self.trainer_parameters["batch_size"]
and self.step >= self.trainer_parameters["buffer_init_steps"]
)

"""
if self.step % self.train_interval == 0:
self.trainer_metrics.start_policy_update_timer(
number_experiences=len(self.training_buffer.update_buffer["actions"]),
number_experiences=self.update_buffer.num_experiences,
mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
)
self.update_sac_policy()

batch_update_stats: Dict[str, list] = defaultdict(list)
for _ in range(num_updates):
LOGGER.debug("Updating SAC policy at step {}".format(self.step))
buffer = self.training_buffer.update_buffer
buffer = self.update_buffer
len(self.training_buffer.update_buffer["actions"])
self.update_buffer.num_experiences
>= self.trainer_parameters["batch_size"]
):
sampled_minibatch = buffer.sample_mini_batch(

# Truncate update buffer if necessary. Truncate more than strictly needed to avoid truncating
# a large buffer at each update.
if (
len(self.training_buffer.update_buffer["actions"])
> self.trainer_parameters["buffer_size"]
):
self.training_buffer.truncate_update_buffer(
if self.update_buffer.num_experiences > self.trainer_parameters["buffer_size"]:
self.update_buffer.truncate(
int(self.trainer_parameters["buffer_size"] * BUFFER_TRUNCATE_PERCENT)
)

N times, then the reward signals are updated N times. Normally, the reward signal
and policy are updated in parallel.
"""
buffer = self.training_buffer.update_buffer
buffer = self.update_buffer
num_updates = self.reward_signal_updates_per_train
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1

49  ml-agents/mlagents/trainers/tests/__init__.py


import os
# Opt-in checking mode to ensure that we always create numpy arrays using float32
if os.getenv("TEST_ENFORCE_NUMPY_FLOAT32"):
# This file is imported by pytest multiple times, but this breaks the patching
# Removing the env variable seems the easiest way to prevent this.
del os.environ["TEST_ENFORCE_NUMPY_FLOAT32"]
import numpy as np
import traceback
__old_np_array = np.array
__old_np_zeros = np.zeros
__old_np_ones = np.ones
def _check_no_float64(arr, kwargs_dtype):
if arr.dtype == np.float64:
tb = traceback.extract_stack()
# tb[-1] in the stack is this function.
# tb[-2] is the wrapper function, e.g. np_array_no_float64
# we want the calling function, so use tb[-3]
filename = tb[-3].filename
# Only raise if this came from mlagents code, not tensorflow
if (
"ml-agents/mlagents" in filename
or "ml-agents-envs/mlagents" in filename
) and "tensorflow_to_barracuda.py" not in filename:
raise ValueError(
f"float64 array created. Set dtype=np.float32 instead of current dtype={kwargs_dtype}. "
f"Run pytest with TEST_ENFORCE_NUMPY_FLOAT32=1 to confirm fix."
)
def np_array_no_float64(*args, **kwargs):
res = __old_np_array(*args, **kwargs)
_check_no_float64(res, kwargs.get("dtype"))
return res
def np_zeros_no_float64(*args, **kwargs):
res = __old_np_zeros(*args, **kwargs)
_check_no_float64(res, kwargs.get("dtype"))
return res
def np_ones_no_float64(*args, **kwargs):
res = __old_np_ones(*args, **kwargs)
_check_no_float64(res, kwargs.get("dtype"))
return res
np.array = np_array_no_float64
np.zeros = np_zeros_no_float64
np.ones = np_ones_no_float64
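A self-contained sketch of the same guard technique (simplified: no stack inspection, so it flags any float64 array regardless of the caller, unlike the hook above):

import numpy as np

_orig_array = np.array

def _array_no_float64(*args, **kwargs):
    arr = _orig_array(*args, **kwargs)
    if arr.dtype == np.float64:
        raise ValueError("float64 array created; pass dtype=np.float32")
    return arr

np.array = _array_no_float64
np.array([1.0, 2.0], dtype=np.float32)    # fine
# np.array([1.0, 2.0])                    # would raise ValueError
np.array = _orig_array                    # restore the original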

48  ml-agents/mlagents/trainers/tests/mock_brain.py


import numpy as np
from mlagents.envs.brain import CameraResolution, BrainParameters
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
def create_mock_brainparams(

mock_braininfo = mock.Mock()
mock_braininfo.return_value.visual_observations = num_vis_observations * [
np.ones((num_agents, 84, 84, 3))
np.ones((num_agents, 84, 84, 3), dtype=np.float32)
num_agents * [num_vector_observations * [1]]
num_agents * [num_vector_observations * [1]], dtype=np.float32
num_agents * [num_discrete_branches * [0.5]]
num_agents * [num_discrete_branches * [0.5]], dtype=np.float32
num_agents * [num_vector_acts * [1.0]]
num_agents * [num_vector_acts * [1.0]], dtype=np.float32
num_agents * [num_vector_acts * [0.5]]
num_agents * [num_vector_acts * [0.5]], dtype=np.float32
mock_braininfo.return_value.memories = np.ones((num_agents, 8))
mock_braininfo.return_value.memories = np.ones((num_agents, 8), dtype=np.float32)
mock_braininfo.return_value.rewards = num_agents * [1.0]
mock_braininfo.return_value.local_done = num_agents * [False]
mock_braininfo.return_value.max_reached = num_agents * [100]

# If a key_list was given, remove those keys
if exclude_key_list:
for key in exclude_key_list:
if key in buffer.update_buffer:
buffer.update_buffer.pop(key)
if key in buffer:
buffer.pop(key)
buffer = Buffer()
buffer = ProcessingBuffer()
update_buffer = AgentBuffer()
# Make a buffer
for idx, experience in enumerate(brain_infos):
if idx > len(brain_infos) - 2:

fake_action_size = len(brain_params.vector_action_space_size)
if brain_params.vector_action_space_type == "continuous":
fake_action_size = brain_params.vector_action_space_size[0]
buffer[0]["actions"].append(np.zeros(fake_action_size))
buffer[0]["prev_action"].append(np.zeros(fake_action_size))
buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer[0]["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
np.ones(sum(brain_params.vector_action_space_size))
np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
buffer[0]["action_probs"].append(np.ones(buffer[0]["actions"][0].shape))
buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape))
buffer[0]["action_probs"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
)
buffer[0]["actions_pre"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
)
np.ones(buffer[0]["actions"][0].shape)
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
np.ones(np.sum(brain_params.vector_action_space_size))
np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
buffer[0]["memory"].append(np.ones(memory_size))
buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))
buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
return buffer
buffer.append_to_update_buffer(
update_buffer, 0, batch_size=None, training_length=sequence_length
)
return update_buffer
def setup_mock_env_and_brains(

12  ml-agents/mlagents/trainers/tests/test_bc.py


model.dropout_rate: 1.0,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.action_masks: np.ones([2, 2]),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.dropout_rate: 1.0,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.action_masks: np.ones([2, 2]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

71  ml-agents/mlagents/trainers/tests/test_buffer.py


import numpy as np
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
def assert_array(a, b):

assert la[i] == lb[i]
def construct_fake_buffer():
b = Buffer()
def construct_fake_processing_buffer():
b = ProcessingBuffer()
for fake_agent_id in range(4):
for step in range(9):
b[fake_agent_id]["vector_observation"].append(

def test_buffer():
b = construct_fake_buffer()
b = construct_fake_processing_buffer()
a = b[1]["vector_observation"].get_batch(
batch_size=2, training_length=1, sequential=True
)

)
b[4].reset_agent()
assert len(b[4]) == 0
b.append_update_buffer(3, batch_size=None, training_length=2)
b.append_update_buffer(2, batch_size=None, training_length=2)
assert len(b.update_buffer["action"]) == 20
assert np.array(b.update_buffer["action"]).shape == (20, 2)
update_buffer = AgentBuffer()
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
assert len(update_buffer["action"]) == 20
c = b.update_buffer.make_mini_batch(start=0, end=1)
assert c.keys() == b.update_buffer.keys()
assert np.array(update_buffer["action"]).shape == (20, 2)
c = update_buffer.make_mini_batch(start=0, end=1)
assert c.keys() == update_buffer.keys()
assert np.array(c["action"]).shape == (1, 2)

def test_buffer_sample():
b = construct_fake_buffer()
b.append_update_buffer(3, batch_size=None, training_length=2)
b.append_update_buffer(2, batch_size=None, training_length=2)
b = construct_fake_processing_buffer()
update_buffer = AgentBuffer()
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
mb = b.update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
assert mb.keys() == b.update_buffer.keys()
mb = update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
assert mb.keys() == update_buffer.keys()
mb = b.update_buffer.sample_mini_batch(batch_size=20, sequence_length=19)
assert mb.keys() == b.update_buffer.keys()
mb = update_buffer.sample_mini_batch(batch_size=20, sequence_length=19)
assert mb.keys() == update_buffer.keys()
def test_num_experiences():
b = construct_fake_processing_buffer()
update_buffer = AgentBuffer()
assert len(update_buffer["action"]) == 0
assert update_buffer.num_experiences == 0
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
assert len(update_buffer["action"]) == 20
assert update_buffer.num_experiences == 20
b = construct_fake_buffer()
b.append_update_buffer(3, batch_size=None, training_length=2)
b.append_update_buffer(2, batch_size=None, training_length=2)
b = construct_fake_processing_buffer()
update_buffer = AgentBuffer()
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
b.truncate_update_buffer(2)
assert len(b.update_buffer["action"]) == 2
update_buffer.truncate(2)
assert update_buffer.num_experiences == 2
b.append_update_buffer(3, batch_size=None, training_length=2)
b.append_update_buffer(2, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
b.truncate_update_buffer(4, sequence_length=3)
assert len(b.update_buffer["action"]) == 3
update_buffer.truncate(4, sequence_length=3)
assert update_buffer.num_experiences == 3
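In short, what used to be b.update_buffer plus append_update_buffer / truncate_update_buffer now lives on AgentBuffer itself: num_experiences replaces len(...["action"]) checks, and truncate and the mini-batch helpers operate directly on the buffer. A small sketch reusing the same fake data shape as these tests:

    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer
    from mlagents.trainers.agent_processor import ProcessingBuffer

    proc = ProcessingBuffer()
    for agent_id in (2, 3):
        for _ in range(9):
            proc[agent_id]["action"].append(np.zeros(2, dtype=np.float32))

    update_buffer = AgentBuffer()
    proc.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
    proc.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
    print(update_buffer.num_experiences)  # expected: 20 (9 steps per agent, padded to sequences of 2)

    mini = update_buffer.make_mini_batch(start=0, end=1)               # one experience
    batch = update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
    update_buffer.truncate(2)                                          # keep 2 experiences
    print(update_buffer.num_experiences)  # expected: 2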

8  ml-agents/mlagents/trainers/tests/test_curriculum.py


@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_init_curriculum_happy_path(mock_file, location, default_reset_parameters):
curriculum = Curriculum(location, default_reset_parameters)
curriculum = Curriculum(location)
assert curriculum._brain_name == "TestBrain"
assert curriculum.lesson_num == 0

mock_file, location, default_reset_parameters
):
with pytest.raises(CurriculumConfigError):
Curriculum(location, default_reset_parameters)
Curriculum(location)
curriculum = Curriculum(location, default_reset_parameters)
curriculum = Curriculum(location)
assert curriculum.lesson_num == 0
curriculum.lesson_num = 1

@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_get_config(mock_file):
curriculum = Curriculum("TestBrain.json", {"param1": 1, "param2": 1, "param3": 1})
curriculum = Curriculum("TestBrain.json")
assert curriculum.get_config() == {"param1": 0.7, "param2": 100, "param3": 0.2}
curriculum.lesson_num = 2
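The change in this file is the constructor: a Curriculum is now built from its JSON location alone and no longer receives the default reset parameters. A minimal usage sketch, assuming a curriculum file such as TestBrain.json exists on disk:

    from mlagents.trainers.curriculum import Curriculum

    curriculum = Curriculum("TestBrain.json")   # no default_reset_parameters argument
    print(curriculum.lesson_num)                # 0 until a lesson increment
    curriculum.lesson_num = 1
    print(curriculum.get_config())              # reset parameters for the current lesson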

4  ml-agents/mlagents/trainers/tests/test_demo_loader.py


assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1)
assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1
assert len(demo_buffer["actions"]) == total_expected - 1
def test_load_demo_dir():

assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1)
assert len(demo_buffer.update_buffer["actions"]) == total_expected - 1
assert len(demo_buffer["actions"]) == total_expected - 1
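demo_to_buffer now hands back an AgentBuffer directly, so demonstration fields are indexed on the returned buffer rather than through .update_buffer. A sketch, with the demo path as a placeholder and the import location assumed from this revision's trainers package:

    from mlagents.trainers.demo_loader import demo_to_buffer

    # "demos/test.demo" is a placeholder path to a recorded demonstration file.
    _, demo_buffer = demo_to_buffer("demos/test.demo", 1)  # 1 = sequence length
    print(len(demo_buffer["actions"]))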

4  ml-agents/mlagents/trainers/tests/test_learn.py


None,
False,
0,
True,
sampler_manager_mock.return_value,
None,
)

assert opt.run_id == "ppo"
assert opt.save_freq == 50000
assert opt.seed == -1
assert opt.fast_simulation is True
assert opt.train_model is False
assert opt.base_port == 5005
assert opt.num_envs == 1

"--num-runs=3",
"--save-freq=123456",
"--seed=7890",
"--slow",
"--train",
"--base-port=4004",
"--num-envs=2",

assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.fast_simulation is False
assert opt.train_model is True
assert opt.base_port == 4004
assert opt.num_envs == 2

9  ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


def test_init_meta_curriculum_happy_path(
listdir, mock_curriculum_init, mock_curriculum_get_config, default_reset_parameters
):
meta_curriculum = MetaCurriculum("test/", default_reset_parameters)
meta_curriculum = MetaCurriculum("test/")
assert len(meta_curriculum.brains_to_curriculums) == 2

calls = [
call("test/Brain1.json", default_reset_parameters),
call("test/Brain2.json", default_reset_parameters),
]
calls = [call("test/Brain1.json"), call("test/Brain2.json")]
mock_curriculum_init.assert_has_calls(calls)

with pytest.raises(MetaCurriculumError):
MetaCurriculum("test/", default_reset_parameters)
MetaCurriculum("test/")
@patch("mlagents.trainers.curriculum.Curriculum")
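MetaCurriculum follows the same simplification: it is constructed from the curriculum folder only, and each per-brain Curriculum inside it is created with just its JSON path. A short sketch (the meta_curriculum import path is assumed):

    from mlagents.trainers.meta_curriculum import MetaCurriculum

    meta_curriculum = MetaCurriculum("test/")   # folder of per-brain curriculum JSON files
    print(len(meta_curriculum.brains_to_curriculums))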

6  ml-agents/mlagents/trainers/tests/test_policy.py


test_seed = 3
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
policy_eval_out = {
"action": np.array([1.0]),
"memory_out": np.array([[2.5]]),
"value": np.array([1.1]),
"action": np.array([1.0], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}
policy.evaluate = MagicMock(return_value=policy_eval_out)
brain_info_with_agents = BrainInfo(

51  ml-agents/mlagents/trainers/tests/test_ppo.py


model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.epsilon: np.array([[0, 1], [2, 3]]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.epsilon: np.array([[0, 1], [2, 3]], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.action_masks: np.ones([2, 2]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.action_masks: np.ones([2, 2]),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 1,
model.sequence_length: 2,
model.prev_action: [[0], [0]],
model.memory_in: np.zeros((1, memory_size)),
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.action_masks: np.ones([1, 2]),
model.action_masks: np.ones([1, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
model.memory_in: np.zeros((1, memory_size)),
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.epsilon: np.array([[0, 1]]),
}

def test_rl_functions():
rewards = np.array([0.0, 0.0, 0.0, 1.0])
rewards = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)
np.testing.assert_array_almost_equal(returns, np.array([0.729, 0.81, 0.9, 1.0]))
np.testing.assert_array_almost_equal(
returns, np.array([0.729, 0.81, 0.9, 1.0], dtype=np.float32)
)
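The expected values above are just discounted returns with gamma = 0.9 and no bootstrap value: G[t] = r[t] + 0.9 * G[t+1], computed backwards from the final step. The helper under test lives in the trainers package; the standalone sketch below only reproduces the arithmetic.

    import numpy as np

    def discounted_returns(rewards, gamma=0.9, value_next=0.0):
        returns = np.zeros_like(rewards)
        running = value_next
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    print(discounted_returns(np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)))
    # ~ [0.729, 0.81, 0.9, 1.0]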
def test_trainer_increment_step(dummy_config):

# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
# Mock out reward signal eval
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
buffer.update_buffer["extrinsic_returns"] = buffer.update_buffer["rewards"]
buffer.update_buffer["extrinsic_value_estimates"] = buffer.update_buffer["rewards"]
buffer.update_buffer["curiosity_rewards"] = buffer.update_buffer["rewards"]
buffer.update_buffer["curiosity_returns"] = buffer.update_buffer["rewards"]
buffer.update_buffer["curiosity_value_estimates"] = buffer.update_buffer["rewards"]
buffer["extrinsic_rewards"] = buffer["rewards"]
buffer["extrinsic_returns"] = buffer["rewards"]
buffer["extrinsic_value_estimates"] = buffer["rewards"]
buffer["curiosity_rewards"] = buffer["rewards"]
buffer["curiosity_returns"] = buffer["rewards"]
buffer["curiosity_value_estimates"] = buffer["rewards"]
trainer.training_buffer = buffer
trainer.update_buffer = buffer
trainer.update_policy()
# Make batch length a larger multiple of sequence length
trainer.trainer_parameters["batch_size"] = 128

rewardsout = AllRewardsOutput(
reward_signals={
"extrinsic": RewardSignalResult(
scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
environment=np.array([1.0, 1.0]),
environment=np.array([1.0, 1.0], dtype=np.float32),
values = {"extrinsic": np.array([[2.0]])}
values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
agent_id = "123"
idx = 0
# make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.

agent_idx=idx,
agent_next_idx=next_idx,
)
assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
if __name__ == "__main__":

4  ml-agents/mlagents/trainers/tests/test_reward_signals.py


brain_info = brain_infos[env.external_brain_names[0]]
next_brain_info = env.step()[env.external_brain_names[0]]
# Test evaluate
action = np.ones((len(brain_info.agents), policy.num_branches))
action = np.ones((len(brain_info.agents), policy.num_branches), dtype=np.float32)
rsig_result = policy.reward_signals[reward_signal_name].evaluate(
brain_info, action, next_brain_info
)

def reward_signal_update(env, policy, reward_signal_name):
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
policy.model, buffer.update_buffer.make_mini_batch(0, 10), 2
policy.model, buffer.make_mini_batch(0, 10), 2
)
out = policy._execute_model(
feed_dict, policy.reward_signals[reward_signal_name].update_dict

22  ml-agents/mlagents/trainers/tests/test_rl_trainer.py


import mlagents.trainers.tests.mock_brain as mb
import numpy as np
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer
from mlagents.trainers.tests.test_buffer import construct_fake_processing_buffer
from mlagents.trainers.buffer import AgentBuffer
@pytest.fixture

def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1))
mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1))
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
return mock_policy

fake_action_outputs = {
"action": [0.1, 0.1],
"value_heads": {},
"entropy": np.array([1.0]),
"entropy": np.array([1.0], dtype=np.float32),
"learning_rate": 1.0,
}
mock_braininfo = mb.create_mock_braininfo(

trainer.end_episode()
for agent_id in trainer.episode_steps:
assert trainer.episode_steps[agent_id] == 0
assert len(trainer.training_buffer[agent_id]["action"]) == 0
assert len(trainer.processing_buffer[agent_id]["action"]) == 0
for rewards in trainer.collected_rewards.values():
for agent_id in rewards:
assert rewards[agent_id] == 0

trainer = create_rl_trainer()
trainer.training_buffer = construct_fake_buffer()
trainer.training_buffer.append_update_buffer(2, batch_size=None, training_length=2)
trainer.processing_buffer = construct_fake_processing_buffer()
trainer.update_buffer = AgentBuffer()
trainer.processing_buffer.append_to_update_buffer(
trainer.update_buffer, 2, batch_size=None, training_length=2
)
for _, arr in trainer.training_buffer.update_buffer.items():
for _, arr in trainer.update_buffer.items():
assert len(arr) == 0

63  ml-agents/mlagents/trainers/tests/test_sac.py


assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
# Test update
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
policy.update(
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
env.close()

)
# Test update, while removing PPO-specific buffer elements.
buffer = mb.simulate_rollout(
update_buffer = mb.simulate_rollout(
env,
policy,
BUFFER_INIT_SAMPLES,

# Mock out reward signal eval
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
buffer.update_buffer["curiosity_rewards"] = buffer.update_buffer["rewards"]
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["curiosity_rewards"] = update_buffer["rewards"]
{"curiosity": buffer.update_buffer},
num_sequences=len(buffer.update_buffer["actions"]),
{"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
)
env.close()

assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
policy.update(
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
env.close()

assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
run_out = policy.update(
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
run_out = policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
assert type(run_out) is dict

assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
policy.update(buffer.update_buffer, num_sequences=2)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
policy.update(update_buffer, num_sequences=2)
env.close()

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3]),
model.visual_in[1]: np.ones([2, 40, 30, 3]),
model.action_masks: np.ones([2, 2]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.action_masks: np.ones([2, 2]),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 1,
model.sequence_length: 2,
model.prev_action: [[0], [0]],
model.memory_in: np.zeros((1, memory_size)),
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.action_masks: np.ones([1, 2]),
model.action_masks: np.ones([1, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
model.memory_in: np.zeros((1, memory_size)),
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
}
sess.run(run_list, feed_dict=feed_dict)

trainer_params["model_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0)
trainer.training_buffer = mb.simulate_rollout(
trainer.update_buffer = mb.simulate_rollout(
buffer_len = len(trainer.training_buffer.update_buffer["actions"])
buffer_len = trainer.update_buffer.num_experiences
assert len(trainer2.training_buffer.update_buffer["actions"]) == buffer_len
assert trainer2.update_buffer.num_experiences == buffer_len
if __name__ == "__main__":

4  ml-agents/mlagents/trainers/tests/test_simple_rl.py


)
from mlagents.envs.simple_env_manager import SimpleEnvManager
from mlagents.envs.sampler_class import SamplerManager
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel
BRAIN_NAME = __name__

seed = 1337
trainer_config = yaml.safe_load(config)
env_manager = SimpleEnvManager(env)
env_manager = SimpleEnvManager(env, FloatPropertiesChannel())
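SimpleEnvManager now also takes the FloatPropertiesChannel it shares with the environment for reset parameters, in addition to the environment itself. A construction sketch with a mocked environment (assuming construction does not touch the env object):

    from unittest import mock
    from mlagents.envs.simple_env_manager import SimpleEnvManager
    from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel

    env = mock.Mock()                                   # stands in for a UnityEnvironment
    env_manager = SimpleEnvManager(env, FloatPropertiesChannel())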
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
summaries_dir=dir,

meta_curriculum=None,
train=True,
training_seed=seed,
fast_simulation=True,
sampler_manager=SamplerManager(None),
resampling_interval=None,
save_freq=save_freq,

2  ml-agents/mlagents/trainers/tests/test_trainer_controller.py


meta_curriculum=None,
train=True,
training_seed=99,
fast_simulation=True,
sampler_manager=SamplerManager({}),
resampling_interval=None,
)

meta_curriculum=None,
train=True,
training_seed=seed,
fast_simulation=True,
sampler_manager=SamplerManager({}),
resampling_interval=None,
)

16  ml-agents/mlagents/trainers/tf_policy.py


self.seed = seed
self.brain = brain
self.use_recurrent = trainer_parameters["use_recurrent"]
self.memory_dict: Dict[int, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.previous_action_dict: Dict[int, np.array] = {}
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = trainer_parameters.get("normalize", False)
self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:

:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float)
return np.zeros((num_agents, self.m_size), dtype=np.float32)
self, agent_ids: List[int], memory_matrix: Optional[np.ndarray]
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return

def retrieve_memories(self, agent_ids: List[int]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float)
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]

return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[int], action_matrix: Optional[np.ndarray]
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return

def retrieve_previous_action(self, agent_ids: List[int]) -> np.ndarray:
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
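The typing change here is that agent ids are now strings rather than ints, and cached memories are allocated as float32 instead of the float64 that dtype=np.float produced. The standalone class below sketches that keying pattern; it is not the real TFPolicy (whose save method name is not visible in this diff).

    from typing import Dict, List, Optional
    import numpy as np

    class MemoryStore:
        """Sketch of per-agent memory caching keyed by string agent ids."""

        def __init__(self, m_size: int):
            self.m_size = m_size
            self.memory_dict: Dict[str, np.ndarray] = {}

        def save_memories(self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]) -> None:
            if memory_matrix is None:
                return
            for index, agent_id in enumerate(agent_ids):
                self.memory_dict[agent_id] = memory_matrix[index, :]

        def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
            memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
            for index, agent_id in enumerate(agent_ids):
                if agent_id in self.memory_dict:
                    memory_matrix[index, :] = self.memory_dict[agent_id]
            return memory_matrix

    store = MemoryStore(m_size=4)
    store.save_memories(["agent-1"], np.ones((1, 4), dtype=np.float32))
    print(store.retrieve_memories(["agent-1", "agent-2"]).dtype)  # float32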

4  ml-agents/mlagents/trainers/trainer_controller.py


meta_curriculum: Optional[MetaCurriculum],
train: bool,
training_seed: int,
fast_simulation: bool,
sampler_manager: SamplerManager,
resampling_interval: Optional[int],
):

self.trainer_metrics: Dict[str, TrainerMetrics] = {}
self.meta_curriculum = meta_curriculum
self.training_start_time = time()
self.fast_simulation = fast_simulation
self.sampler_manager = sampler_manager
self.resampling_interval = resampling_interval
np.random.seed(training_seed)

self.meta_curriculum.get_config() if self.meta_curriculum else {}
)
sampled_reset_param.update(new_meta_curriculum_config)
return env.reset(train_mode=self.fast_simulation, config=sampled_reset_param)
return env.reset(config=sampled_reset_param)
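Resets no longer pass train_mode; the dict of reset parameters is the only argument. A tiny sketch of the new call shape against a mocked environment (the "gravity" key is just a placeholder reset parameter):

    from unittest import mock

    env = mock.Mock()
    sampled_reset_param = {"gravity": 9.81}
    env.reset(config=sampled_reset_param)            # train_mode is no longer passed
    env.reset.assert_called_once_with(config=sampled_reset_param)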
def _should_save_model(self, global_step: int) -> bool:
return (

3  protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_initialization_output.proto


syntax = "proto3";
import "mlagents/envs/communicator_objects/brain_parameters.proto";
import "mlagents/envs/communicator_objects/environment_parameters.proto";
option csharp_namespace = "MLAgents.CommunicatorObjects";
package communicator_objects;

string version = 2;
string log_path = 3;
repeated BrainParametersProto brain_parameters = 5;
EnvironmentParametersProto environment_parameters = 6;
reserved 6; //environment parameters
}

6  protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_input.proto


syntax = "proto3";
import "mlagents/envs/communicator_objects/agent_action.proto";
import "mlagents/envs/communicator_objects/environment_parameters.proto";
import "mlagents/envs/communicator_objects/command.proto";
option csharp_namespace = "MLAgents.CommunicatorObjects";

repeated AgentActionProto value = 1;
}
map<string, ListAgentActionProto> agent_actions = 1;
EnvironmentParametersProto environment_parameters = 2;
bool is_training = 3;
reserved 2; //deprecated environment proto
reserved 3; //deprecated is_training
bytes side_channel = 5;
}

1  protobuf-definitions/proto/mlagents/envs/communicator_objects/unity_rl_output.proto


}
reserved 1; // deprecated bool global_done field
map<string, ListAgentInfoProto> agentInfos = 2;
bytes side_channel = 3;
}

Some files were not shown because too many files changed in this diff.
