浏览代码

Merge branch 'master' into develop-add-inference-examples

/develop/add-fire
GitHub 4 年前
当前提交
aae58330
共有 105 个文件被更改,包括 4886 次插入3675 次删除
  1. 4
      .yamato/protobuf-generation-test.yml
  2. 6
      .yamato/standalone-build-test.yml
  3. 11
      .yamato/training-int-tests.yml
  4. 3
      Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/SensorBase.cs
  5. 7
      Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
  6. 2
      README.md
  7. 6
      com.unity.ml-agents/CHANGELOG.md
  8. 21
      com.unity.ml-agents/Runtime/Academy.cs
  9. 17
      com.unity.ml-agents/Runtime/Agent.cs
  10. 5
      com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs
  11. 34
      com.unity.ml-agents/Runtime/DecisionRequester.cs
  12. 3
      com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs
  13. 6
      com.unity.ml-agents/Runtime/Sensors/ISensor.cs
  14. 3
      com.unity.ml-agents/Runtime/Sensors/RayPerceptionSensor.cs
  15. 3
      com.unity.ml-agents/Runtime/Sensors/RenderTextureSensor.cs
  16. 13
      com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs
  17. 6
      com.unity.ml-agents/Runtime/Sensors/VectorSensor.cs
  18. 44
      com.unity.ml-agents/Runtime/SideChannels/IncomingMessage.cs
  19. 56
      com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs
  20. 1
      com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs
  21. 102
      com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs
  22. 1
      com.unity.ml-agents/Tests/Editor/Sensor/FloatVisualSensorTests.cs
  23. 1
      com.unity.ml-agents/Tests/Editor/Sensor/SensorShapeValidatorTests.cs
  24. 18
      com.unity.ml-agents/Tests/Editor/Sensor/StackingSensorTests.cs
  25. 26
      com.unity.ml-agents/Tests/Editor/SideChannelTests.cs
  26. 16
      com.unity.ml-agents/package.json
  27. 6
      config/trainer_config.yaml
  28. 3
      docs/Getting-Started.md
  29. 1
      docs/Learning-Environment-Examples.md
  30. 10
      docs/ML-Agents-Overview.md
  31. 4
      docs/Migrating.md
  32. 141
      docs/Python-API.md
  33. 10
      docs/Training-ML-Agents.md
  34. 11
      docs/Training-PPO.md
  35. 11
      docs/Training-SAC.md
  36. 98
      docs/Training-Self-Play.md
  37. 999
      docs/images/3dball_big.png
  38. 852
      docs/images/3dball_small.png
  39. 974
      docs/images/curriculum.png
  40. 999
      docs/images/ml-agents-LSTM.png
  41. 181
      docs/images/monitor.png
  42. 15
      gym-unity/README.md
  43. 342
      gym-unity/gym_unity/envs/__init__.py
  44. 198
      gym-unity/gym_unity/tests/test_gym.py
  45. 249
      ml-agents-envs/mlagents_envs/base_env.py
  46. 104
      ml-agents-envs/mlagents_envs/environment.py
  47. 99
      ml-agents-envs/mlagents_envs/rpc_utils.py
  48. 38
      ml-agents-envs/mlagents_envs/side_channel/incoming_message.py
  49. 43
      ml-agents-envs/mlagents_envs/tests/test_envs.py
  50. 170
      ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py
  51. 20
      ml-agents-envs/mlagents_envs/tests/test_side_channel.py
  52. 180
      ml-agents/mlagents/trainers/agent_processor.py
  53. 52
      ml-agents/mlagents/trainers/behavior_id_utils.py
  54. 16
      ml-agents/mlagents/trainers/brain_conversion_utils.py
  55. 73
      ml-agents/mlagents/trainers/demo_loader.py
  56. 37
      ml-agents/mlagents/trainers/env_manager.py
  57. 453
      ml-agents/mlagents/trainers/ghost/trainer.py
  58. 20
      ml-agents/mlagents/trainers/learn.py
  59. 10
      ml-agents/mlagents/trainers/policy/nn_policy.py
  60. 4
      ml-agents/mlagents/trainers/policy/policy.py
  61. 85
      ml-agents/mlagents/trainers/policy/tf_policy.py
  62. 11
      ml-agents/mlagents/trainers/ppo/trainer.py
  63. 9
      ml-agents/mlagents/trainers/sac/trainer.py
  64. 26
      ml-agents/mlagents/trainers/simple_env_manager.py
  65. 18
      ml-agents/mlagents/trainers/stats.py
  66. 24
      ml-agents/mlagents/trainers/subprocess_env_manager.py
  67. 42
      ml-agents/mlagents/trainers/tests/mock_brain.py
  68. 137
      ml-agents/mlagents/trainers/tests/simple_test_envs.py
  69. 54
      ml-agents/mlagents/trainers/tests/test_agent_processor.py
  70. 32
      ml-agents/mlagents/trainers/tests/test_demo_loader.py
  71. 48
      ml-agents/mlagents/trainers/tests/test_ghost.py
  72. 2
      ml-agents/mlagents/trainers/tests/test_learn.py
  73. 55
      ml-agents/mlagents/trainers/tests/test_nn_policy.py
  74. 24
      ml-agents/mlagents/trainers/tests/test_policy.py
  75. 4
      ml-agents/mlagents/trainers/tests/test_ppo.py
  76. 6
      ml-agents/mlagents/trainers/tests/test_sac.py
  77. 59
      ml-agents/mlagents/trainers/tests/test_simple_rl.py
  78. 7
      ml-agents/mlagents/trainers/tests/test_stats.py
  79. 5
      ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py
  80. 12
      ml-agents/mlagents/trainers/tests/test_trainer_util.py
  81. 9
      ml-agents/mlagents/trainers/trainer/trainer.py
  82. 11
      ml-agents/mlagents/trainers/trainer_controller.py
  83. 33
      ml-agents/mlagents/trainers/trainer_util.py
  84. 18
      ml-agents/tests/yamato/standalone_build_tests.py
  85. 13
      ml-agents/tests/yamato/training_int_tests.py
  86. 58
      ml-agents/tests/yamato/yamato_utils.py
  87. 1
      utils/make_readme_table.py
  88. 32
      .yamato/gym-interface-test.yml
  89. 32
      .yamato/python-ll-api-test.yml
  90. 129
      com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs
  91. 11
      com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs.meta
  92. 25
      com.unity.ml-agents/Tests/Runtime/Unity.ML-Agents.Runtime.Tests.asmdef
  93. 7
      com.unity.ml-agents/Tests/Runtime/Unity.ML-Agents.Runtime.Tests.asmdef.meta
  94. 92
      ml-agents/mlagents/trainers/ghost/controller.py
  95. 21
      ml-agents/tests/yamato/setup_venv.py
  96. 41
      ml-agents/tests/yamato/scripts/run_gym.py
  97. 94
      ml-agents/tests/yamato/scripts/run_llapi.py
  98. 429
      com.unity.ml-agents/Tests/Runtime/SerializeTestScene.unity
  99. 7
      com.unity.ml-agents/Tests/Runtime/SerializeTestScene.unity.meta

4
.yamato/protobuf-generation-test.yml


- "protobuf-definitions/*.md"
- "protobuf-definitions/**/*.md"
artifacts:
dist:
patch:
- "artifacts/*"
- "artifacts/*.*"

6
.yamato/standalone-build-test.yml


commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.standalone_build_tests
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Basic/Scenes/Basic.unity
triggers:
cancel_old_ci: true
changes:

- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
artifacts:
logs:
paths:
- "artifacts/standalone_build.txt"
- "Project/testPlayer*/**"
- "artifacts/testPlayer*/**"
{% endfor %}

11
.yamato/training-int-tests.yml


# Backwards-compatibility tests.
# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
# - python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
# - python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:

- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
artifacts:
unit:
logs:
paths:
- "artifacts/standalone_build.txt"
standalonebuild:
- "artifacts/**"
- "artifacts/testplayer*/**"
{% endfor %}

3
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/SensorBase.cs


public void Update() {}
/// <inheritdoc/>
public void Reset() { }
/// <inheritdoc/>
public virtual byte[] GetCompressedObservation()
{
return null;

7
Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


public override float[] Heuristic()
{
var action = new float[2];
var action = new float[3];
action[0] = Input.GetAxis("Horizontal");
action[1] = Input.GetKey(KeyCode.Space) ? 1f : 0f;
action[0] = Input.GetAxis("Horizontal"); // Racket Movement
action[1] = Input.GetKey(KeyCode.Space) ? 1f : 0f; // Racket Jumping
action[2] = Input.GetAxis("Vertical"); // Racket Rotation
return action;
}

2
README.md


|:-------:|:------:|:-------------:|:-------:|:------------:|
| **master (unstable)** | -- | [source](https://github.com/Unity-Technologies/ml-agents/tree/master) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/master/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/master.zip) |
| **0.15.1** | **March 30, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.1/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.1.zip)** |
| **0.15.0** | **March 18, 2020** | **[source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0)** | **[docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0/docs/Readme.md)** | **[download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.0.zip)** |
| **0.15.0** | March 18, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.15.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.15.0.zip) |
| **0.14.1** | February 26, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.1.zip) |
| **0.14.0** | February 13, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.14.0/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.14.0.zip) |
| **0.13.1** | January 21, 2020 | [source](https://github.com/Unity-Technologies/ml-agents/tree/0.13.1) | [docs](https://github.com/Unity-Technologies/ml-agents/tree/0.13.1/docs/Readme.md) | [download](https://github.com/Unity-Technologies/ml-agents/archive/0.13.1.zip) |

6
com.unity.ml-agents/CHANGELOG.md


- The Jupyter notebooks have been removed from the repository.
- Introduced the `SideChannelUtils` to register, unregister and access side channels.
- `Academy.FloatProperties` was removed, please use `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- Removed the multi-agent gym option from the gym wrapper. For multi-agent scenarios, use the [Low Level Python API](Python-API.md).
- The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. If you use `mlagents-learn` for training, this should be a transparent change.
- Added ability to start training (initialize model weights) from a previous run ID. (#3710)
- The internal event `Academy.AgentSetStatus` was renamed to `Academy.AgentPreStep` and made public.
- The offset logic was removed from DecisionRequester.
- SideChannel IncomingMessages methods now take an optional default argument, which is used when trying to read more data than the message contains.
- The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
- Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
- Running `mlagents-learn` with the same `--run-id` twice will no longer overwrite the existing files. (#3705)

21
com.unity.ml-agents/Runtime/Academy.cs


/// on each side, although we may allow some flexibility in the future.
/// This should be incremented whenever a change is made to the communication protocol.
/// </summary>
const string k_ApiVersion = "0.15.0";
const string k_ApiVersion = "0.16.0";
internal const string k_PackageVersion = "0.15.0-preview";
internal const string k_PackageVersion = "0.15.1-preview";
const int k_EditorTrainingPort = 5004;

// This will mark the Agent as Done if it has reached its maxSteps.
internal event Action AgentIncrementStep;
// Signals to all the agents at each environment step along with the
// Academy's maxStepReached, done and stepCount values. The agents rely
// on this event to update their own values of max step reached and done
// in addition to aligning on the step count of the global episode.
internal event Action<int> AgentSetStatus;
/// <summary>
/// Signals to all of the <see cref="Agent"/>s that their step is about to begin.
/// This is a good time for an <see cref="Agent"/> to decide if it would like to
/// call <see cref="Agent.RequestDecision"/> or <see cref="Agent.RequestAction"/>
/// for this step. Any other pre-step setup could be done during this event as well.
/// </summary>
public event Action<int> AgentPreStep;
// Signals to all the agents at each environment step so they can send
// their state to their Policy if they have requested a decision.

{
DecideAction = () => {};
DestroyAction = () => {};
AgentSetStatus = i => {};
AgentPreStep = i => {};
AgentSendState = () => {};
AgentAct = () => {};
AgentForceReset = () => {};

ForcedFullReset();
}
AgentSetStatus?.Invoke(m_StepCount);
AgentPreStep?.Invoke(m_StepCount);
m_StepCount += 1;
m_TotalStepCount += 1;

17
com.unity.ml-agents/Runtime/Agent.cs


m_Info.reward = m_Reward;
m_Info.done = true;
m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;
if (collectObservationsSensor != null)
{
// Make sure the latest observations are being passed to training.
collectObservationsSensor.Reset();
CollectObservations(collectObservationsSensor);
}
ResetSensors();
// We also have to write to any DemonstrationWriters so that they get the "done" flag.
foreach (var demoWriter in DemonstrationWriters)

UpdateRewardStats();
}
// The Agent is done, so we give it a new episode Id
m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
m_Reward = 0f;
m_CumulativeReward = 0f;
m_RequestAction = false;

foreach (var sensor in sensors)
{
sensor.Update();
}
}
/// <summary>
/// Calls Reset() on every sensor in the <c>sensors</c> collection.
/// </summary>
void ResetSensors()
{
foreach (var sensor in sensors)
{
sensor.Reset();
}
}

5
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


{
m_OrderedAgentsRequestingDecisions[behaviorName] = new List<int>();
}
m_OrderedAgentsRequestingDecisions[behaviorName].Add(info.episodeId);
if (!info.done)
{
m_OrderedAgentsRequestingDecisions[behaviorName].Add(info.episodeId);
}
if (!m_LastActionsReceived.ContainsKey(behaviorName))
{
m_LastActionsReceived[behaviorName] = new Dictionary<int, float[]>();

34
com.unity.ml-agents/Runtime/DecisionRequester.cs


using System;
using UnityEngine;
using UnityEngine.Serialization;

/// at regular intervals.
/// </summary>
[AddComponentMenu("ML Agents/Decision Requester", (int)MenuGroup.Default)]
internal class DecisionRequester : MonoBehaviour
[RequireComponent(typeof(Agent))]
public class DecisionRequester : MonoBehaviour
/// that the Agent will request a decision every 5 Academy steps.
/// </summary>
/// that the Agent will request a decision every 5 Academy steps. /// </summary>
[Range(1, 20)]
[Tooltip("The frequency with which the agent requests a decision. A DecisionPeriod " +
"of 5 means that the Agent will request a decision every 5 Academy steps.")]

[FormerlySerializedAs("RepeatAction")]
public bool TakeActionsBetweenDecisions = true;
/// <summary>
/// Whether or not the Agent decisions should start at an offset (different for each agent).
/// This does not affect <see cref="DecisionPeriod"/>. Turning this on will distribute
/// the decision-making computations for all the agents across multiple Academy steps.
/// This can be valuable in scenarios where you have many agents in the scene, particularly
/// during the inference phase.
/// </summary>
[Tooltip("Whether or not Agent decisions should start at an offset.")]
public bool offsetStep;
[NonSerialized]
int m_Offset;
m_Offset = offsetStep ? gameObject.GetInstanceID() : 0;
Academy.Instance.AgentSetStatus += MakeRequests;
Debug.Assert(m_Agent != null, "Agent component was not found on this gameObject and is required.");
Academy.Instance.AgentPreStep += MakeRequests;
}
void OnDestroy()

Academy.Instance.AgentSetStatus -= MakeRequests;
Academy.Instance.AgentPreStep -= MakeRequests;
void MakeRequests(int count)
/// <summary>
/// Method that hooks into the Academy in order to inform the Agent whether or not it should request a
/// decision, and whether or not it should take actions between decisions.
/// </summary>
/// <param name="academyStepCount">The current step count of the academy.</param>
void MakeRequests(int academyStepCount)
if ((count + m_Offset) % DecisionPeriod == 0)
if (academyStepCount % DecisionPeriod == 0)
{
m_Agent?.RequestDecision();
}

3
com.unity.ml-agents/Runtime/Sensors/CameraSensor.cs


public void Update() {}
/// <inheritdoc/>
public void Reset() { }
/// <inheritdoc/>
public SensorCompressionType GetCompressionType()
{
return m_CompressionType;

6
com.unity.ml-agents/Runtime/Sensors/ISensor.cs


void Update();
/// <summary>
/// Resets the internal states of the sensor. This is called at the end of an Agent's episode.
/// Most implementations can leave this empty.
/// </summary>
void Reset();
/// <summary>
/// Return the compression type being used. If no compression is used, return
/// <see cref="SensorCompressionType.None"/>.
/// </summary>

3
com.unity.ml-agents/Runtime/Sensors/RayPerceptionSensor.cs


}
/// <inheritdoc/>
public void Reset() { }
/// <inheritdoc/>
public int[] GetObservationShape()
{
return m_Shape;

3
com.unity.ml-agents/Runtime/Sensors/RenderTextureSensor.cs


public void Update() {}
/// <inheritdoc/>
public void Reset() { }
/// <inheritdoc/>
public SensorCompressionType GetCompressionType()
{
return m_CompressionType;

13
com.unity.ml-agents/Runtime/Sensors/StackingSensor.cs


using System;
namespace MLAgents.Sensors
{
/// <summary>

{
m_WrappedSensor.Update();
m_CurrentIndex = (m_CurrentIndex + 1) % m_NumStackedObservations;
}
/// <inheritdoc/>
public void Reset()
{
    // Reset the wrapped sensor first, then wipe each stacked buffer in turn.
    m_WrappedSensor.Reset();

    var stackIndex = 0;
    while (stackIndex < m_NumStackedObservations)
    {
        var buffer = m_StackedObservations[stackIndex];
        Array.Clear(buffer, 0, buffer.Length);
        stackIndex++;
    }
}
/// <inheritdoc/>

6
com.unity.ml-agents/Runtime/Sensors/VectorSensor.cs


}
/// <inheritdoc/>
// Resetting this sensor simply delegates to Clear().
public void Reset() => Clear();
/// <inheritdoc/>
public int[] GetObservationShape()
{
return m_Shape;

44
com.unity.ml-agents/Runtime/SideChannels/IncomingMessage.cs


using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System;
using System.IO;
using System.Text;

}
/// <summary>
/// Read a boolan value from the message.
/// Read a boolean value from the message.
/// <param name="defaultValue">Default value to use if the end of the message is reached.</param>
public bool ReadBoolean()
public bool ReadBoolean(bool defaultValue = false)
return m_Reader.ReadBoolean();
return CanReadMore() ? m_Reader.ReadBoolean() : defaultValue;
/// <param name="defaultValue">Default value to use if the end of the message is reached.</param>
public int ReadInt32()
public int ReadInt32(int defaultValue = 0)
return m_Reader.ReadInt32();
return CanReadMore() ? m_Reader.ReadInt32() : defaultValue;
/// <param name="defaultValue">Default value to use if the end of the message is reached.</param>
public float ReadFloat32()
public float ReadFloat32(float defaultValue = 0.0f)
return m_Reader.ReadSingle();
return CanReadMore() ? m_Reader.ReadSingle() : defaultValue;
/// <param name="defaultValue">Default value to use if the end of the message is reached.</param>
public string ReadString()
public string ReadString(string defaultValue = default)
if (!CanReadMore())
{
return defaultValue;
}
var strLength = ReadInt32();
var str = Encoding.ASCII.GetString(m_Reader.ReadBytes(strLength));
return str;

/// Reads a list of floats from the message. The length of the list is stored in the message.
/// </summary>
/// <param name="defaultValue">Default value to use if the end of the message is reached.</param>
public IList<float> ReadFloatList()
public IList<float> ReadFloatList(IList<float> defaultValue = default)
if (!CanReadMore())
{
return defaultValue;
}
var len = ReadInt32();
var output = new float[len];
for (var i = 0; i < len; i++)

{
m_Reader?.Dispose();
m_Stream?.Dispose();
}
/// <summary>
/// Reports whether the underlying stream still holds unread data.
/// </summary>
/// <returns>
/// True while the stream position is before the stream length; false once the end is reached.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
bool CanReadMore() => m_Stream.Position < m_Stream.Length;
}
}

56
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


using System.CodeDom;
using System;
using UnityEngine;
using NUnit.Framework;
using System.Reflection;

{
internal class TestPolicy : IPolicy
{
public void RequestDecision(AgentInfo info, List<ISensor> sensors) {}
public Action OnRequestDecision;
private WriteAdapter m_Adapter = new WriteAdapter();
public void RequestDecision(AgentInfo info, List<ISensor> sensors) {
foreach(var sensor in sensors){
sensor.GetObservationProto(m_Adapter);
}
OnRequestDecision?.Invoke();
}
public float[] DecideAction() { return new float[0]; }

{
collectObservationsCalls += 1;
collectObservationsCallsForEpisode += 1;
sensor.AddObservation(0f);
sensor.AddObservation(collectObservationsCallsForEpisode);
}
public override void OnActionReceived(float[] vectorAction)

public string sensorName;
public int numWriteCalls;
public int numCompressedCalls;
public int numResetCalls;
public SensorCompressionType compressionType = SensorCompressionType.None;
public TestSensor(string n)

}
public void Update() {}
// Tracks how many times Reset() was invoked so tests can assert on the count.
public void Reset()
{
    numResetCalls += 1;
}
}
[TestFixture]

aca.EnvironmentStep();
}
}
// Verifies that the agent's StackingSensor is zeroed once the episode ends, and checks the
// stacked observation seen by the policy callback that is installed just before EndEpisode().
[Test]
public void AssertStackingReset()
{
var agentGo1 = new GameObject("TestAgent");
agentGo1.AddComponent<TestAgent>();
var behaviorParameters = agentGo1.GetComponent<BehaviorParameters>();
// Request 3 stacked vector observations; presumably this is what makes the agent
// create a StackingSensor (asserted non-null below).
behaviorParameters.brainParameters.numStackedVectorObservations = 3;
var agent1 = agentGo1.GetComponent<TestAgent>();
var aca = Academy.Instance;
agent1.LazyInitialize();
var policy = new TestPolicy();
agent1.SetPolicy(policy);
// Locate the StackingSensor among the agent's sensors (the last match wins).
StackingSensor sensor = null;
foreach(ISensor s in agent1.sensors){
if (s is StackingSensor){
sensor = s as StackingSensor;
}
}
Assert.NotNull(sensor);
// Drive 20 decision requests / environment steps to fill the stack.
for (int i = 0; i < 20; i++)
{
agent1.RequestDecision();
aca.EnvironmentStep();
}
// Install the callback only now, so it fires for decision requests made from here on.
// NOTE(review): the expected {18, 19, 21} presumably reflects an extra observation
// collection during EndEpisode() - confirm against Agent's done handling.
policy.OnRequestDecision = () => SensorTestHelper.CompareObservation(sensor, new[] {18f, 19f, 21f});
agent1.EndEpisode();
// After the episode ends, the stacked observation must be all zeros.
SensorTestHelper.CompareObservation(sensor, new[] {0f, 0f, 0f});
}
}
[TestFixture]

var expectedCollectObsCalls = 0;
var expectedCollectObsCallsForEpisode = 0;
var expectedCompletedEpisodes = 0;
var expectedSensorResetCalls = 0;
for (var i = 0; i < 15; i++)
{

expectedCollectObsCallsForEpisode = 0;
expectedAgentStepCount = 0;
expectedCompletedEpisodes++;
expectedSensorResetCalls++;
expectedCollectObsCalls += 1;
}
aca.EnvironmentStep();

Assert.AreEqual(expectedCollectObsCalls, agent1.collectObservationsCalls);
Assert.AreEqual(expectedCollectObsCallsForEpisode, agent1.collectObservationsCallsForEpisode);
Assert.AreEqual(expectedCompletedEpisodes, agent1.CompletedEpisodes);
Assert.AreEqual(expectedSensorResetCalls, agent1.sensor1.numResetCalls);
}
}

1
com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs


}
public void Update() {}
public void Reset() { }
public SensorCompressionType GetCompressionType()
{

102
com.unity.ml-agents/Tests/Editor/PublicAPI/PublicApiValidation.cs


using MLAgents.Sensors;
using NUnit.Framework;
using UnityEngine;
using UnityEngine.TestTools;
namespace MLAgentsExamples
{

sensorComponent.observationStacks = 2;
sensorComponent.CreateSensor();
}
// Agent subclass that counts Heuristic() invocations and otherwise defers to the base class.
class PublicApiAgent : Agent
{
    // Number of times Heuristic() has been called on this agent.
    public int numHeuristicCalls;

    // Bumps the call counter, then returns whatever the base implementation produces.
    public override float[] Heuristic()
    {
        numHeuristicCalls += 1;
        var baseActions = base.Heuristic();
        return baseActions;
    }
}
// Minimal SensorComponent that wraps another component's sensor in a StackingSensor.
class StackingComponent : SensorComponent
{
    // Component whose sensor gets wrapped.
    public SensorComponent wrappedComponent;
    // Stack count handed to the StackingSensor and used to scale the reported shape.
    public int numStacks;

    // Builds the wrapped component's sensor and wraps it in a StackingSensor.
    public override ISensor CreateSensor()
    {
        return new StackingSensor(wrappedComponent.CreateSensor(), numStacks);
    }

    // Returns a new array holding the wrapped shape with every dimension multiplied by numStacks.
    public override int[] GetObservationShape()
    {
        var wrappedShape = wrappedComponent.GetObservationShape();
        var stackedShape = new int[wrappedShape.Length];
        for (var dim = 0; dim < wrappedShape.Length; dim++)
        {
            stackedShape[dim] = wrappedShape[dim] * numStacks;
        }
        return stackedShape;
    }
}
// Walks through agent setup using the public API: configures BehaviorParameters, adds sensor
// components, initializes the agent, switches behavior type and inference device, then steps
// the Academy once and checks the resulting actions and the Heuristic() call count.
[Test]
public void CheckSetupAgent()
{
var gameObject = new GameObject();
var behaviorParams = gameObject.AddComponent<BehaviorParameters>();
behaviorParams.brainParameters.vectorObservationSize = 3;
behaviorParams.brainParameters.numStackedVectorObservations = 2;
behaviorParams.brainParameters.vectorActionDescriptions = new[] { "TestActionA", "TestActionB" };
behaviorParams.brainParameters.vectorActionSize = new[] { 2, 2 };
behaviorParams.brainParameters.vectorActionSpaceType = SpaceType.Discrete;
behaviorParams.behaviorName = "TestBehavior";
behaviorParams.TeamId = 42;
behaviorParams.useChildSensors = true;
var agent = gameObject.AddComponent<PublicApiAgent>();
// Make sure we can set the behavior type correctly after the agent is added
behaviorParams.behaviorType = BehaviorType.InferenceOnly;
// Can't actually create an Agent with InferenceOnly and no model, so change back
behaviorParams.behaviorType = BehaviorType.Default;
// TODO - not internal yet
// var decisionRequester = gameObject.AddComponent<DecisionRequester>();
// decisionRequester.DecisionPeriod = 2;
var sensorComponent = gameObject.AddComponent<RayPerceptionSensorComponent3D>();
sensorComponent.sensorName = "ray3d";
sensorComponent.detectableTags = new List<string> { "Player", "Respawn" };
sensorComponent.raysPerDirection = 3;
// Make a StackingSensor that wraps the RayPerceptionSensorComponent3D
// This isn't necessarily practical, just to ensure that it can be done
var wrappingSensorComponent = gameObject.AddComponent<StackingComponent>();
wrappingSensorComponent.wrappedComponent = sensorComponent;
wrappingSensorComponent.numStacks = 3;
// ISensor isn't set up yet.
Assert.IsNull(sensorComponent.raySensor);
agent.LazyInitialize();
// Make sure we can set the behavior type correctly after the agent is initialized
// (this creates a new policy).
behaviorParams.behaviorType = BehaviorType.HeuristicOnly;
// Initialization should set up the sensors
Assert.IsNotNull(sensorComponent.raySensor);
// Let's change the inference device
var otherDevice = behaviorParams.inferenceDevice == InferenceDevice.CPU ? InferenceDevice.GPU : InferenceDevice.CPU;
agent.SetModel(behaviorParams.behaviorName, behaviorParams.model, otherDevice);
// Exercise the reward and request calls before stepping.
agent.AddReward(1.0f);
agent.RequestAction();
agent.RequestDecision();
// Turn off automatic stepping so the test drives the environment step explicitly.
Academy.Instance.AutomaticSteppingEnabled = false;
Academy.Instance.EnvironmentStep();
var actions = agent.GetAction();
// default Heuristic implementation should return zero actions.
Assert.AreEqual(new[] {0.0f, 0.0f}, actions);
// PublicApiAgent counts Heuristic() calls; exactly one is expected here.
Assert.AreEqual(1, agent.numHeuristicCalls);
}
}
}

1
com.unity.ml-agents/Tests/Editor/Sensor/FloatVisualSensorTests.cs


}
public void Update() {}
public void Reset() { }
public SensorCompressionType GetCompressionType()
{

1
com.unity.ml-agents/Tests/Editor/Sensor/SensorShapeValidatorTests.cs


}
public void Update() { }
public void Reset() { }
public SensorCompressionType GetCompressionType()
{

18
com.unity.ml-agents/Tests/Editor/Sensor/StackingSensorTests.cs


// Check that if we don't call Update(), the same observations are produced
SensorTestHelper.CompareObservation(sensor, new[] {5f, 6f, 7f, 8f, 9f, 10f});
}
// Verifies that StackingSensor.Reset() discards previously stacked observations, so only
// the observation added after the reset shows up in the output.
[Test]
public void TestStackingReset()
{
// Wrap a 2-element VectorSensor in a StackingSensor with 3 stacks (6 output values).
VectorSensor wrapped = new VectorSensor(2);
ISensor sensor = new StackingSensor(wrapped, 3);
wrapped.AddObservation(new[] {1f, 2f});
SensorTestHelper.CompareObservation(sensor, new[] {0f, 0f, 0f, 0f, 1f, 2f});
// Advance the stack, then add a second observation; both should now be present.
sensor.Update();
wrapped.AddObservation(new[] {3f, 4f});
SensorTestHelper.CompareObservation(sensor, new[] {0f, 0f, 1f, 2f, 3f, 4f});
// After Reset() the earlier observations must be gone; only the new one remains.
sensor.Reset();
wrapped.AddObservation(new[] {5f, 6f});
SensorTestHelper.CompareObservation(sensor, new[] {0f, 0f, 0f, 0f, 5f, 6f});
}
}
}

26
com.unity.ml-agents/Tests/Editor/SideChannelTests.cs


Assert.AreEqual(stringVal, incomingMsg.ReadString());
Assert.AreEqual(floatListVal, incomingMsg.ReadFloatList());
}
// Verifies that each IncomingMessage.Read* method returns its default value (either the
// built-in one or the caller-supplied one) when asked to read from an exhausted message.
[Test]
public void TestMessageReadDefaults()
{
// Make sure reading past the end of a message will apply defaults.
IncomingMessage incomingMsg;
// Nothing is written to the OutgoingMessage, so the IncomingMessage is built from its
// unwritten byte array.
using (var outgoingMsg = new OutgoingMessage())
{
incomingMsg = new IncomingMessage(outgoingMsg.ToByteArray());
}
// For each type: first the implicit default, then an explicitly supplied default.
Assert.AreEqual(false, incomingMsg.ReadBoolean());
Assert.AreEqual(true, incomingMsg.ReadBoolean(defaultValue: true));
Assert.AreEqual(0, incomingMsg.ReadInt32());
Assert.AreEqual(42, incomingMsg.ReadInt32(defaultValue: 42));
Assert.AreEqual(0.0f, incomingMsg.ReadFloat32());
Assert.AreEqual(1337.0f, incomingMsg.ReadFloat32(defaultValue: 1337.0f));
Assert.AreEqual(default(string), incomingMsg.ReadString());
Assert.AreEqual("foo", incomingMsg.ReadString(defaultValue: "foo"));
Assert.AreEqual(default(float[]), incomingMsg.ReadFloatList());
Assert.AreEqual(new float[] { 1001, 1002 }, incomingMsg.ReadFloatList(new float[] { 1001, 1002 }));
}
}
}

16
com.unity.ml-agents/package.json


{
"name": "com.unity.ml-agents",
"displayName":"ML Agents",
"version": "0.15.0-preview",
"unity": "2018.4",
"description": "Add interactivity to your game with Machine Learning Agents trained using Deep Reinforcement Learning.",
"dependencies": {
"com.unity.barracuda": "0.6.1-preview"
}
"name": "com.unity.ml-agents",
"displayName": "ML Agents",
"version": "0.15.1-preview",
"unity": "2018.4",
"description": "Add interactivity to your game with Machine Learning Agents trained using Deep Reinforcement Learning.",
"dependencies": {
"com.unity.barracuda": "0.6.1-preview"
}
}

6
config/trainer_config.yaml


time_horizon: 1000
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
Soccer:
normalize: false

num_layers: 2
self_play:
window: 10
play_against_current_self_ratio: 0.5
play_against_latest_model_ratio: 0.5
team_change: 100000
CrawlerStatic:
normalize: true

3
docs/Getting-Started.md


- For a "Hello World" introduction to creating your own Learning Environment,
check out the [Making a New Learning
Environment](Learning-Environment-Create-New.md) page.
- For a series of YouTube video tutorials, check out the
[Machine Learning Agents PlayList](https://www.youtube.com/playlist?list=PLX2vGYjWbI0R08eWQkO7nQkGiicHAX7IX)
page.

1
docs/Learning-Environment-Examples.md


* Goal:
* Get the ball into the opponent's goal while preventing
the ball from entering own goal.
* Goalie:
* Agents: The environment contains four agents, with the same
Behavior Parameters : Soccer.
* Agent Reward Function (dependent):

10
docs/ML-Agents-Overview.md


[Training With Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
to learn more about this feature.
- **Cloud Training on AWS** - To facilitate using the ML-Agents toolkit on
Amazon Web Services (AWS) machines, we provide a
[guide](Training-on-Amazon-Web-Service.md) on how to set-up EC2 instances in
addition to a public pre-configured Amazon Machine Image (AMI).
- **Cloud Training on Microsoft Azure** - To facilitate using the ML-Agents
toolkit on Azure machines, we provide a
[guide](Training-on-Microsoft-Azure.md) on how to set-up virtual machine
instances in addition to a pre-configured data science image.
## Summary and Next Steps
To briefly summarize: The ML-Agents toolkit enables games and simulations built

4
docs/Migrating.md


### Important changes
* The `--load` and `--train` command-line flags have been deprecated and replaced with `--resume` and `--inference`.
* Running with the same `--run-id` twice will now throw an error.
* The `play_against_current_self_ratio` self-play trainer hyperparameter has been renamed to `play_against_latest_model_ratio`
* Removed the multi-agent gym option from the gym wrapper. For multi-agent scenarios, use the [Low Level Python API](Python-API.md).
* The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. If you use `mlagents-learn` for training, this should be a transparent change.
### Steps to Migrate
* Replace the `--load` flag with `--resume` when calling `mlagents-learn`, and don't use the `--train` flag as training

* Replace `Academy.FloatProperties` with `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()`.
* Replace `Academy.RegisterSideChannel` with `SideChannelUtils.RegisterSideChannel()`.
* Replace `Academy.UnregisterSideChannel` with `SideChannelUtils.UnregisterSideChannel`.
## Migrating from 0.14 to 0.15

141
docs/Python-API.md


- **UnityEnvironment** — the main interface between the Unity application and
your code. Use UnityEnvironment to start and control a simulation or training
session.
- **BatchedStepResult** — contains the data from Agents belonging to the same
"AgentGroup" in the simulation, such as observations and rewards.
- **AgentGroupSpec** — describes the shape of the data inside a BatchedStepResult.
For example, provides the dimensions of the observations of a group.
- **BehaviorName** - is a string that identifies a behavior in the simulation.
- **AgentId** - is an `int` that serves as unique identifier for Agents in the
simulation.
- **DecisionSteps** — contains the data from Agents belonging to the same
"Behavior" in the simulation, such as observations and rewards. Only Agents
that requested a decision since the last call to `env.step()` are in the
DecisionSteps object.
- **TerminalSteps** — contains the data from Agents belonging to the same
"Behavior" in the simulation, such as observations and rewards. Only Agents
whose episode ended since the last call to `env.step()` are in the
TerminalSteps object.
- **BehaviorSpec** — describes the shape of the observation data inside
DecisionSteps and TerminalSteps as well as the expected action shapes.
An Agent Group is a group of Agents identified by a string name that share the same
observations and action types. You can think about Agent Group as a group of agents
that will share the same policy or behavior. All Agents in a group have the same goal
and reward signals.
An Agent "Behavior" is a group of Agents identified by a `BehaviorName` that share the same
observations and action types (described in their `BehaviorSpec`). You can think about Agent
Behavior as a group of agents that will share the same policy. All Agents with the same
behavior have the same goal and reward signals.
__Note__: The `Behavior Name` corresponds to the Agent Group name on Python.
_Notice: Currently communication between Unity and Python takes place over an
open socket without authentication. As such, please make sure that the network

move forward until an Agent in the simulation needs an input from Python to act.
- **Close : `env.close()`** Sends a shutdown signal to the environment and terminates
the communication.
- **Get Agent Group Names : `env.get_agent_groups()`** Returns a list of agent group ids.
- **Get Behavior Names : `env.get_behavior_names()`** Returns a list of `BehaviorName`.
agent groups are created in the simulation.
- **Get Agent Group Spec : `env.get_agent_group_spec(agent_group: str)`** Returns
the `AgentGroupSpec` corresponding to the agent_group given as input. An
`AgentGroupSpec` contains information such as the observation shapes, the action
type (multi-discrete or continuous) and the action shape. Note that the `AgentGroupSpec`
Agent behaviors are created in the simulation.
- **Get Behavior Spec : `env.get_behavior_spec(behavior_name: str)`** Returns
the `BehaviorSpec` corresponding to the behavior_name given as input. A
`BehaviorSpec` contains information such as the observation shapes, the action
type (multi-discrete or continuous) and the action shape. Note that the `BehaviorSpec`
- **Get Batched Step Result for Agent Group : `env.get_step_result(agent_group: str)`**
Returns a `BatchedStepResult` corresponding to the agent_group given as input.
A `BatchedStepResult` contains information about the state of the agents in a group
such as the observations, the rewards, the done flags and the agent identifiers. The
data is in `np.array` of which the first dimension is always the number of agents which
requested a decision in the simulation since the last call to `env.step()`. Note that the
number of agents is not guaranteed to remain constant during the simulation.
- **Set Actions for Agent Group :`env.set_actions(agent_group: str, action: np.array)`**
- **Get Steps : `env.get_steps(behavior_name: str)`**
Returns a tuple `DecisionSteps, TerminalSteps` corresponding to the behavior_name
given as input.
The `DecisionSteps` contains information about the state of the agents
**that need an action this step** and have the behavior behavior_name.
The `TerminalSteps` contains information about the state of the agents
**whose episode ended** and have the behavior behavior_name.
Both `DecisionSteps` and `TerminalSteps` contain information such as
the observations, the rewards and the agent identifiers.
`DecisionSteps` also contains action masks for the next action while `TerminalSteps`
contains the reason for termination (did the Agent reach its maximum step and was
interrupted). The data is in `np.array` of which the first dimension is always the
number of agents. Note that the number of agents is not guaranteed to remain constant
during the simulation and it is not unusual to have either `DecisionSteps` or `TerminalSteps`
contain no Agents at all.
- **Set Actions :`env.set_actions(behavior_name: str, action: np.array)`**
Sets the actions for a whole agent group. `action` is a 2D `np.array` of `dtype=np.int32`
in the discrete action case and `dtype=np.float32` in the continuous action case.
The first dimension of `action` is the number of agents that requested a decision

__Note:__ If no action is provided for an agent group between two calls to `env.step()` then
the default action will be all zeros (in either discrete or continuous action space)
#### BatchedStepResult and StepResult
#### DecisionSteps and DecisionStep
`DecisionSteps` (with `s`) contains information about a whole batch of Agents while
`DecisionStep` (no `s`) only contains information about a single Agent.
A `BatchedStepResult` has the following fields :
A `DecisionSteps` has the following fields :
- `obs` is a list of numpy array observations collected by the group of
agents. The first dimension of the array corresponds to the batch size of

rewards collected by each agent since the last simulation step.
- `done` is an array of booleans of length batch size. Is true if the
associated Agent was terminated during the last simulation step.
- `max_step` is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
- `agent_id` is an int vector of length batch size containing unique
identifier for the corresponding Agent. This is used to track Agents
across simulation steps.

It also has the two following methods:
- `n_agents()` Returns the number of agents requesting a decision since
the last call to `env.step()`
- `get_agent_step_result(agent_id: int)` Returns a `StepResult`
- `len(DecisionSteps)` Returns the number of agents requesting a decision since
the last call to `env.step()`.
- `DecisionSteps[agent_id]` Returns a `DecisionStep`
A `StepResult` has the following fields:
A `DecisionStep` has the following fields:
- `obs` is a list of numpy arrays observations collected by the group of
agent. (Each array has one less dimension than the arrays in `BatchedStepResult`)
- `obs` is a list of numpy arrays observations collected by the agent.
(Each array has one less dimension than the arrays in `DecisionSteps`)
- `max_step` is a bool. Is true if the Agent reached its maximum number of
steps during the last simulation step.
- `agent_id` is an int and a unique identifier for the corresponding Agent.
- `action_mask` is an optional list of one dimensional array of booleans.
Only available in multi-discrete action space type.

#### AgentGroupSpec
#### TerminalSteps and TerminalStep
Similarly to `DecisionSteps` and `DecisionStep`,
`TerminalSteps` (with `s`) contains information about a whole batch of Agents while
`TerminalStep` (no `s`) only contains information about a single Agent.
An Agent group can either have discrete or continuous actions. To check which type
A `TerminalSteps` has the following fields :
- `obs` is a list of numpy array observations collected by the group of
agents. The first dimension of the array corresponds to the batch size of
the group (number of agents whose episode ended since the last call to
`env.step()`).
- `reward` is a float vector of length batch size. Corresponds to the
rewards collected by each agent since the last simulation step.
- `done` is an array of booleans of length batch size. Is true if the
associated Agent was terminated during the last simulation step.
- `agent_id` is an int vector of length batch size containing unique
identifier for the corresponding Agent. This is used to track Agents
across simulation steps.
- `max_step` is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
It also has the two following methods:
- `len(TerminalSteps)` Returns the number of agents whose episode ended since
the last call to `env.step()`.
- `TerminalSteps[agent_id]` Returns a `TerminalStep`
for the Agent with the `agent_id` unique identifier.
A `TerminalStep` has the following fields:
- `obs` is a list of numpy arrays observations collected by the agent.
(Each array has one less dimension than the arrays in `TerminalSteps`)
- `reward` is a float. Corresponds to the rewards collected by the agent
since the last simulation step.
- `done` is a bool. Is true if the Agent was terminated during the last
simulation step.
- `agent_id` is an int and a unique identifier for the corresponding Agent.
- `max_step` is a bool. Is true if the Agent reached its maximum number of
steps during the last simulation step.
#### BehaviorSpec
An Agent behavior can either have discrete or continuous actions. To check which type
An `AgentGroupSpec` has the following fields :
A `BehaviorSpec` has the following fields :
BatchedStepResult and StepResult.
DecisionSteps, DecisionStep, TerminalSteps and TerminalStep.
- `action_type` is the type of data of the action. It can be discrete or
continuous. If discrete, the action tensors are expected to be `np.int32`. If
continuous, the actions are expected to be `np.float32`.

### Communicating additional information with the Environment
In addition to the means of communicating between Unity and python described above,
we also provide methods for sharing agent-agnostic information. These
additional methods are referred to as side channels. ML-Agents includes two ready-made

10
docs/Training-ML-Agents.md


specified, you will not be able to continue with training. Use `--force` to force ML-Agents to
overwrite the existing data.
Alternatively, you might want to start a new training run but _initialize_ it using an already-trained
model. You may want to do this, for instance, if your environment changed and you want
a new model, but the old behavior is still better than random. You can do this by specifying `--initialize-from=<run-identifier>`, where `<run-identifier>` is the old run ID.
### Command Line Training Options
In addition to passing the path of the Unity executable containing your training

as the current agents in your scene.
* `--force`: Attempting to train a model with a run-id that has been used before will
throw an error. Use `--force` to force-overwrite this run-id's summary and model data.
* `--initialize-from=<run-identifier>`: Specify an old run-id here to initialize your model from
a previously trained model. Note that the previously saved models _must_ have the same behavior
parameters as your current environment.
* `--no-graphics`: Specify this option to run the Unity executable in
`-batchmode` and doesn't initialize the graphics driver. Use this only if your
training doesn't involve visual observations (reading from Pixels). See

* `--cpu`: Forces training using CPU only.
* Engine Configuration :
* `--width' : The width of the executable window of the environment(s) in pixels
* `--width` : The width of the executable window of the environment(s) in pixels
(ignored for editor training) (Default 84)
* `--height` : The height of the executable window of the environment(s) in pixels
(ignored for editor training). (Default 84)

| train_interval | How often to update the agent. | SAC |
| num_update | Number of mini-batches to update the agent with during each update. | SAC |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| init_path | Initialize trainer from a previously saved model. | PPO, SAC |
\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral Cloning (Imitation), GAIL = Generative Adversarial Imitation Learning

11
docs/Training-PPO.md


Typical Range: Approximately equal to PPO's `buffer_size`
### (Optional) Advanced: Initialize Model Path
`init_path` can be specified to initialize your model from a previous run before starting.
Note that the prior run should have used the same trainer configurations as the current run,
and have been saved with the same version of ML-Agents. You should provide the full path
to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`.
This option is provided in case you want to initialize different behaviors from different runs;
in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize
all models from the same run.
## Training Statistics
To view training statistics, use TensorBoard. For information on launching and

11
docs/Training-SAC.md


Typical Range (Discrete): `32` - `512`
### (Optional) Advanced: Initialize Model Path
`init_path` can be specified to initialize your model from a previous run before starting.
Note that the prior run should have used the same trainer configurations as the current run,
and have been saved with the same version of ML-Agents. You should provide the full path
to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`.
This option is provided in case you want to initialize different behaviors from different runs;
in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize
all models from the same run.
## Training Statistics
To view training statistics, use TensorBoard. For information on launching and

98
docs/Training-Self-Play.md


# Training with Self-Play
ML-Agents provides the functionality to train symmetric, adversarial games with [Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are *equal* in form and function. In reinforcement learning,
this means both agents have the same observation and action spaces.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of itself
to provide a more stable, stationary learning environment. This is compared
to competing against its current self in every episode, which is a constantly changing opponent.
ML-Agents provides the functionality to train both symmetric and asymmetric adversarial games with
[Self-Play](https://openai.com/blog/competitive-self-play/).
A symmetric game is one in which opposing agents are equal in form, function and objective. Examples of symmetric games
are our Tennis and Soccer example environments. In reinforcement learning, this means both agents have the same observation and
action spaces and learn from the same reward function and so *they can share the same policy*. In asymmetric games,
this is not the case. An example of an asymmetric game is Hide and Seek. Agents in these
types of games do not always have the same observation or action spaces and so sharing policy networks is not
necessarily ideal.
With self-play, an agent learns in adversarial games by competing against fixed, past versions of its opponent
(which could be itself as in symmetric games) to provide a more stable, stationary learning environment. This is compared
to competing against the current, best opponent in every episode, which is constantly changing (because it's learning).
However, from the perspective of an individual agent, these scenarios appear to have non-stationary dynamics because the opponent is often changing.
This can cause significant issues in the experience replay mechanism used by SAC. Thus, we recommend that users use PPO. For further reading on
this issue in particular, see the paper [Stabilising Experience Replay for Deep Multi-Agent Reinforcement Learning](https://arxiv.org/pdf/1702.08887.pdf).
For more general information on training with ML-Agents, see [Training ML-Agents](Training-ML-Agents.md).
For more algorithm specific instruction, please see the documentation for [PPO](Training-PPO.md) or [SAC](Training-SAC.md).

See the trainer configuration and agent prefabs for our Tennis environment for an example.
***Team ID must be 0 or an integer greater than 0.***
In symmetric games, since all agents (even on opposing teams) will share the same policy, they should have the same 'Behavior Name' in their
Behavior Parameters Script. In asymmetric games, they should have a different Behavior Name in their Behavior Parameters script.
Note, in asymmetric games, the agents must have both different Behavior Names *and* different team IDs! Then, specify the trainer configuration
for each Behavior Name in your scene as you would normally, and remember to include the self-play hyperparameter hierarchy!
For examples of how to use this feature, you can see the trainer configurations and agent prefabs for our Tennis and Soccer environments.
Tennis and Soccer provide examples of symmetric games. To train an asymmetric game, specify trainer configurations for each of your behavior names
and include the self-play hyperparameter hierarchy in both.
## Best Practices Training with Self-Play

Training against a set of slowly or unchanging adversaries with low diversity
results in a more stable learning process than training against a set of quickly
changing adversaries with high diversity. With this context, this guide discusses the exposed self-play hyperparameters and intuitions for tuning them.
changing adversaries with high diversity. With this context, this guide discusses
the exposed self-play hyperparameters and intuitions for tuning them.
## Hyperparameters

### Save Steps
The `save_steps` parameter corresponds to the number of *trainer steps* between snapshots. For example, if `save_steps`=10000 then a snapshot of the current policy will be saved every 10000 trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
The `save_steps` parameter corresponds to the number of *trainer steps* between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
### Team Change
The `team_change` parameter corresponds to the number of *trainer_steps* between switching the learning team.
This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team
becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar
performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents
per team switch.
A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents
the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies
and so the agent may fail against the next batch of opponents.
The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we
recommend setting this value as a function of the `save_steps` parameter discussed previously.
Recommended Range : 4x-10x where x=`save_steps`
The `swap_steps` parameter corresponds to the number of *trainer steps* between swapping the opponent's policy with a different snapshot. As in the `save_steps` discussion, note that trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13.
The `swap_steps` parameter corresponds to the number of *ghost steps* (not trainer steps) between swapping the opponent's policy with a different snapshot.
A 'ghost step' refers to a step taken by an agent *that is following a fixed policy and not learning*. The reason for this distinction is that in asymmetric games,
we may have teams with an unequal number of agents e.g. a 2v1 scenario. The team with two agents collects
twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number
of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if
a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents`
agents during `team_change` total steps is:
```
swap_steps = (num_agents / num_opponent_agents) * (team_change / x)
```
As an example, in a 2v1 scenario, if we want the swap to occur `x=4` times during `team_change=200000` steps,
the `swap_steps` for the team of one agent is:
```
swap_steps = (1 / 2) * (200000 / 4) = 25000
```
The `swap_steps` for the team of two agents is:
```
swap_steps = (2 / 1) * (200000 / 4) = 100000
```
Note, with equal team sizes, the first term is equal to 1 and `swap_steps` can be calculated by just dividing the total steps by the desired number of swaps.
### Play against current self ratio
### Play against latest model ratio
The `play_against_current_self_ratio` parameter corresponds to the probability
an agent will play against its ***current*** self. With probability
1 - `play_against_current_self_ratio`, the agent will play against a snapshot of itself
from a past iteration.
The `play_against_latest_model_ratio` parameter corresponds to the probability
an agent will play against the latest opponent policy. With probability
1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its
opponent from a past iteration.
A larger value of `play_against_current_self_ratio` indicates that an agent will be playing against itself more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy.
Recommended Range : 0.0 - 1.0
Range : 0.0 - 1.0
### Window

In adversarial games, the cumulative environment reward may not be a meaningful metric by which to track learning progress. This is because cumulative reward is entirely dependent on the skill of the opponent. An agent at a particular skill level will get more or less reward against a worse or better agent, respectively.
We provide an implementation of the ELO rating system, a method for calculating the relative skill level between two players from a given population in a zero-sum game. For more information on ELO, please see [the ELO wiki](https://en.wikipedia.org/wiki/Elo_rating_system).
In a proper training run, the ELO of the agent should steadily increase. The absolute value of the ELO is less important than the change in ELO over training iterations.
In a proper training run, the ELO of the agent should steadily increase. The absolute value of the ELO is less important than the change in ELO over training iterations.
Note, this implementation will support any number of teams but ELO is only applicable to games with two teams. It is ongoing work to implement
a reliable metric for measuring progress in scenarios with three or more teams. These scenarios can still train, though as of now, reward and qualitative observations
are the only metrics by which we can judge performance.

999
docs/images/3dball_big.png
文件差异内容过多而无法显示
查看文件

852
docs/images/3dball_small.png

之前 之后
宽度: 906  |  高度: 759  |  大小: 165 KiB

974
docs/images/curriculum.png

之前 之后
宽度: 2066  |  高度: 342  |  大小: 152 KiB

999
docs/images/ml-agents-LSTM.png
文件差异内容过多而无法显示
查看文件

181
docs/images/monitor.png

之前 之后
宽度: 961  |  高度: 745  |  大小: 33 KiB

15
gym-unity/README.md


information on the gym interface, see [here](https://github.com/openai/gym).
We provide a gym wrapper and instructions for using it with existing machine
learning algorithms which utilize gyms. Both wrappers provide interfaces on top
learning algorithms which utilize gym. Our wrapper provides interfaces on top
of our `UnityEnvironment` class, which is the default way of interfacing with a
Unity environment via Python.

or by running the following from the `/gym-unity` directory of the repository:
```sh
pip install .
pip install -e .
```
## Using the Gym Wrapper

```python
from gym_unity.envs import UnityEnv
env = UnityEnv(environment_filename, worker_id, use_visual, uint8_visual, multiagent)
env = UnityEnv(environment_filename, worker_id, use_visual, uint8_visual)
```
* `environment_filename` refers to the path to the Unity environment.

(0-255). Many common Gym environments (e.g. Atari) do this. By default they
will be floats (0.0-1.0). Defaults to `False`.
* `multiagent` refers to whether you intend to launch an environment which
contains more than one agent. Defaults to `False`.
* `flatten_branched` will flatten a branched discrete action space into a Gym Discrete.
Otherwise, it will be converted into a MultiDiscrete. Defaults to `False`.

## Limitations
* It is only possible to use an environment with a single Agent.
* It is only possible to use an environment with a **single** Agent.
* The `BatchedStepResult` output from the environment can still be accessed from the
`info` provided by `env.step(action)`.
* The `TerminalSteps` or `DecisionSteps` output from the environment can still be
accessed from the `info` provided by `env.step(action)`.
* Stacked vector observations are not supported.
* Environment registration for use with `gym.make()` is currently not supported.

342
gym-unity/gym_unity/envs/__init__.py


from gym import error, spaces
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs import logging_util

logger = logging_util.get_logger(__name__)
logging_util.set_log_level(logging_util.INFO)
GymSingleStepResult = Tuple[np.ndarray, float, bool, Dict]
GymMultiStepResult = Tuple[List[np.ndarray], List[float], List[bool], Dict]
GymStepResult = Union[GymSingleStepResult, GymMultiStepResult]
GymStepResult = Tuple[np.ndarray, float, bool, Dict]
Multi-agent environments use lists for object types, as done here:
https://github.com/openai/multiagent-particle-envs
"""
def __init__(

use_visual: bool = False,
uint8_visual: bool = False,
multiagent: bool = False,
flatten_branched: bool = False,
no_graphics: bool = False,
allow_multiple_visual_obs: bool = False,

:param worker_id: Worker number for environment.
:param use_visual: Whether to use visual observation or vector observation.
:param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0).
:param multiagent: Whether to run in multi-agent mode (lists of obs, reward, done).
:param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than
MultiDiscrete.
:param no_graphics: Whether to run the Unity simulator in no-graphics mode

)
# Take a single step so that the brain information will be sent over
if not self._env.get_agent_groups():
if not self._env.get_behavior_names():
self.agent_mapper = AgentIdIndexMapper()
self._previous_step_result: BatchedStepResult = None
self._multiagent = multiagent
self._previous_decision_step: DecisionSteps = None
self._flattener = None
# Hidden flag used by Atari environments to determine if the game is over
self.game_over = False

if len(self._env.get_agent_groups()) != 1:
if len(self._env.get_behavior_names()) != 1:
"There can only be one brain in a UnityEnvironment "
"There can only be one behavior in a UnityEnvironment "
self.brain_name = self._env.get_agent_groups()[0]
self.name = self.brain_name
self.group_spec = self._env.get_agent_group_spec(self.brain_name)
self.name = self._env.get_behavior_names()[0]
self.group_spec = self._env.get_behavior_spec(self.name)
if use_visual and self._get_n_vis_obs() == 0:
raise UnityGymException(

# Check for number of agents in scene.
self._env.reset()
step_result = self._env.get_step_result(self.brain_name)
self._check_agents(step_result.n_agents())
self._previous_step_result = step_result
self.agent_mapper.set_initial_agents(list(self._previous_step_result.agent_id))
decision_steps, _ = self._env.get_steps(self.name)
self._check_agents(len(decision_steps))
self._previous_decision_step = decision_steps
# Set observation and action spaces
if self.group_spec.is_action_discrete():

def reset(self) -> Union[List[np.ndarray], np.ndarray]:
"""Resets the state of the environment and returns an initial observation.
In the case of multi-agent environments, this is a list.
step_result = self._step(True)
n_agents = step_result.n_agents()
self._env.reset()
decision_step, _ = self._env.get_steps(self.name)
n_agents = len(decision_step)
if not self._multiagent:
res: GymStepResult = self._single_step(step_result)
else:
res = self._multi_step(step_result)
res: GymStepResult = self._single_step(decision_step)
return res[0]
def step(self, action: List[Any]) -> GymStepResult:

Accepts an action and returns a tuple (observation, reward, done, info).
In the case of multi-agent environments, these are lists.
Args:
action (object/list): an action provided by the environment
Returns:

info (dict): contains auxiliary diagnostic information, including BatchedStepResult.
info (dict): contains auxiliary diagnostic information.
# Use random actions for all other agents in environment.
if self._multiagent:
if not isinstance(action, list):
raise UnityGymException(
"The environment was expecting `action` to be a list."
)
if len(action) != self._n_agents:
raise UnityGymException(
"The environment was expecting a list of {} actions.".format(
self._n_agents
)
)
else:
if self._flattener is not None:
# Action space is discrete and flattened - we expect a list of scalars
action = [self._flattener.lookup_action(_act) for _act in action]
action = np.array(action)
else:
if self._flattener is not None:
# Translate action into list
action = self._flattener.lookup_action(action)
if self._flattener is not None:
# Translate action into list
action = self._flattener.lookup_action(action)
action = np.array(action).reshape((self._n_agents, spec.action_size))
action = self._sanitize_action(action)
self._env.set_actions(self.brain_name, action)
action = np.array(action).reshape((1, spec.action_size))
self._env.set_actions(self.name, action)
step_result = self._step()
n_agents = step_result.n_agents()
self._check_agents(n_agents)
if not self._multiagent:
single_res = self._single_step(step_result)
self.game_over = single_res[2]
return single_res
self._env.step()
decision_step, terminal_step = self._env.get_steps(self.name)
if len(terminal_step) != 0:
# The agent is done
self.game_over = True
return self._single_step(terminal_step)
multi_res = self._multi_step(step_result)
self.game_over = all(multi_res[2])
return multi_res
return self._single_step(decision_step)
def _single_step(self, info: BatchedStepResult) -> GymSingleStepResult:
def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResult:
if self.use_visual:
visual_obs = self._get_vis_obs_list(info)

"The Agent does not have vector observations and the environment was not setup "
+ "to use visual observations."
)
done = isinstance(info, TerminalSteps)
return (
default_observation,
info.reward[0],
info.done[0],
{"batched_step_result": info},
)
return (default_observation, info.reward[0], done, {"step": info})
def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray:
if self.uint8_visual:

def _multi_step(self, info: BatchedStepResult) -> GymMultiStepResult:
if self.use_visual:
self.visual_obs = self._preprocess_multi(self._get_vis_obs_list(info))
default_observation = self.visual_obs
else:
default_observation = self._get_vector_obs(info)
return (
list(default_observation),
list(info.reward),
list(info.done),
{"batched_step_result": info},
)
def _get_n_vis_obs(self) -> int:
result = 0
for shape in self.group_spec.observation_shapes:

return shape
return None
def _get_vis_obs_list(self, step_result: BatchedStepResult) -> List[np.ndarray]:
def _get_vis_obs_list(
self, step_result: Union[DecisionSteps, TerminalSteps]
) -> List[np.ndarray]:
result: List[np.ndarray] = []
for obs in step_result.obs:
if len(obs.shape) == 4:

def _get_vector_obs(self, step_result: BatchedStepResult) -> np.ndarray:
def _get_vector_obs(
self, step_result: Union[DecisionSteps, TerminalSteps]
) -> np.ndarray:
result: List[np.ndarray] = []
for obs in step_result.obs:
if len(obs.shape) == 2:

result += shape[0]
return result
def _preprocess_multi(
self, multiple_visual_obs: List[np.ndarray]
) -> List[np.ndarray]:
if self.uint8_visual:
return [
(255.0 * _visual_obs).astype(np.uint8)
for _visual_obs in multiple_visual_obs
]
else:
return multiple_visual_obs
def render(self, mode="rgb_array"):
return self.visual_obs

return
def _check_agents(self, n_agents: int) -> None:
if not self._multiagent and n_agents > 1:
raise UnityGymException(
"The environment was launched as a single-agent environment, however "
"there is more than one agent in the scene."
)
elif self._multiagent and n_agents <= 1:
raise UnityGymException(
"The environment was launched as a mutli-agent environment, however "
"there is only one agent in the scene."
)
if self._n_agents == -1:
self._n_agents = n_agents
logger.info("{} agents within environment.".format(n_agents))
elif self._n_agents != n_agents:
if self._n_agents > 1:
"The number of agents in the environment has changed since "
"initialization. This is not supported."
"There can only be one Agent in the environment but {n_agents} were detected."
def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
    """
    Reorder a raw step result so every gym slot keeps a stable position.

    Agents flagged done are retired from the agent mapper, agents that were
    not in the previous step result take over a freed slot (inheriting the
    retired agent's final reward and a done flag), and all per-agent arrays
    are then permuted into the mapper's slot order.

    Note: ``step_result.done`` and ``step_result.reward`` are modified in
    place, and ``self._previous_step_result`` is replaced by ``step_result``.

    :param step_result: The step result as received from the environment.
    :return: A new BatchedStepResult whose arrays follow the gym slot order.
    :raises UnityGymException: If the agent count does not match
        ``self._n_agents``.
    """
    mismatch_message = (
        "The number of agents in the scene does not match the expected number."
    )
    if step_result.n_agents() - self._n_agents < 0:
        # In this case, some Agents did not request a decision when expected
        raise UnityGymException(mismatch_message)
    if step_result.n_agents() - sum(step_result.done) != self._n_agents:
        raise UnityGymException(mismatch_message)

    # Pass 1: retire every agent that reported done, remembering its reward.
    for position, agent_id in enumerate(step_result.agent_id):
        if not step_result.done[position]:
            continue
        self.agent_mapper.mark_agent_done(agent_id, step_result.reward[position])

    # Pass 2: set the new AgentDone flags to True.
    # Note that the corresponding agent_id that gets marked done will be
    # different than the original agent that was done, but this is OK since
    # the gym interface only cares about the ordering.
    for position, agent_id in enumerate(step_result.agent_id):
        if self._previous_step_result.contains_agent(agent_id):
            continue
        if step_result.done[position]:
            # The Agent is already done (e.g. it ended its episode twice in
            # one step), so don't try to register it here.
            continue
        # Register this agent, and get the reward of the previous agent that
        # was in its index, so that we can return it to the gym.
        inherited_reward = self.agent_mapper.register_new_agent_id(agent_id)
        step_result.done[position] = True
        step_result.reward[position] = inherited_reward

    self._previous_step_result = step_result  # store the new original

    # Get a permutation of the agent IDs so that a given ID stays in the same
    # index as where it was first seen.
    slot_order = self.agent_mapper.get_id_permutation(list(step_result.agent_id))

    reordered_mask = None
    if step_result.action_mask is not None:
        reordered_mask = [branch[slot_order] for branch in step_result.action_mask]
    reordered_obs = [single_obs[slot_order] for single_obs in step_result.obs]

    return BatchedStepResult(
        obs=reordered_obs,
        reward=step_result.reward[slot_order],
        done=step_result.done[slot_order],
        max_step=step_result.max_step[slot_order],
        agent_id=step_result.agent_id[slot_order],
        action_mask=reordered_mask,
    )
def _sanitize_action(self, action: np.array) -> np.array:
sanitized_action = np.zeros(
(self._previous_step_result.n_agents(), self.group_spec.action_size)
)
for index, agent_id in enumerate(self._previous_step_result.agent_id):
if not self._previous_step_result.done[index]:
array_index = self.agent_mapper.get_gym_index(agent_id)
sanitized_action[index, :] = action[array_index, :]
return sanitized_action
def _step(self, needs_reset: bool = False) -> BatchedStepResult:
    """
    Advance (or reset) the wrapped environment and return a sanitized result.

    :param needs_reset: When True the environment is reset instead of stepped.
    :return: The result of ``self._sanitize_info`` applied to the first step
        result in which enough agents are not done.
    :raises UnityGymException: If too few agents are present and not all of
        the reported agents are done.
    """
    advance = self._env.reset if needs_reset else self._env.step
    advance()
    info = self._env.get_step_result(self.brain_name)

    # Two possible cases here:
    # 1) all agents requested decisions (some of which might be done)
    # 2) some Agents were marked Done in between steps.
    # In case 2, we re-request decisions until all agents request a real decision.
    while True:
        active_agents = info.n_agents() - sum(info.done)
        if not active_agents < self._n_agents:
            break
        if not info.done.all():
            raise UnityGymException(
                "The environment does not have the expected amount of agents. "
                + "Some agents did not request decisions at the same time."
            )
        # Every reported agent is done: retire them all, then step again.
        for agent_id, reward in zip(info.agent_id, info.reward):
            self.agent_mapper.mark_agent_done(agent_id, reward)
        self._env.step()
        info = self._env.get_step_result(self.brain_name)

    return self._sanitize_info(info)
@property
def metadata(self):
    """Return the gym metadata dict listing the supported render modes."""
    supported_modes = ["rgb_array"]
    return {"render.modes": supported_modes}

:return: The List containing the branched actions.
"""
return self.action_lookup[action]
class AgentIdIndexMapper:
    """
    Tracks which gym index (slot) each live agent id occupies.

    When an agent is marked done its slot is freed and its final reward is
    remembered; the next newly registered agent takes over a freed slot and
    receives that remembered reward.
    """

    def __init__(self) -> None:
        # Live agent id -> gym index it currently occupies.
        self._agent_id_to_gym_index: Dict[int, int] = {}
        # Freed gym index -> final reward of the agent that vacated it.
        self._done_agents_index_to_last_reward: Dict[int, float] = {}

    def set_initial_agents(self, agent_ids: List[int]) -> None:
        """
        Provide the initial list of agent ids for the mapper.
        Each id is assigned the index of its position in ``agent_ids``.
        """
        for idx, agent_id in enumerate(agent_ids):
            self._agent_id_to_gym_index[agent_id] = idx

    def mark_agent_done(self, agent_id: int, reward: float) -> None:
        """
        Declare the agent done with the corresponding final reward.
        Unknown agent ids are ignored.
        """
        gym_index = self._agent_id_to_gym_index.pop(agent_id, None)
        if gym_index is None:
            # Agent was never registered in the first place
            # (e.g. EndEpisode called multiple times)
            return
        self._done_agents_index_to_last_reward[gym_index] = reward

    def register_new_agent_id(self, agent_id: int) -> float:
        """
        Adds the new agent ID and returns the reward to use for the previous
        agent in this index.

        :raises KeyError: If no agent has been marked done, i.e. there is no
            free index to hand out. The mapper is left unchanged in that case.
        """
        if not self._done_agents_index_to_last_reward:
            # popitem() on an empty dict would raise the same exception type,
            # but with a message that says nothing about the actual problem.
            raise KeyError(
                "Cannot register agent id {}: no gym index has been freed by a "
                "done agent.".format(agent_id)
            )
        # Any free index is OK here.
        free_index, last_reward = self._done_agents_index_to_last_reward.popitem()
        self._agent_id_to_gym_index[agent_id] = free_index
        return last_reward

    def get_id_permutation(self, agent_ids: List[int]) -> List[int]:
        """
        Get the permutation from new agent ids to the order that preserves the
        positions of previous agents.
        The result is a list with each integer from 0 to
        len(_agent_id_to_gym_index)-1 appearing exactly once.

        :raises KeyError: If a registered agent id is missing from ``agent_ids``.
        """
        # Map the new agent ids to their index
        new_agent_ids_to_index = {
            agent_id: idx for idx, agent_id in enumerate(agent_ids)
        }
        # Make the output list. We don't write to it sequentially, so start
        # with dummy values.
        new_permutation = [-1] * len(self._agent_id_to_gym_index)
        # For each agent ID, find the new index of the agent, and write it in
        # the original index.
        for agent_id, original_index in self._agent_id_to_gym_index.items():
            new_permutation[original_index] = new_agent_ids_to_index[agent_id]
        return new_permutation

    def get_gym_index(self, agent_id: int) -> int:
        """
        Get the gym index for the current agent.

        :raises KeyError: If the agent id is not currently registered.
        """
        return self._agent_id_to_gym_index[agent_id]
class AgentIdIndexMapperSlow:
    """
    Reference implementation of AgentIdIndexMapper.
    The operations are O(N^2) so it shouldn't be used for large numbers of agents.
    See AgentIdIndexMapper for method descriptions
    """

    def __init__(self) -> None:
        # Agent id held by each gym index; -1 marks a freed index.
        self._gym_id_order: List[int] = []
        # Freed gym index -> final reward of the agent that vacated it.
        self._done_agents_index_to_last_reward: Dict[int, float] = {}

    def set_initial_agents(self, agent_ids: List[int]) -> None:
        """Store a copy of ``agent_ids`` as the initial slot order."""
        self._gym_id_order = list(agent_ids)

    def mark_agent_done(self, agent_id: int, reward: float) -> None:
        """Free the agent's slot and remember its final reward; unknown ids are ignored."""
        try:
            gym_index = self._gym_id_order.index(agent_id)
        except ValueError:
            # Agent was never registered in the first place
            # (e.g. EndEpisode called multiple times)
            return
        self._done_agents_index_to_last_reward[gym_index] = reward
        self._gym_id_order[gym_index] = -1

    def register_new_agent_id(self, agent_id: int) -> float:
        """
        Put the new agent into the first freed slot and return the reward
        remembered for that slot.

        :raises ValueError: If no slot has been freed. The mapper is left
            unchanged in that case.
        """
        try:
            original_index = self._gym_id_order.index(-1)
        except ValueError:
            # Same exception type list.index() raises, but with a message that
            # names the actual problem instead of "-1 is not in list".
            raise ValueError(
                "Cannot register agent id {}: no gym index has been freed by a "
                "done agent.".format(agent_id)
            ) from None
        self._gym_id_order[original_index] = agent_id
        reward = self._done_agents_index_to_last_reward.pop(original_index)
        return reward

    def get_id_permutation(self, agent_ids: List[int]) -> List[int]:
        """
        For each slot, in slot order, return the position of that slot's agent
        id inside ``agent_ids``.
        """
        return [agent_ids.index(agent_id) for agent_id in self._gym_id_order]

    def get_gym_index(self, agent_id: int) -> int:
        """Return the slot currently holding ``agent_id``."""
        return self._gym_id_order.index(agent_id)

198
gym-unity/gym_unity/tests/test_gym.py


import numpy as np
from gym import spaces
from gym_unity.envs import (
UnityEnv,
UnityGymException,
AgentIdIndexMapper,
AgentIdIndexMapperSlow,
from gym_unity.envs import UnityEnv
from mlagents_envs.base_env import (
BehaviorSpec,
ActionType,
DecisionSteps,
TerminalSteps,
from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
mock_step = create_mock_vector_step_result()
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
mock_decision_step, mock_terminal_step = create_mock_vector_steps(mock_spec)
setup_mock_unityenvironment(
mock_env, mock_spec, mock_decision_step, mock_terminal_step
)
env = UnityEnv(" ", use_visual=False, multiagent=False)
env = UnityEnv(" ", use_visual=False)
assert isinstance(env, UnityEnv)
assert isinstance(env.reset(), np.ndarray)
actions = env.action_space.sample()

@mock.patch("gym_unity.envs.UnityEnvironment")
def test_multi_agent(mock_env):
    """
    With two agents in the mocked scene, a single-agent UnityEnv must refuse
    to start, while a multi-agent UnityEnv returns list/dict typed results.
    """
    spec = create_mock_group_spec()
    two_agent_step = create_mock_vector_step_result(num_agents=2)
    setup_mock_unityenvironment(mock_env, spec, two_agent_step)

    with pytest.raises(UnityGymException):
        UnityEnv(" ", multiagent=False)

    env = UnityEnv(" ", use_visual=False, multiagent=True)
    assert isinstance(env.reset(), list)

    actions = [env.action_space.sample() for _ in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    for value, expected_type in ((obs, list), (rew, list), (done, list), (info, dict)):
        assert isinstance(value, expected_type)
@mock.patch("gym_unity.envs.UnityEnvironment")
mock_step = create_mock_vector_step_result(num_agents=1)
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
mock_decision_step, mock_terminal_step = create_mock_vector_steps(
mock_spec, num_agents=1
)
setup_mock_unityenvironment(
mock_env, mock_spec, mock_decision_step, mock_terminal_step
)
env = UnityEnv(" ", use_visual=False, multiagent=False, flatten_branched=True)
env = UnityEnv(" ", use_visual=False, flatten_branched=True)
assert isinstance(env.action_space, spaces.Discrete)
assert env.action_space.n == 12
assert env._flattener.lookup_action(0) == [0, 0, 0]

env = UnityEnv(" ", use_visual=False, multiagent=False, flatten_branched=False)
env = UnityEnv(" ", use_visual=False, flatten_branched=False)
assert isinstance(env.action_space, spaces.MultiDiscrete)

mock_spec = create_mock_group_spec(number_visual_observations=1)
mock_step = create_mock_vector_step_result(number_visual_observations=1)
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
mock_decision_step, mock_terminal_step = create_mock_vector_steps(
mock_spec, number_visual_observations=1
)
setup_mock_unityenvironment(
mock_env, mock_spec, mock_decision_step, mock_terminal_step
)
env = UnityEnv(" ", use_visual=True, multiagent=False, uint8_visual=use_uint8)
env = UnityEnv(" ", use_visual=True, uint8_visual=use_uint8)
assert isinstance(env, UnityEnv)
assert isinstance(env.reset(), np.ndarray)
actions = env.action_space.sample()

assert isinstance(info, dict)
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_shuffled_id(mock_env):
    """
    A step result whose agents arrive in shuffled order must come out of
    _sanitize_info in the original 0..4 order, rewards included.
    """
    spec = create_mock_group_spec(
        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
    )
    initial_step = create_mock_vector_step_result(num_agents=5)
    initial_step.agent_id = np.array(range(5))
    setup_mock_unityenvironment(mock_env, spec, initial_step)
    env = UnityEnv(" ", use_visual=False, multiagent=True)

    # Rewards mirror the ids so that both can be checked against range(5).
    shuffled_order = [4, 2, 3, 1, 0]
    shuffled_step = create_mock_vector_step_result(num_agents=5)
    shuffled_step.reward = np.array(shuffled_order)
    shuffled_step.agent_id = np.array(shuffled_order)

    sanitized = env._sanitize_info(shuffled_step)
    assert all(want == got for want, got in zip(range(5), sanitized.reward))
    assert all(want == got for want, got in zip(range(5), sanitized.agent_id))
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_one_agent_done(mock_env):
    """
    When one of five agents is done and a sixth agent appears, the new agent
    must take over the slot of the done agent.
    """
    spec = create_mock_group_spec(
        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
    )
    initial_step = create_mock_vector_step_result(num_agents=5)
    initial_step.agent_id = np.array(range(5))
    setup_mock_unityenvironment(mock_env, spec, initial_step)
    env = UnityEnv(" ", use_visual=False, multiagent=True)

    received = create_mock_vector_step_result(num_agents=6)
    received.agent_id = np.array(range(6))
    # agent #3 (id = 2) is Done
    done_flags = [False] * 6
    done_flags[2] = True
    received.done = np.array(done_flags)

    sanitized = env._sanitize_info(received)
    expected_ids = [0, 1, 5, 3, 4]
    assert all(want == got for want, got in zip(expected_ids, sanitized.agent_id))
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_new_agent_done(mock_env):
    """
    When an original agent is done and the first replacement agent is also
    already done, the second replacement must take over the freed slot.
    """
    mock_spec = create_mock_group_spec(
        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
    )
    # The initial step must describe five agents consistently: it was built
    # with num_agents=3 while being assigned five agent ids, which left the
    # obs/reward/done arrays with three rows for five ids.
    mock_step = create_mock_vector_step_result(num_agents=5)
    mock_step.agent_id = np.array(range(5))
    setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
    env = UnityEnv(" ", use_visual=False, multiagent=True)

    received_step_result = create_mock_vector_step_result(num_agents=7)
    received_step_result.agent_id = np.array(range(7))
    # agent #3 (id = 2) is Done
    # so is the "new" agent (id = 5)
    done = [False] * 7
    done[2] = True
    done[5] = True
    received_step_result.done = np.array(done)

    sanitized_result = env._sanitize_info(received_step_result)
    for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id):
        assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_single_agent_multiple_done(mock_env):
    """
    In a single-agent environment, if the original agent and its first
    replacement are both done, the surviving third agent fills the slot.
    """
    spec = create_mock_group_spec(
        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
    )
    initial_step = create_mock_vector_step_result(num_agents=1)
    initial_step.agent_id = np.array(range(1))
    setup_mock_unityenvironment(mock_env, spec, initial_step)
    env = UnityEnv(" ", use_visual=False, multiagent=False)

    received = create_mock_vector_step_result(num_agents=3)
    received.agent_id = np.array(range(3))
    # original agent (id = 0) is Done
    # so is the "new" agent (id = 1)
    received.done = np.array([True, True, False])

    sanitized = env._sanitize_info(received)
    assert all(want == got for want, got in zip([2], sanitized.agent_id))
# Helper methods

obs_shapes = [(vector_observation_space_size,)]
for _ in range(number_visual_observations):
obs_shapes += [(8, 8, 3)]
return AgentGroupSpec(obs_shapes, act_type, vector_action_space_size)
return BehaviorSpec(obs_shapes, act_type, vector_action_space_size)
def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
def create_mock_vector_steps(specs, num_agents=1, number_visual_observations=0):
:BehaviorSpecs specs: The BehaviorSpecs for this mock
:int num_agents: Number of "agents" to imitate in your BatchedStepResult values.
"""
obs = [np.array([num_agents * [1, 2, 3]]).reshape(num_agents, 3)]

done = np.array(num_agents * [False])
return BatchedStepResult(obs, rewards, done, done, agents, None)
return DecisionSteps(obs, rewards, agents, None), TerminalSteps.empty(specs)
def setup_mock_unityenvironment(mock_env, mock_spec, mock_result):
def setup_mock_unityenvironment(mock_env, mock_spec, mock_decision, mock_termination):
"""
Takes a mock UnityEnvironment and adds the appropriate properties, defined by the mock
GroupSpec and BatchedStepResult.

:Mock mock_result: A BatchedStepResult object that will be returned at each step and reset.
:Mock mock_decision: A DecisionSteps object that will be returned at each step and reset.
:Mock mock_termination: A TerminalSteps object that will be returned at each step and reset.
mock_env.return_value.get_agent_groups.return_value = ["MockBrain"]
mock_env.return_value.get_agent_group_spec.return_value = mock_spec
mock_env.return_value.get_step_result.return_value = mock_result
@pytest.mark.parametrize("mapper_cls", [AgentIdIndexMapper, AgentIdIndexMapperSlow])
def test_agent_id_index_mapper(mapper_cls):
    """
    Both mapper implementations must hand freed slots to new agents, return
    the retired agents' rewards, and keep surviving agents in their slots.
    """
    starting_ids = [1001, 1002, 1003, 1004]
    mapper = mapper_cls()
    mapper.set_initial_agents(starting_ids)

    # Mark some agents as done with their last rewards.
    mapper.mark_agent_done(1001, 42.0)
    mapper.mark_agent_done(1004, 1337.0)
    # Make sure we can handle an unknown agent id being marked done.
    # This can happen when an agent ends an episode on the same step it starts.
    mapper.mark_agent_done(9999, -1.0)

    # Now add new agents, and get the rewards of the agent they replaced.
    # Order of the rewards doesn't matter.
    inherited = {mapper.register_new_agent_id(new_id) for new_id in (2001, 2002)}
    assert inherited == {42.0, 1337.0}

    current_ids = [1002, 1003, 2001, 2002]
    permutation = mapper.get_id_permutation(current_ids)
    # Make sure it's actually a permutation - needs to contain 0..N-1 with no repeats.
    assert set(permutation) == set(range(0, 4))

    # Agents that were in the initial group need to be in the same slot.
    # Agents that were added later can appear in any free slot.
    ids_by_slot = [current_ids[i] for i in permutation]
    for slot, agent_id in enumerate(starting_ids):
        if agent_id not in ids_by_slot:
            continue
        assert ids_by_slot[slot] == agent_id
mock_env.return_value.get_behavior_names.return_value = ["MockBrain"]
mock_env.return_value.get_behavior_spec.return_value = mock_spec
mock_env.return_value.get_steps.return_value = (mock_decision, mock_termination)

249
ml-agents-envs/mlagents_envs/base_env.py


"""
Python Environment API for the ML-Agents toolkit
The aim of this API is to expose groups of similar Agents evolving in Unity
The aim of this API is to expose Agents evolving in a simulation
There can be multiple groups of similar Agents (same observations and actions
spaces) in the simulation. These groups are identified by a agent_group that
corresponds to a single group of Agents in the simulation.
This API supports multi-agent scenarios and groups similar Agents (same
observations, actions spaces and behavior) together. These groups of Agents are
identified by their BehaviorName.
batched manner. When retrieving the state of a group of Agents, said state
contains the data for the whole group. Agents in these groups are identified
by a unique int identifier that allows tracking of Agents across simulation
steps. Note that there is no guarantee that the number or order of the Agents
in the state will be consistent across simulation steps.
batched manner. Agents are identified by a unique AgentId identifier that
allows tracking of Agents across simulation steps. Note that there is no
guarantee that the number or order of the Agents in the state will be
consistent across simulation steps.
A simulation step corresponds to moving the simulation forward until at least
one agent in the simulation sends its observations to Python again. Since
Agents can request decisions at different frequencies, a simulation step does

from abc import ABC, abstractmethod
from typing import List, NamedTuple, Tuple, Optional, Union, Dict
from collections.abc import Mapping
from typing import List, NamedTuple, Tuple, Optional, Union, Dict, Iterator, Any
AgentGroup = str
BehaviorName = str
class StepResult(NamedTuple):
class DecisionStep(NamedTuple):
- obs is a list of numpy arrays observations collected by the group of
agent.
- obs is a list of numpy arrays observations collected by the agent.
- done is a bool. Is true if the Agent was terminated during the last
simulation step.
- max_step is a bool. Is true if the Agent reached its maximum number of
steps during the last simulation step.
- agent_id is an int and an unique identifier for the corresponding Agent.
- action_mask is an optional list of one dimensional array of booleans.
Only available in multi-discrete action space type.

obs: List[np.ndarray]
reward: float
done: bool
max_step: bool
class BatchedStepResult:
class DecisionSteps(Mapping):
Contains the data a group of similar Agents collected since the last
Contains the data a batch of similar Agents collected since the last
agents and the batch size of the BatchedStepResult are not fixed across
agents and the batch size of the DecisionSteps are not fixed across
- obs is a list of numpy arrays observations collected by the group of
agent. Each obs has one extra dimension compared to StepResult: the first
dimension of the array corresponds to the batch size of
the group.
- obs is a list of numpy arrays observations collected by the batch of
agent. Each obs has one extra dimension compared to DecisionStep: the
first dimension of the array corresponds to the batch size of the batch.
- done is an array of booleans of length batch size. Is true if the
associated Agent was terminated during the last simulation step.
- max_step is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
- agent_id is an int vector of length batch size containing unique
identifier for the corresponding Agent. This is used to track Agents
across simulation steps.

this simulation step.
"""
def __init__(self, obs, reward, done, max_step, agent_id, action_mask):
def __init__(self, obs, reward, agent_id, action_mask):
self.done: np.ndarray = done
self.max_step: np.ndarray = max_step
self.agent_id: np.ndarray = agent_id
self.action_mask: Optional[List[np.ndarray]] = action_mask
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

"""
:returns: A Dict that maps agent_id to the index of those agents in
this BatchedStepResult.
this DecisionSteps.
"""
if self._agent_id_to_index is None:
self._agent_id_to_index = {}

def contains_agent(self, agent_id: AgentId) -> bool:
return agent_id in self.agent_id_to_index
def __len__(self) -> int:
return len(self.agent_id)
def get_agent_step_result(self, agent_id: AgentId) -> StepResult:
def __getitem__(self, agent_id: AgentId) -> DecisionStep:
returns the step result for a specific agent.
returns the DecisionStep for a specific agent.
:returns: obs, reward, done, agent_id and optional action mask for a
specific agent
:returns: The DecisionStep
if not self.contains_agent(agent_id):
raise IndexError(
"get_agent_step_result failed. agent_id {} is not present in the BatchedStepResult".format(
agent_id
)
if agent_id not in self.agent_id_to_index:
raise KeyError(
"agent_id {} is not present in the DecisionSteps".format(agent_id)
)
agent_index = self._agent_id_to_index[agent_id] # type: ignore
agent_obs = []

agent_mask = []
for mask in self.action_mask:
agent_mask.append(mask[agent_index])
return StepResult(
return DecisionStep(
done=self.done[agent_index],
max_step=self.max_step[agent_index],
def __iter__(self) -> Iterator[Any]:
yield from self.agent_id
def empty(spec: "AgentGroupSpec") -> "BatchedStepResult":
def empty(spec: "BehaviorSpec") -> "DecisionSteps":
Returns an empty BatchedStepResult.
:param spec: The AgentGroupSpec for the BatchedStepResult
Returns an empty DecisionSteps.
:param spec: The BehaviorSpec for the DecisionSteps
return BatchedStepResult(
return DecisionSteps(
done=np.zeros(0, dtype=np.bool),
max_step=np.zeros(0, dtype=np.bool),
def n_agents(self) -> int:
class TerminalStep(NamedTuple):
    """
    The data one Agent reported at the moment its episode ended.
     - obs: list of numpy arrays, the observations collected by the agent.
     - reward: float, the reward collected by the agent since the last
     simulation step.
     - max_step: bool, True if the Agent reached its maximum number of steps
     during the last simulation step.
     - agent_id: the unique identifier of the corresponding Agent.
    """

    obs: List[np.ndarray]
    reward: float
    max_step: bool
    agent_id: AgentId
class TerminalSteps(Mapping):
"""
Contains the data a batch of Agents collected when their episode
terminated. All Agents present in the TerminalSteps have ended their
episode.
- obs is a list of numpy arrays observations collected by the batch of
agent. Each obs has one extra dimension compared to DecisionStep: the
first dimension of the array corresponds to the batch size of the batch.
- reward is a float vector of length batch size. Corresponds to the
rewards collected by each agent since the last simulation step.
- max_step is an array of booleans of length batch size. Is true if the
associated Agent reached its maximum number of steps during the last
simulation step.
- agent_id is an int vector of length batch size containing unique
identifier for the corresponding Agent. This is used to track Agents
across simulation steps.
"""
def __init__(self, obs, reward, max_step, agent_id):
self.obs: List[np.ndarray] = obs
self.reward: np.ndarray = reward
self.max_step: np.ndarray = max_step
self.agent_id: np.ndarray = agent_id
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
@property
def agent_id_to_index(self) -> Dict[AgentId, int]:
    """
    :returns: A Dict that maps agent_id to the index of those agents in
    this TerminalSteps. The dict is built on first access and then reused.
    """
    if self._agent_id_to_index is None:
        self._agent_id_to_index = {
            a_id: a_idx for a_idx, a_id in enumerate(self.agent_id)
        }
    return self._agent_id_to_index
def __len__(self) -> int:
def __getitem__(self, agent_id: AgentId) -> TerminalStep:
    """
    Returns the TerminalStep for a specific agent.
    :param agent_id: The id of the agent
    :returns: A TerminalStep holding the obs, reward, max_step flag and
    agent_id of that agent
    :raises KeyError: If the agent_id is not present in this TerminalSteps
    """
    index_by_id = self.agent_id_to_index
    if agent_id not in index_by_id:
        raise KeyError(
            "agent_id {} is not present in the TerminalSteps".format(agent_id)
        )
    row = index_by_id[agent_id]
    # Slice this agent's row out of every batched observation.
    single_agent_obs = [batched_obs[row] for batched_obs in self.obs]
    return TerminalStep(
        obs=single_agent_obs,
        reward=self.reward[row],
        max_step=self.max_step[row],
        agent_id=agent_id,
    )
def __iter__(self) -> Iterator[Any]:
yield from self.agent_id
@staticmethod
def empty(spec: "BehaviorSpec") -> "TerminalSteps":
    """
    Returns an empty TerminalSteps.
    :param spec: The BehaviorSpec for the TerminalSteps
    :returns: A TerminalSteps with zero agents: one float32 observation array
    of shape (0,) + shape per entry in spec.observation_shapes, plus
    zero-length reward (float32), max_step (bool) and agent_id (int32) arrays.
    """
    obs: List[np.ndarray] = [
        np.zeros((0,) + shape, dtype=np.float32)
        for shape in spec.observation_shapes
    ]
    return TerminalSteps(
        obs=obs,
        reward=np.zeros(0, dtype=np.float32),
        # The builtin bool yields the same dtype as the np.bool alias, which
        # was deprecated in NumPy 1.20 and removed in 1.24.
        max_step=np.zeros(0, dtype=bool),
        agent_id=np.zeros(0, dtype=np.int32),
    )
class ActionType(Enum):
DISCRETE = 0

class AgentGroupSpec(NamedTuple):
class BehaviorSpec(NamedTuple):
spaces for a group of Agents.
spaces for a group of Agents under the same behavior.
the ordering of the BatchedStepResult and StepResult.
the ordering of the DecisionSteps and TerminalSteps.
- action_type is the type of data of the action. it can be discrete or
continuous. If discrete, the action tensors are expected to be int32. If
continuous, the actions are expected to be float32.

def is_action_discrete(self) -> bool:
"""
Returns true if the Agent group uses discrete actions
Returns true if this Behavior uses discrete actions
Returns true if the Agent group uses continuous actions
Returns true if this Behavior uses continuous actions
"""
return self.action_type == ActionType.CONTINUOUS

pass
@abstractmethod
def get_agent_groups(self) -> List[AgentGroup]:
def get_behavior_names(self) -> List[BehaviorName]:
Returns the list of the agent group names present in the environment.
Agents grouped under the same group name have the same action and
observation specs, and are expected to behave similarly in the environment.
Returns the list of the behavior names present in the environment.
Agents grouped under the same behavior name have the same action and
observation specs, and are expected to behave similarly in the
environment.
:return: the list of agent group names.
:return: the list of agent BehaviorName.
def set_actions(self, agent_group: AgentGroup, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
the step result.
:param agent_group: The name of the group the agents are part of
the DecisionSteps.
:param behavior_name: The name of the behavior the agents are part of
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
"""

def set_action_for_agent(
self, agent_group: AgentGroup, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
:param agent_group: The name of the group the agent is part of
:param behavior_name: The name of the behavior the agent is part of
:param action: A two dimensional np.ndarray corresponding to the action
:param action: A one dimensional np.ndarray corresponding to the action
def get_step_result(self, agent_group: AgentGroup) -> BatchedStepResult:
def get_steps(
self, behavior_name: BehaviorName
) -> Tuple[DecisionSteps, TerminalSteps]:
Retrieves the observations of the agents that requested a step in the
Retrieves the steps of the agents that requested a step in the
:param agent_group: The name of the group the agents are part of
:return: A BatchedStepResult NamedTuple containing the observations,
the rewards and the done flags for this group of agents.
:param behavior_name: The name of the behavior the agents are part of
:return: A tuple containing :
- A DecisionSteps object containing the observations,
the rewards, the agent ids and the action masks for the Agents
of the specified behavior. These Agents need an action this step.
- A TerminalSteps object containing the observations,
rewards, agent ids and max_step flags of the agents that had their
episode terminated last step.
def get_agent_group_spec(self, agent_group: AgentGroup) -> AgentGroupSpec:
def get_behavior_spec(self, behavior_name: BehaviorName) -> BehaviorSpec:
Get the AgentGroupSpec corresponding to the agent group name
:param agent_group: The name of the group the agents are part of
:return: A AgentGroupSpec corresponding to that agent group name
Get the BehaviorSpec corresponding to the behavior name
:param behavior_name: The name of the behavior the agents are part of
:return: A BehaviorSpec corresponding to that behavior
"""
pass

104
ml-agents-envs/mlagents_envs/environment.py


import numpy as np
import os
import subprocess
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Tuple
import mlagents_envs

from mlagents_envs.base_env import (
BaseEnv,
BatchedStepResult,
AgentGroupSpec,
AgentGroup,
DecisionSteps,
TerminalSteps,
BehaviorSpec,
BehaviorName,
AgentId,
)
from mlagents_envs.timers import timed, hierarchical_timer

)
from mlagents_envs.communicator_objects.command_pb2 import STEP, RESET
from mlagents_envs.rpc_utils import (
agent_group_spec_from_proto,
batched_step_result_from_proto,
)
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.communicator_objects.unity_rl_input_pb2 import UnityRLInputProto
from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto

# Currently we require strict equality between the communication protocol
# on each side, although we may allow some flexibility in the future.
# This should be incremented whenever a change is made to the communication protocol.
API_VERSION = "0.15.0"
API_VERSION = "0.16.0"
# Default port that the editor listens on. If an environment executable
# isn't specified, this port will be used.

f"Connected to Unity environment with package version {aca_params.package_version} "
f"and communication version {aca_params.communication_version}"
)
self._env_state: Dict[str, BatchedStepResult] = {}
self._env_specs: Dict[str, AgentGroupSpec] = {}
self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._update_group_specs(aca_output)
self._update_behavior_specs(aca_output)
@staticmethod
def get_communicator(worker_id, base_port, timeout_wait):

f'"chmod -R 755 {launch_string}"'
) from perm
def _update_group_specs(self, output: UnityOutputProto) -> None:
def _update_behavior_specs(self, output: UnityOutputProto) -> None:
init_output = output.rl_initialization_output
for brain_param in init_output.brain_parameters:
# Each BrainParameter in the rl_initialization_output should have at least one AgentInfo

agent = agent_infos.value[0]
new_spec = agent_group_spec_from_proto(brain_param, agent)
new_spec = behavior_spec_from_proto(brain_param, agent)
self._env_specs[brain_param.brain_name] = new_spec
logger.info(f"Connected new brain:\n{brain_param.brain_name}")

for brain_name in self._env_specs.keys():
if brain_name in output.agentInfos:
agent_info_list = output.agentInfos[brain_name].value
self._env_state[brain_name] = batched_step_result_from_proto(
self._env_state[brain_name] = steps_from_proto(
self._env_state[brain_name] = BatchedStepResult.empty(
self._env_specs[brain_name]
self._env_state[brain_name] = (
DecisionSteps.empty(self._env_specs[brain_name]),
TerminalSteps.empty(self._env_specs[brain_name]),
)
self._parse_side_channel_message(self.side_channels, output.side_channel)

if outputs is None:
raise UnityCommunicationException("Communicator has stopped.")
self._update_group_specs(outputs)
self._update_behavior_specs(outputs)
rl_output = outputs.rl_output
self._update_state(rl_output)
self._is_first_message = False

if group_name not in self._env_actions:
n_agents = 0
if group_name in self._env_state:
n_agents = self._env_state[group_name].n_agents()
n_agents = len(self._env_state[group_name][0])
self._env_actions[group_name] = self._env_specs[
group_name
].create_empty_action(n_agents)

if outputs is None:
raise UnityCommunicationException("Communicator has stopped.")
self._update_group_specs(outputs)
self._update_behavior_specs(outputs)
def get_agent_groups(self) -> List[AgentGroup]:
def get_behavior_names(self):
def _assert_group_exists(self, agent_group: str) -> None:
if agent_group not in self._env_specs:
def _assert_behavior_exists(self, behavior_name: str) -> None:
if behavior_name not in self._env_specs:
"in the environment".format(agent_group)
"in the environment".format(behavior_name)
def set_actions(self, agent_group: AgentGroup, action: np.ndarray) -> None:
self._assert_group_exists(agent_group)
if agent_group not in self._env_state:
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:
spec = self._env_specs[agent_group]
spec = self._env_specs[behavior_name]
expected_shape = (self._env_state[agent_group].n_agents(), spec.action_size)
expected_shape = (len(self._env_state[behavior_name][0]), spec.action_size)
"The group {0} needs an input of dimension {1} but received input of dimension {2}".format(
agent_group, expected_shape, action.shape
"The behavior {0} needs an input of dimension {1} but received input of dimension {2}".format(
behavior_name, expected_shape, action.shape
self._env_actions[agent_group] = action
self._env_actions[behavior_name] = action
self, agent_group: AgentGroup, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self._assert_group_exists(agent_group)
if agent_group not in self._env_state:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:
spec = self._env_specs[agent_group]
spec = self._env_specs[behavior_name]
"The Agent {0} in group {1} needs an input of dimension {2} but received input of dimension {3}".format(
agent_id, agent_group, expected_shape, action.shape
f"The Agent {0} with BehaviorName {1} needs an input of dimension "
f"{2} but received input of dimension {3}".format(
agent_id, behavior_name, expected_shape, action.shape
)
)
expected_type = np.float32 if spec.is_action_continuous() else np.int32

if agent_group not in self._env_actions:
self._env_actions[agent_group] = spec.create_empty_action(
self._env_state[agent_group].n_agents()
if behavior_name not in self._env_actions:
self._env_actions[behavior_name] = spec.create_empty_action(
len(self._env_state[behavior_name][0])
index = np.where(self._env_state[agent_group].agent_id == agent_id)[0][0]
index = np.where(self._env_state[behavior_name][0].agent_id == agent_id)[0][
0
]
except IndexError as ie:
raise IndexError(
"agent_id {} is did not request a decision at the previous step".format(

self._env_actions[agent_group][index] = action
self._env_actions[behavior_name][index] = action
def get_step_result(self, agent_group: AgentGroup) -> BatchedStepResult:
self._assert_group_exists(agent_group)
return self._env_state[agent_group]
def get_steps(
self, behavior_name: BehaviorName
) -> Tuple[DecisionSteps, TerminalSteps]:
self._assert_behavior_exists(behavior_name)
return self._env_state[behavior_name]
def get_agent_group_spec(self, agent_group: AgentGroup) -> AgentGroupSpec:
self._assert_group_exists(agent_group)
return self._env_specs[agent_group]
def get_behavior_spec(self, behavior_name: BehaviorName) -> BehaviorSpec:
self._assert_behavior_exists(behavior_name)
return self._env_specs[behavior_name]
def close(self):
"""

) -> UnityInputProto:
rl_in = UnityRLInputProto()
for b in vector_action:
n_agents = self._env_state[b].n_agents()
n_agents = len(self._env_state[b][0])
if n_agents == 0:
continue
for i in range(n_agents):

99
ml-agents-envs/mlagents_envs/rpc_utils.py


from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
from mlagents_envs.base_env import (
BehaviorSpec,
ActionType,
DecisionSteps,
TerminalSteps,
)
from mlagents_envs.exception import UnityObservationException
from mlagents_envs.timers import hierarchical_timer, timed
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto

from PIL import Image
def agent_group_spec_from_proto(
def behavior_spec_from_proto(
) -> AgentGroupSpec:
) -> BehaviorSpec:
Converts brain parameter and agent info proto to AgentGroupSpec object.
Converts brain parameter and agent info proto to BehaviorSpec object.
:return: AgentGroupSpec object.
:return: BehaviorSpec object.
"""
observation_shape = [tuple(obs.shape) for obs in agent_info.observations]
action_type = (

] = brain_param_proto.vector_action_size[0]
else:
action_shape = tuple(brain_param_proto.vector_action_size)
return AgentGroupSpec(observation_shape, action_type, action_shape)
return BehaviorSpec(observation_shape, action_type, action_shape)
@timed

@timed
def batched_step_result_from_proto(
def steps_from_proto(
group_spec: AgentGroupSpec,
) -> BatchedStepResult:
obs_list: List[np.ndarray] = []
for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
behavior_spec: BehaviorSpec,
) -> Tuple[DecisionSteps, TerminalSteps]:
decision_agent_info_list = [
agent_info for agent_info in agent_info_list if not agent_info.done
]
terminal_agent_info_list = [
agent_info for agent_info in agent_info_list if agent_info.done
]
decision_obs_list: List[np.ndarray] = []
terminal_obs_list: List[np.ndarray] = []
for obs_index, obs_shape in enumerate(behavior_spec.observation_shapes):
obs_list.append(
_process_visual_observation(obs_index, obs_shape, agent_info_list)
decision_obs_list.append(
_process_visual_observation(
obs_index, obs_shape, decision_agent_info_list
)
)
terminal_obs_list.append(
_process_visual_observation(
obs_index, obs_shape, terminal_agent_info_list
)
obs_list.append(
_process_vector_observation(obs_index, obs_shape, agent_info_list)
decision_obs_list.append(
_process_vector_observation(
obs_index, obs_shape, decision_agent_info_list
)
)
terminal_obs_list.append(
_process_vector_observation(
obs_index, obs_shape, terminal_agent_info_list
)
rewards = np.array(
[agent_info.reward for agent_info in agent_info_list], dtype=np.float32
decision_rewards = np.array(
[agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32
)
terminal_rewards = np.array(
[agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32
_raise_on_nan_and_inf(rewards, "rewards")
_raise_on_nan_and_inf(decision_rewards, "rewards")
_raise_on_nan_and_inf(terminal_rewards, "rewards")
done = np.array([agent_info.done for agent_info in agent_info_list], dtype=np.bool)
[agent_info.max_step_reached for agent_info in agent_info_list], dtype=np.bool
[agent_info.max_step_reached for agent_info in terminal_agent_info_list],
dtype=np.bool,
agent_id = np.array(
[agent_info.id for agent_info in agent_info_list], dtype=np.int32
decision_agent_id = np.array(
[agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32
)
terminal_agent_id = np.array(
[agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32
if group_spec.is_action_discrete():
if any([agent_info.action_mask is not None] for agent_info in agent_info_list):
n_agents = len(agent_info_list)
a_size = np.sum(group_spec.discrete_action_branches)
if behavior_spec.is_action_discrete():
if any(
[agent_info.action_mask is not None]
for agent_info in decision_agent_info_list
):
n_agents = len(decision_agent_info_list)
a_size = np.sum(behavior_spec.discrete_action_branches)
for agent_index, agent_info in enumerate(agent_info_list):
for agent_index, agent_info in enumerate(decision_agent_info_list):
if agent_info.action_mask is not None:
if len(agent_info.action_mask) == a_size:
mask_matrix[agent_index, :] = [

action_mask = (1 - mask_matrix).astype(np.bool)
indices = _generate_split_indices(group_spec.discrete_action_branches)
indices = _generate_split_indices(behavior_spec.discrete_action_branches)
return BatchedStepResult(obs_list, rewards, done, max_step, agent_id, action_mask)
return (
DecisionSteps(
decision_obs_list, decision_rewards, decision_agent_id, action_mask
),
TerminalSteps(terminal_obs_list, terminal_rewards, max_step, terminal_agent_id),
)
def _generate_split_indices(dims):

38
ml-agents-envs/mlagents_envs/side_channel/incoming_message.py


self.buffer = buffer
self.offset = offset
def read_bool(self) -> bool:
def read_bool(self, default_value: bool = False) -> bool:
:param default_value: Default value to use if the end of the message is reached.
:return: The value read from the message, or the default value if the end was reached.
if self._at_end_of_buffer():
return default_value
def read_int32(self) -> int:
def read_int32(self, default_value: int = 0) -> int:
:param default_value: Default value to use if the end of the message is reached.
:return: The value read from the message, or the default value if the end was reached.
if self._at_end_of_buffer():
return default_value
def read_float32(self) -> float:
def read_float32(self, default_value: float = 0.0) -> float:
:param default_value: Default value to use if the end of the message is reached.
:return: The value read from the message, or the default value if the end was reached.
if self._at_end_of_buffer():
return default_value
def read_float32_list(self) -> List[float]:
def read_float32_list(self, default_value: List[float] = None) -> List[float]:
:param default_value: Default value to use if the end of the message is reached.
:return: The value read from the message, or the default value if the end was reached.
if self._at_end_of_buffer():
return [] if default_value is None else default_value
list_len = self.read_int32()
output = []
for _ in range(list_len):

def read_string(self) -> str:
def read_string(self, default_value: str = "") -> str:
:param default_value: Default value to use if the end of the message is reached.
:return: The value read from the message, or the default value if the end was reached.
if self._at_end_of_buffer():
return default_value
encoded_str_len = self.read_int32()
val = self.buffer[self.offset : self.offset + encoded_str_len].decode("ascii")
self.offset += encoded_str_len

Get a copy of the internal bytes used by the message.
"""
return bytearray(self.buffer)
def _at_end_of_buffer(self) -> bool:
# True once the read offset has reached or passed the end of the buffer,
# i.e. there are no unread bytes left. The read_* methods check this first
# and return their default_value instead of reading.
return self.offset >= len(self.buffer)

43
ml-agents-envs/mlagents_envs/tests/test_envs.py


import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs.exception import UnityEnvironmentException, UnityActionException
from mlagents_envs.mock_communicator import MockCommunicator

discrete_action=False, visual_inputs=0
)
env = UnityEnvironment(" ")
assert env.get_agent_groups() == ["RealFakeBrain"]
assert env.get_behavior_names() == ["RealFakeBrain"]
env.close()

discrete_action=False, visual_inputs=0
)
env = UnityEnvironment(" ")
spec = env.get_agent_group_spec("RealFakeBrain")
spec = env.get_behavior_spec("RealFakeBrain")
batched_step_result = env.get_step_result("RealFakeBrain")
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
assert isinstance(batched_step_result, BatchedStepResult)
assert len(spec.observation_shapes) == len(batched_step_result.obs)
n_agents = batched_step_result.n_agents()
for shape, obs in zip(spec.observation_shapes, batched_step_result.obs):
assert isinstance(decision_steps, DecisionSteps)
assert isinstance(terminal_steps, TerminalSteps)
assert len(spec.observation_shapes) == len(decision_steps.obs)
assert len(spec.observation_shapes) == len(terminal_steps.obs)
n_agents = len(decision_steps)
for shape, obs in zip(spec.observation_shapes, decision_steps.obs):
assert (n_agents,) + shape == obs.shape
n_agents = len(terminal_steps)
for shape, obs in zip(spec.observation_shapes, terminal_steps.obs):
assert (n_agents,) + shape == obs.shape

discrete_action=False, visual_inputs=0
)
env = UnityEnvironment(" ")
spec = env.get_agent_group_spec("RealFakeBrain")
spec = env.get_behavior_spec("RealFakeBrain")
batched_step_result = env.get_step_result("RealFakeBrain")
n_agents = batched_step_result.n_agents()
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions(
"RealFakeBrain", np.zeros((n_agents, spec.action_size), dtype=np.float32)
)

"RealFakeBrain",
np.zeros((n_agents - 1, spec.action_size), dtype=np.float32),
)
batched_step_result = env.get_step_result("RealFakeBrain")
n_agents = batched_step_result.n_agents()
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions(
"RealFakeBrain", -1 * np.ones((n_agents, spec.action_size), dtype=np.float32)
)

assert isinstance(batched_step_result, BatchedStepResult)
assert len(spec.observation_shapes) == len(batched_step_result.obs)
for shape, obs in zip(spec.observation_shapes, batched_step_result.obs):
assert isinstance(decision_steps, DecisionSteps)
assert isinstance(terminal_steps, TerminalSteps)
assert len(spec.observation_shapes) == len(decision_steps.obs)
assert len(spec.observation_shapes) == len(terminal_steps.obs)
for shape, obs in zip(spec.observation_shapes, decision_steps.obs):
assert not batched_step_result.done[0]
assert batched_step_result.done[2]
assert 0 in decision_steps
assert 2 in terminal_steps
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")

170
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


AgentInfoActionPairProto,
)
from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto
from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
from mlagents_envs.base_env import (
BehaviorSpec,
ActionType,
DecisionSteps,
TerminalSteps,
)
agent_group_spec_from_proto,
behavior_spec_from_proto,
batched_step_result_from_proto,
steps_from_proto,
)
from PIL import Image

ap = AgentInfoProto()
ap.reward = float("inf") if infinite_rewards else agent_index
ap.done = agent_index % 2 == 0
ap.max_step_reached = agent_index % 2 == 1
ap.max_step_reached = agent_index % 4 == 0
ap.id = agent_index
ap.action_mask.extend([True, False] * 5)
obs_proto_list = []

return obs_proto
def proto_from_batched_step_result(
batched_step_result: BatchedStepResult
def proto_from_steps(
decision_steps: DecisionSteps, terminal_steps: TerminalSteps
for agent_id in batched_step_result.agent_id:
agent_id_index = batched_step_result.agent_id_to_index[agent_id]
reward = batched_step_result.reward[agent_id_index]
done = batched_step_result.done[agent_id_index]
max_step_reached = batched_step_result.max_step[agent_id_index]
# Take care of the DecisionSteps first
for agent_id in decision_steps.agent_id:
agent_id_index = decision_steps.agent_id_to_index[agent_id]
reward = decision_steps.reward[agent_id_index]
done = False
max_step_reached = False
if batched_step_result.action_mask is not None:
if decision_steps.action_mask is not None:
for _branch in batched_step_result.action_mask:
for _branch in decision_steps.action_mask:
for all_observations_of_type in batched_step_result.obs:
for all_observations_of_type in decision_steps.obs:
observation = all_observations_of_type[agent_id_index]
if len(observation.shape) == 3:
observations.append(generate_uncompressed_proto_obs(observation))

compression_type=NONE,
)
)
agent_info_proto = AgentInfoProto(
reward=reward,
done=done,

observations=observations,
)
agent_info_protos.append(agent_info_proto)
# Take care of the TerminalSteps second
for agent_id in terminal_steps.agent_id:
agent_id_index = terminal_steps.agent_id_to_index[agent_id]
reward = terminal_steps.reward[agent_id_index]
done = True
max_step_reached = terminal_steps.max_step[agent_id_index]
final_observations: List[ObservationProto] = []
for all_observations_of_type in terminal_steps.obs:
observation = all_observations_of_type[agent_id_index]
if len(observation.shape) == 3:
final_observations.append(generate_uncompressed_proto_obs(observation))
else:
final_observations.append(
ObservationProto(
float_data=ObservationProto.FloatData(data=observation),
shape=[len(observation)],
compression_type=NONE,
)
)
agent_info_proto = AgentInfoProto(
reward=reward,
done=done,
id=agent_id,
max_step_reached=max_step_reached,
action_mask=None,
observations=final_observations,
)
agent_info_protos.append(agent_info_proto)
# The arguments here are the BatchedStepResult and actions for a single agent name
def proto_from_batched_step_result_and_action(
batched_step_result: BatchedStepResult, actions: np.ndarray
# The arguments here are the DecisionSteps, TerminalSteps and actions for a single agent name
def proto_from_steps_and_action(
decision_steps: DecisionSteps, terminal_steps: TerminalSteps, actions: np.ndarray
agent_info_protos = proto_from_batched_step_result(batched_step_result)
agent_info_protos = proto_from_steps(decision_steps, terminal_steps)
agent_action_protos = [
AgentActionProto(vector_actions=action) for action in actions
]

def test_batched_step_result_from_proto():
n_agents = 10
shapes = [(3,), (4,)]
group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 3)
spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
result = batched_step_result_from_proto(ap_list, group_spec)
assert list(result.reward) == list(range(n_agents))
assert list(result.agent_id) == list(range(n_agents))
for index in range(n_agents):
assert result.done[index] == (index % 2 == 0)
assert result.max_step[index] == (index % 2 == 1)
assert list(result.obs[0].shape) == [n_agents] + list(shapes[0])
assert list(result.obs[1].shape) == [n_agents] + list(shapes[1])
decision_steps, terminal_steps = steps_from_proto(ap_list, spec)
for agent_id in range(n_agents):
if agent_id in decision_steps:
# we set the reward equal to the agent id in generate_list_agent_proto
assert decision_steps[agent_id].reward == agent_id
elif agent_id in terminal_steps:
assert terminal_steps[agent_id].reward == agent_id
else:
raise Exception("Missing agent from the steps")
# We sort the AgentId since they are split between DecisionSteps and TerminalSteps
combined_agent_id = list(decision_steps.agent_id) + list(terminal_steps.agent_id)
combined_agent_id.sort()
assert combined_agent_id == list(range(n_agents))
for agent_id in range(n_agents):
assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
if agent_id in terminal_steps:
assert terminal_steps[agent_id].max_step == (agent_id % 4 == 0)
assert decision_steps.obs[0].shape[1] == shapes[0][0]
assert decision_steps.obs[1].shape[1] == shapes[1][0]
assert terminal_steps.obs[0].shape[1] == shapes[0][0]
assert terminal_steps.obs[1].shape[1] == shapes[1][0]
group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (7, 3))
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (7, 3))
result = batched_step_result_from_proto(ap_list, group_spec)
masks = result.action_mask
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask
assert masks[0].shape == (n_agents, 7)
assert masks[1].shape == (n_agents, 3)
assert masks[0].shape == (n_agents / 2, 7) # half agents are done
assert masks[1].shape == (n_agents / 2, 3) # half agents are done
assert masks[0][0, 0]
assert not masks[1][0, 0]
assert masks[1][0, 1]

n_agents = 10
shapes = [(3,), (4,)]
group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (10,))
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (10,))
result = batched_step_result_from_proto(ap_list, group_spec)
masks = result.action_mask
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask
assert masks[0].shape == (n_agents, 10)
assert masks[0].shape == (n_agents / 2, 10)
assert masks[0][0, 0]

group_spec = AgentGroupSpec(shapes, ActionType.DISCRETE, (2, 2, 6))
behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (2, 2, 6))
result = batched_step_result_from_proto(ap_list, group_spec)
masks = result.action_mask
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask
assert masks[0].shape == (n_agents, 2)
assert masks[1].shape == (n_agents, 2)
assert masks[2].shape == (n_agents, 6)
assert masks[0].shape == (n_agents / 2, 2)
assert masks[1].shape == (n_agents / 2, 2)
assert masks[2].shape == (n_agents / 2, 6)
assert masks[0][0, 0]

group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 10)
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 10)
result = batched_step_result_from_proto(ap_list, group_spec)
masks = result.action_mask
decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
masks = decision_steps.action_mask
def test_agent_group_spec_from_proto():
def test_agent_behavior_spec_from_proto():
group_spec = agent_group_spec_from_proto(bp, agent_proto)
assert group_spec.is_action_discrete()
assert not group_spec.is_action_continuous()
assert group_spec.observation_shapes == [(3,), (4,)]
assert group_spec.discrete_action_branches == (5, 4)
assert group_spec.action_size == 2
behavior_spec = behavior_spec_from_proto(bp, agent_proto)
assert behavior_spec.is_action_discrete()
assert not behavior_spec.is_action_continuous()
assert behavior_spec.observation_shapes == [(3,), (4,)]
assert behavior_spec.discrete_action_branches == (5, 4)
assert behavior_spec.action_size == 2
group_spec = agent_group_spec_from_proto(bp, agent_proto)
assert not group_spec.is_action_discrete()
assert group_spec.is_action_continuous()
assert group_spec.action_size == 6
behavior_spec = behavior_spec_from_proto(bp, agent_proto)
assert not behavior_spec.is_action_discrete()
assert behavior_spec.is_action_continuous()
assert behavior_spec.action_size == 6
group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 3)
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
batched_step_result_from_proto(ap_list, group_spec)
steps_from_proto(ap_list, behavior_spec)
group_spec = AgentGroupSpec(shapes, ActionType.CONTINUOUS, 3)
behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
batched_step_result_from_proto(ap_list, group_spec)
steps_from_proto(ap_list, behavior_spec)

20
ml-agents-envs/mlagents_envs/tests/test_side_channel.py


read_vals.append(msg_in.read_bool())
assert vals == read_vals
# Test reading with defaults
assert msg_in.read_bool() is False
assert msg_in.read_bool(default_value=True) is True
def test_message_int32():
val = 1337

read_val = msg_in.read_int32()
assert val == read_val
# Test reading with defaults
assert 0 == msg_in.read_int32()
assert val == msg_in.read_int32(default_value=val)
def test_message_float32():
val = 42.0

# These won't be exactly equal in general, since python floats are 64-bit.
assert val == read_val
# Test reading with defaults
assert 0.0 == msg_in.read_float32()
assert val == msg_in.read_float32(default_value=val)
def test_message_string():
val = "mlagents!"

read_val = msg_in.read_string()
assert val == read_val
# Test reading with defaults
assert "" == msg_in.read_string()
assert val == msg_in.read_string(default_value=val)
def test_message_float_list():
val = [1.0, 3.0, 9.0]

read_val = msg_in.read_float32_list()
# These won't be exactly equal in general, since python floats are 64-bit.
assert val == read_val
# Test reading with defaults
assert [] == msg_in.read_float32_list()
assert val == msg_in.read_float32_list(default_value=val)

180
ml-agents/mlagents/trainers/agent_processor.py


import sys
from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Any
from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Any, Union
from mlagents_envs.base_env import BatchedStepResult, StepResult
from mlagents_envs.base_env import (
DecisionSteps,
DecisionStep,
TerminalSteps,
TerminalStep,
)
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy

:param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
"""
self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_step_result: Dict[str, Tuple[StepResult, int]] = {}
self.last_step_result: Dict[str, Tuple[DecisionStep, int]] = {}
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}

def add_experiences(
self,
batched_step_result: BatchedStepResult,
decision_steps: DecisionSteps,
terminal_steps: TerminalSteps,
:param batched_step_result: current BatchedStepResult.
:param decision_steps: current DecisionSteps.
:param terminal_steps: current TerminalSteps.
:param previous_action: The outputs of the Policy's get_action method.
"""
take_action_outputs = previous_action.outputs

if global_id in self.last_step_result: # Don't store if agent just reset
self.last_take_action_outputs[global_id] = take_action_outputs
for _id in batched_step_result.agent_id: # Assume agent_id is 1-D
local_id = int(
_id
) # Needed for mypy to pass since ndarray has no content type
curr_agent_step = batched_step_result.get_agent_step_result(local_id)
# Iterate over all the terminal steps
for terminal_step in terminal_steps.values():
local_id = terminal_step.agent_id
stored_agent_step, idx = self.last_step_result.get(global_id, (None, None))
stored_take_action_outputs = self.last_take_action_outputs.get(
global_id, None
self._process_step(
terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
if stored_agent_step is not None and stored_take_action_outputs is not None:
# We know the step is from the same worker, so use the local agent id.
obs = stored_agent_step.obs
if not stored_agent_step.done:
if self.policy.use_recurrent:
memory = self.policy.retrieve_memories([global_id])[0, :]
else:
memory = None
done = curr_agent_step.done
max_step = curr_agent_step.max_step
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
action_pre = stored_take_action_outputs["pre_action"][idx]
else:
action_pre = None
action_probs = stored_take_action_outputs["log_probs"][idx]
action_mask = stored_agent_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[
0, :
]
experience = AgentExperience(
obs=obs,
reward=curr_agent_step.reward,
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
# Add the value outputs if needed
self.experience_buffers[global_id].append(experience)
self.episode_rewards[global_id] += curr_agent_step.reward
if (
curr_agent_step.done
or (
len(self.experience_buffers[global_id])
>= self.max_trajectory_length
)
) and len(self.experience_buffers[global_id]) > 0:
# Make next AgentExperience
next_obs = curr_agent_step.obs
trajectory = Trajectory(
steps=self.experience_buffers[global_id],
agent_id=global_id,
next_obs=next_obs,
behavior_id=self.behavior_id,
)
for traj_queue in self.trajectory_queues:
traj_queue.put(trajectory)
self.experience_buffers[global_id] = []
if curr_agent_step.done:
# Record episode length for agents which have had at least
# 1 step. Done after reset ignored.
self.stats_reporter.add_stat(
"Environment/Episode Length",
self.episode_steps.get(global_id, 0),
)
elif not curr_agent_step.done:
self.episode_steps[global_id] += 1
# Index is needed to grab from last_take_action_outputs
self.last_step_result[global_id] = (
curr_agent_step,
batched_step_result.agent_id_to_index[_id],
# Iterate over all the decision steps
for ongoing_step in decision_steps.values():
local_id = ongoing_step.agent_id
global_id = get_global_agent_id(worker_id, local_id)
self._process_step(
ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
# Delete all done agents, regardless of if they had a 0-length episode.
if curr_agent_step.done:
self._clean_agent_data(global_id)
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

self.policy.save_previous_action(
[_gid], take_action_outputs["action"]
)
def _process_step(
self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int
) -> None:
# Processes one agent's step (either a DecisionStep or a TerminalStep).
# :param step: the step received for this agent.
# :param global_id: key used for every per-agent dictionary touched here.
# :param index: position stored next to a non-terminal step in
#     last_step_result; it is read back on the next call (as idx) to pick this
#     agent's entries out of the stored take-action outputs.
# Summary of what the code below does:
#   1. Fetch the previously stored step and action outputs for global_id.
#   2. For a non-terminal step, remember (step, index) for the next call.
#   3. If both stored values exist, build an AgentExperience from the stored
#      step's obs/action_mask, the stored action outputs and this step's
#      reward, then append it to the agent's experience buffer.
#   4. Emit a Trajectory to every queue when the buffer reaches
#      max_trajectory_length or the step is terminal, then reset the buffer.
#   5. On a terminal step, report the episode length and clean the agent data.
# NOTE(review): whether the final `if terminated:` block runs only when a
# stored step exists depends on its nesting level -- confirm against the file.
terminated = isinstance(step, TerminalStep)
stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
if not terminated:
# Index is needed to grab from last_take_action_outputs
self.last_step_result[global_id] = (step, index)
# This state is the consequence of a past action
if stored_decision_step is not None and stored_take_action_outputs is not None:
obs = stored_decision_step.obs
if self.policy.use_recurrent:
memory = self.policy.retrieve_memories([global_id])[0, :]
else:
memory = None
done = terminated # True only when this step is a TerminalStep
# max_step is only read from the step when it is a TerminalStep; otherwise False.
max_step = step.max_step if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
action_pre = stored_take_action_outputs["pre_action"][idx]
else:
action_pre = None
action_probs = stored_take_action_outputs["log_probs"][idx]
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
# The experience pairs the stored observation/action with the reward and
# done/max_step flags taken from the current step.
experience = AgentExperience(
obs=obs,
reward=step.reward,
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
# Append the experience to this agent's buffer and accumulate its reward
self.experience_buffers[global_id].append(experience)
self.episode_rewards[global_id] += step.reward
if not terminated:
self.episode_steps[global_id] += 1
# if the trajectory is too long, we truncate it
if (
len(self.experience_buffers[global_id]) >= self.max_trajectory_length
or terminated
):
# Make next AgentExperience
next_obs = step.obs
trajectory = Trajectory(
steps=self.experience_buffers[global_id],
agent_id=global_id,
next_obs=next_obs,
behavior_id=self.behavior_id,
)
# Every registered trajectory queue receives the same Trajectory object.
for traj_queue in self.trajectory_queues:
traj_queue.put(trajectory)
self.experience_buffers[global_id] = []
if terminated:
# Record episode length.
self.stats_reporter.add_stat(
"Environment/Episode Length", self.episode_steps.get(global_id, 0)
)
self._clean_agent_data(global_id)
def _clean_agent_data(self, global_id: str) -> None:
"""

52
ml-agents/mlagents/trainers/behavior_id_utils.py


from typing import Dict, NamedTuple
from typing import NamedTuple
from urllib.parse import urlparse, parse_qs
name_behavior_id: str
"""
BehaviorIdentifiers is a named tuple of the identifiers that uniquely distinguish
an agent encountered in the trainer_controller. The named tuple consists of the
fully qualified behavior name, the name of the brain name (corresponds to trainer
in the trainer controller) and the team id. In the future, this can be extended
to support further identifiers.
"""
behavior_id: str
behavior_ids: Dict[str, int]
team_id: int
Parses a name_behavior_id of the form name?team=0&param1=i&...
Parses a name_behavior_id of the form name?team=0
This allows you to access the brain name and distinguishing identifiers
without parsing more than once.
This allows you to access the brain name and team id of an agent
ids: Dict[str, int] = {}
if "?" in name_behavior_id:
name, identifiers = name_behavior_id.rsplit("?", 1)
if "&" in identifiers:
list_of_identifiers = identifiers.split("&")
else:
list_of_identifiers = [identifiers]
for identifier in list_of_identifiers:
key, value = identifier.split("=")
ids[key] = int(value)
else:
name = name_behavior_id
parsed = urlparse(name_behavior_id)
name = parsed.path
ids = parse_qs(parsed.query)
team_id: int = 0
if "team" in ids:
team_id = int(ids["team"][0])
name_behavior_id=name_behavior_id, brain_name=name, behavior_ids=ids
behavior_id=name_behavior_id, brain_name=name, team_id=team_id
def create_name_behavior_id(name: str, team_id: int) -> str:
    """
    Builds the fully qualified behavior name for a brain on a given team.
    The result has the form <name>?team=<team_id>.
    :param name: brain name
    :param team_id: team ID
    :return: name_behavior_id
    """
    # Assemble the pieces in one pass: brain name, the team marker, the id.
    pieces = (name, "?team=", str(team_id))
    return "".join(pieces)

16
ml-agents/mlagents/trainers/brain_conversion_utils.py


from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents_envs.base_env import AgentGroupSpec
from mlagents_envs.base_env import BehaviorSpec
def group_spec_to_brain_parameters(
name: str, group_spec: AgentGroupSpec
def behavior_spec_to_brain_parameters(
name: str, behavior_spec: BehaviorSpec
[shape[0] for shape in group_spec.observation_shapes if len(shape) == 1]
[shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1]
vis_sizes = [shape for shape in group_spec.observation_shapes if len(shape) == 3]
vis_sizes = [shape for shape in behavior_spec.observation_shapes if len(shape) == 3]
if group_spec.is_action_discrete():
a_size += list(group_spec.discrete_action_branches)
if behavior_spec.is_action_discrete():
a_size += list(behavior_spec.discrete_action_branches)
a_size += [group_spec.action_size]
a_size += [behavior_spec.action_size]
vector_action_space_type = 1
return BrainParameters(
name, int(vec_size), cam_res, a_size, [], vector_action_space_type

73
ml-agents/mlagents/trainers/demo_loader.py


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
from mlagents_envs.rpc_utils import (
agent_group_spec_from_proto,
batched_step_result_from_proto,
)
from mlagents_envs.base_env import AgentGroupSpec
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,

from google.protobuf.internal.encoder import _EncodeVarint # type: ignore
INITIAL_POS = 33
SUPPORTED_DEMONSTRATION_VERSIONS = frozenset([0, 1])
group_spec: AgentGroupSpec,
behavior_spec: BehaviorSpec,
sequence_length: int,
) -> AgentBuffer:
# Create and populate buffer using experiences

if idx > len(pair_infos) - 2:
break
next_pair_info = pair_infos[idx + 1]
current_step_info = batched_step_result_from_proto(
[current_pair_info.agent_info], group_spec
current_decision_step, current_terminal_step = steps_from_proto(
[current_pair_info.agent_info], behavior_spec
next_step_info = batched_step_result_from_proto(
[next_pair_info.agent_info], group_spec
next_decision_step, next_terminal_step = steps_from_proto(
[next_pair_info.agent_info], behavior_spec
)
previous_action = (
np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0

pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
)
curr_agent_id = current_step_info.agent_id[0]
current_agent_step_info = current_step_info.get_agent_step_result(curr_agent_id)
next_agent_id = next_step_info.agent_id[0]
next_agent_step_info = next_step_info.get_agent_step_result(next_agent_id)
next_done = len(next_terminal_step) == 1
next_reward = 0
if len(next_terminal_step) == 1:
next_reward = next_terminal_step.reward[0]
else:
next_reward = next_decision_step.reward[0]
current_obs = None
if len(current_terminal_step) == 1:
current_obs = list(current_terminal_step.values())[0].obs
else:
current_obs = list(current_decision_step.values())[0].obs
demo_raw_buffer["done"].append(next_agent_step_info.done)
demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
split_obs = SplitObservations.from_observations(current_agent_step_info.obs)
demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
split_obs = SplitObservations.from_observations(current_obs)
if next_step_info.done:
if next_done:
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
)

:param sequence_length: Length of trajectories to fill buffer.
:return:
"""
group_spec, info_action_pair, _ = load_demonstration(file_path)
demo_buffer = make_demo_buffer(info_action_pair, group_spec, sequence_length)
brain_params = group_spec_to_brain_parameters("DemoBrain", group_spec)
behavior_spec, info_action_pair, _ = load_demonstration(file_path)
demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length)
brain_params = behavior_spec_to_brain_parameters("DemoBrain", behavior_spec)
return brain_params, demo_buffer

)
INITIAL_POS = 33
@timed
def load_demonstration(
file_path: str

# First 32 bytes of file dedicated to meta-data.
file_paths = get_demo_files(file_path)
group_spec = None
behavior_spec = None
brain_param_proto = None
info_action_pairs = []
total_expected = 0

if obs_decoded == 0:
meta_data_proto = DemonstrationMetaProto()
meta_data_proto.ParseFromString(data[pos : pos + next_pos])
if (
meta_data_proto.api_version
not in SUPPORTED_DEMONSTRATION_VERSIONS
):
raise RuntimeError(
f"Can't load Demonstration data from an unsupported version ({meta_data_proto.api_version})"
)
total_expected += meta_data_proto.number_steps
pos = INITIAL_POS
if obs_decoded == 1:

if obs_decoded > 1:
agent_info_action = AgentInfoActionPairProto()
agent_info_action.ParseFromString(data[pos : pos + next_pos])
if group_spec is None:
group_spec = agent_group_spec_from_proto(
if behavior_spec is None:
behavior_spec = behavior_spec_from_proto(
brain_param_proto, agent_info_action.agent_info
)
info_action_pairs.append(agent_info_action)

obs_decoded += 1
if not group_spec:
if not behavior_spec:
return group_spec, info_action_pairs, total_expected
return behavior_spec, info_action_pairs, total_expected
def write_delimited(f, message):

37
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec, AgentGroup
from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,
BehaviorSpec,
BehaviorName,
)
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue

AllStepResult = Dict[AgentGroup, BatchedStepResult]
AllGroupSpec = Dict[AgentGroup, AgentGroupSpec]
AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]]
AllGroupSpec = Dict[BehaviorName, BehaviorSpec]
logger = get_logger(__name__)

worker_id: int
brain_name_to_action_info: Dict[AgentGroup, ActionInfo]
brain_name_to_action_info: Dict[BehaviorName, ActionInfo]
def name_behavior_ids(self) -> Iterable[AgentGroup]:
def name_behavior_ids(self) -> Iterable[BehaviorName]:
return self.current_all_step_result.keys()
@staticmethod

class EnvManager(ABC):
def __init__(self):
self.policies: Dict[AgentGroup, TFPolicy] = {}
self.agent_managers: Dict[AgentGroup, AgentManager] = {}
self.policies: Dict[BehaviorName, TFPolicy] = {}
self.agent_managers: Dict[BehaviorName, AgentManager] = {}
def set_policy(self, brain_name: AgentGroup, policy: TFPolicy) -> None:
def set_policy(self, brain_name: BehaviorName, policy: TFPolicy) -> None:
def set_agent_manager(self, brain_name: AgentGroup, manager: AgentManager) -> None:
def set_agent_manager(
self, brain_name: BehaviorName, manager: AgentManager
) -> None:
self.agent_managers[brain_name] = manager
@abstractmethod

@property
@abstractmethod
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
def get_properties(self) -> Dict[AgentGroup, float]:
def get_properties(self) -> Dict[BehaviorName, float]:
pass
@abstractmethod

)
)
continue
decision_steps, terminal_steps = step_info.current_all_step_result[
name_behavior_id
]
step_info.current_all_step_result[name_behavior_id],
decision_steps,
terminal_steps,
step_info.worker_id,
step_info.brain_name_to_action_info.get(
name_behavior_id, ActionInfo.empty()

453
ml-agents/mlagents/trainers/ghost/trainer.py


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (Ghost Trainer)
from typing import Deque, Dict, List, Any, cast
from typing import Deque, Dict, List, cast
import numpy as np

from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.stats import StatsPropertyType
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.behavior_id_utils import (
BehaviorIdentifiers,
create_name_behavior_id,
)
logger = get_logger(__name__)

"""
The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism.
In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is
"ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps
a standard RL trainer which trains the learning team and ensures that only the trajectories collected
by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used
as the fixed policies when the team is not learning. The GhostTrainer is 1:1 with brain_names as the other
trainers, and is responsible for one or more teams. Note, a GhostTrainer can have only one team in
asymmetric games where there is only one team with a particular behavior i.e. Hide and Seek.
The GhostController manages high level coordination between multiple ghost trainers. The learning team id
is cycled throughout a training run.
"""
self, trainer, brain_name, reward_buff_cap, trainer_parameters, training, run_id
self,
trainer,
brain_name,
controller,
reward_buff_cap,
trainer_parameters,
training,
run_id,
Responsible for collecting experiences and training trainer model via self_play.
Creates a GhostTrainer.
:param controller: GhostController that coordinates all ghost trainers and calculates ELO
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param training: Whether the trainer is set for training.

)
self.trainer = trainer
self.controller = controller
self.internal_policy_queues: List[AgentManagerQueue[Policy]] = []
self.internal_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.ignored_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.learning_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
self._team_to_name_to_policy_queue: Dict[
int, Dict[str, AgentManagerQueue[Policy]]
] = {}
self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}
# assign ghost's stats collection to wrapped trainer's
self._stats_reporter = self.trainer.stats_reporter

self_play_parameters = trainer_parameters["self_play"]
self.window = self_play_parameters.get("window", 10)
self.play_against_current_self_ratio = self_play_parameters.get(
"play_against_current_self_ratio", 0.5
self.play_against_latest_model_ratio = self_play_parameters.get(
"play_against_latest_model_ratio", 0.5
if (
self.play_against_latest_model_ratio > 1.0
or self.play_against_latest_model_ratio < 0.0
):
logger.warning(
"The play_against_latest_model_ratio is not between 0 and 1."
)
self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \
opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \
asymmetric game.".format(
self.brain_name
)
)
# Counts the number of steps taken by the ghost policies. Snapshot swapping
# depends on this counter, whereas snapshot saving and team switching depend
# on the wrapped trainer's step count. This ensures that all teams train for
# the same number of trainer steps.
self.ghost_step: int = 0
# A list of dicts from brain name to a single snapshot for this trainer's policies
self.policy_snapshots: List[Dict[str, List[float]]] = []
# A dict from brain name to the current snapshot of this trainer's policies
self.current_policy_snapshot: Dict[str, List[float]] = {}
self.snapshot_counter: int = 0
self.policy_snapshots: List[Any] = []
self.snapshot_counter: int = 0
self.learning_behavior_name: str = None
self.current_policy_snapshot = None
self.last_save = 0
self.last_swap = 0
# wrapped_training_team and learning team need to be separate
# in the situation where new agents are created or destroyed
# after learning team switches. These agents need to be added
# to trainers properly.
self._learning_team: int = None
self.wrapped_trainer_team: int = None
self.last_save: int = 0
self.last_swap: int = 0
self.last_team_change: int = 0
self.current_elo: float = self.initial_elo
self.policy_elos: List[float] = [self.initial_elo] * (
self.window + 1
) # for learning policy

def get_step(self) -> int:
"""
Returns the number of steps the trainer has performed
:return: the step count of the trainer
Returns the number of steps the wrapped trainer has performed
:return: the step count of the wrapped trainer
"""
return self.trainer.get_step

"""
return self.trainer.reward_buffer
@property
def current_elo(self) -> float:
    """
    ELO of the current policy.
    The current policy's rating is stored in the last slot of policy_elos.
    :return: ELO of current policy
    """
    elos = self.policy_elos
    return elos[-1]
def change_current_elo(self, change: float) -> None:
    """
    Adds the given amount to the ELO of the current policy, which is
    always the last entry of policy_elos.
    :param change: Amount to change current elo by
    """
    elos = self.policy_elos
    elos[-1] += change
def get_opponent_elo(self) -> float:
    """
    Looks up the ELO stored at index current_opponent of policy_elos.
    :return: ELO of current opponent policy
    """
    elos = self.policy_elos
    return elos[self.current_opponent]
def change_opponent_elo(self, change: float) -> None:
    """
    Subtracts the given amount from the ELO of the current opponent policy
    (the entry of policy_elos at index current_opponent).
    :param change: Amount to change current opponent elo by
    """
    elos = self.policy_elos
    elos[self.current_opponent] -= change
if trajectory.done_reached and not trajectory.max_step_reached:
# Assumption is that final reward is 1/.5/0 for win/draw/loss
"""
Determines the final result of an episode and asks the GhostController
to calculate the ELO change. The GhostController changes the ELO
of the opponent policy since this may be in a different GhostTrainer
i.e. in asymmetric games. We assume the last reward determines the winner.
:param trajectory: Trajectory.
"""
if trajectory.done_reached:
# Assumption is that final reward is >0/0/<0 for win/draw/loss
final_reward = trajectory.steps[-1].reward
result = 0.5
if final_reward > 0:

change = compute_elo_rating_changes(
self.current_elo, self.policy_elos[self.current_opponent], result
change = self.controller.compute_elo_rating_changes(
self.current_elo, result
self.current_elo += change
self.policy_elos[self.current_opponent] -= change
opponents = np.array(self.policy_elos, dtype=np.float32)
self.change_current_elo(change)
self._stats_reporter.add_stat(
"Self-play/Mean Opponent ELO", opponents.mean()
)
self._stats_reporter.add_stat("Self-play/Std Opponent ELO", opponents.std())
for traj_queue, internal_traj_queue in zip(
self.trajectory_queues, self.internal_trajectory_queues
):
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
for trajectory_queue in self.trajectory_queues:
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self._learning_team:
# With a future multiagent trainer, this will be indexed by 'role'
internal_trajectory_queue = self._internal_trajectory_queues[
parsed_behavior_id.brain_name
]
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# adds to wrapped trainers queue
internal_trajectory_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
else:
# Dump trajectories from non-learning policy
try:
for _ in range(trajectory_queue.maxlen):
t = trajectory_queue.get_nowait()
# count ghost steps
self.ghost_step += len(t.steps)
except AgentManagerQueue.Empty:
pass
if self.get_step - self.last_team_change > self.steps_to_train_team:
self.controller.change_training_team(self.get_step)
self.last_team_change = self.get_step
for internal_q in self.internal_policy_queues:
# Get policies that correspond to the policy queue in question
next_learning_team = self.controller.get_learning_team
# CASE 1: Current learning team is managed by this GhostTrainer.
# If the learning team changes, the following loop over queues will push the
# new policy into the policy queue for the new learning agent if
# that policy is managed by this GhostTrainer. Otherwise, it will save the current snapshot.
# CASE 2: Current learning team is managed by a different GhostTrainer.
# If the learning team changes to a team managed by this GhostTrainer, this loop
# will push the current_snapshot into the correct queue. Otherwise,
# it will continue skipping and swap_snapshot will continue to handle
# pushing fixed snapshots
# CASE 3: No team change. The if statement just continues to push the policy
# into the correct queue (or not if not learning team).
for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
policy = cast(TFPolicy, internal_q.get_nowait())
self.current_policy_snapshot = policy.get_weights()
self.learning_policy_queues[internal_q.behavior_id].put(policy)
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
self.current_policy_snapshot[brain_name] = policy.get_weights()
if next_learning_team in self._team_to_name_to_policy_queue:
name_to_policy_queue = self._team_to_name_to_policy_queue[
next_learning_team
]
if brain_name in name_to_policy_queue:
behavior_id = create_name_behavior_id(
brain_name, next_learning_team
)
policy = self.get_policy(behavior_id)
policy.load_weights(self.current_policy_snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
# Note save and swap should be on different step counters.
# We don't want to save unless the policy is learning.
self._save_snapshot(self.trainer.policy)
self._save_snapshot()
if self.get_step - self.last_swap > self.steps_between_swap:
if (
self._learning_team != next_learning_team
or self.ghost_step - self.last_swap > self.steps_between_swap
):
self._learning_team = next_learning_team
self.last_swap = self.get_step
# Dump trajectories from non-learning policy
for traj_queue in self.ignored_trajectory_queues:
try:
for _ in range(traj_queue.maxlen):
traj_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
self.last_swap = self.ghost_step
"""
Forwarding call to wrapped trainer's end_episode
"""
self.trainer.save_model(name_behavior_id)
"""
Forwarding call to wrapped trainer's save_model
"""
parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
brain_name = parsed_behavior_id.brain_name
self.trainer.save_model(brain_name)
self.trainer.export_model(name_behavior_id)
"""
Forwarding call to wrapped trainer's export_model.
"""
parsed_behavior_id = self._name_to_parsed_behavior_id[name_behavior_id]
brain_name = parsed_behavior_id.brain_name
self.trainer.export_model(brain_name)
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
"""
Creates policy with the wrapped trainer's create_policy function
The first policy encountered sets the wrapped
trainer team. This is to ensure that all agents from the same multi-agent
team are grouped. All policies associated with this team are added to the
wrapped trainer to be trained.
"""
policy = self.trainer.create_policy(parsed_behavior_id, brain_parameters)
policy.create_tf_graph()
policy.init_load_weights()
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
return self.trainer.create_policy(brain_parameters)
# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
internal_trainer_policy = self.trainer.create_policy(
parsed_behavior_id, brain_parameters
)
internal_trainer_policy.create_tf_graph()
internal_trainer_policy.init_load_weights()
self.current_policy_snapshot[
parsed_behavior_id.brain_name
] = internal_trainer_policy.get_weights()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
policy.load_weights(internal_trainer_policy.get_weights())
self._save_snapshot() # Need to save after trainer initializes policy
self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy)
self._learning_team = self.controller.get_learning_team
self.wrapped_trainer_team = team_id
return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
Adds policy to trainer. For the first policy added, add a trainer
to the policy and set the learning behavior name to name_behavior_id.
:param name_behavior_id: Behavior ID that the policy should belong to.
Adds policy to GhostTrainer.
:param parsed_behavior_id: Behavior ID that the policy should belong to.
name_behavior_id = parsed_behavior_id.behavior_id
self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
policy.create_tf_graph()
# First policy encountered
if not self.learning_behavior_name:
weights = policy.get_weights()
self.current_policy_snapshot = weights
self.trainer.add_policy(name_behavior_id, policy)
self._save_snapshot(policy) # Need to save after trainer initializes policy
self.learning_behavior_name = name_behavior_id
behavior_id_parsed = BehaviorIdentifiers.from_name_behavior_id(
self.learning_behavior_name
)
team_id = behavior_id_parsed.behavior_ids["team"]
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM, team_id)
else:
# for saving/swapping snapshots
policy.init_load_weights()
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name
:return: Policy associated with name_behavior_id
"""
def _save_snapshot(self, policy: TFPolicy) -> None:
weights = policy.get_weights()
try:
self.policy_snapshots[self.snapshot_counter] = weights
except IndexError:
self.policy_snapshots.append(weights)
def _save_snapshot(self) -> None:
"""
Saves a snapshot of the current weights of the policy and maintains the policy_snapshots
according to the window size
"""
for brain_name in self.current_policy_snapshot:
current_snapshot_for_brain_name = self.current_policy_snapshot[brain_name]
try:
self.policy_snapshots[self.snapshot_counter][
brain_name
] = current_snapshot_for_brain_name
except IndexError:
self.policy_snapshots.append(
{brain_name: current_snapshot_for_brain_name}
)
for q in self.policy_queues:
name_behavior_id = q.behavior_id
# here is the place for a sampling protocol
if name_behavior_id == self.learning_behavior_name:
"""
Swaps the appropriate weight to the policy and pushes it to respective policy queues
"""
for team_id in self._team_to_name_to_policy_queue:
if team_id == self._learning_team:
elif np.random.uniform() < (1 - self.play_against_current_self_ratio):
elif np.random.uniform() < (1 - self.play_against_latest_model_ratio):
self.policy_elos[-1] = self.current_elo
logger.debug(
"Step {}: Swapping snapshot {} to id {} with {} learning".format(
self.get_step, x, name_behavior_id, self.learning_behavior_name
name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
for brain_name in self._team_to_name_to_policy_queue[team_id]:
behavior_id = create_name_behavior_id(brain_name, team_id)
policy = self.get_policy(behavior_id)
policy.load_weights(snapshot[brain_name])
name_to_policy_queue[brain_name].put(policy)
logger.debug(
"Step {}: Swapping snapshot {} to id {} with team {} learning".format(
self.ghost_step, x, behavior_id, self._learning_team
)
)
policy = self.get_policy(name_behavior_id)
policy.load_weights(snapshot)
q.put(policy)
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer
makes a policy update. Creates an internal policy queue for the wrapped
trainer to push to. The GhostTrainer pushes all policies to the env.
if policy_queue.behavior_id == self.learning_behavior_name:
parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
try:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][
parsed_behavior_id.brain_name
] = policy_queue
except KeyError:
self._team_to_name_to_policy_queue[parsed_behavior_id.team_id] = {
parsed_behavior_id.brain_name: policy_queue
}
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
policy_queue.behavior_id
parsed_behavior_id.brain_name
self.internal_policy_queues.append(internal_policy_queue)
self.learning_policy_queues[policy_queue.behavior_id] = policy_queue
self._internal_policy_queues[
parsed_behavior_id.brain_name
] = internal_policy_queue
self.trainer.publish_policy_queue(internal_policy_queue)
def subscribe_trajectory_queue(

Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from.
Adds a trajectory queue for every member of the team to the list of queues for the trainer
to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from
the learning team. The wrapped trainer subscribes to this queue.
if trajectory_queue.behavior_id == self.learning_behavior_name:
super().subscribe_trajectory_queue(trajectory_queue)
super().subscribe_trajectory_queue(trajectory_queue)
parsed_behavior_id = self._name_to_parsed_behavior_id[
trajectory_queue.behavior_id
]
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
] = AgentManagerQueue(trajectory_queue.behavior_id)
] = AgentManagerQueue(parsed_behavior_id.brain_name)
self.internal_trajectory_queues.append(internal_trajectory_queue)
self._internal_trajectory_queues[
parsed_behavior_id.brain_name
] = internal_trajectory_queue
else:
self.ignored_trajectory_queues.append(trajectory_queue)
# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
# ELO calculation
def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
    """
    Computes the ELO rating change for the first player after one game.
    :param rating1: current rating of the first player
    :param rating2: current rating of the second player (the opponent)
    :param result: game outcome for the first player (the caller uses 0.5 for a draw)
    :return: result minus the first player's expected score
    """
    # Map each rating onto the exponential scale used by the ELO formula.
    strength1 = 10 ** (rating1 / 400)
    strength2 = 10 ** (rating2 / 400)
    # Expected score of the first player: its share of the combined strength.
    expected1 = strength1 / (strength1 + strength2)
    return result - expected1

20
ml-agents/mlagents/trainers/learn.py


default=False,
dest="force",
action="store_true",
help="Force-overwrite existing models and summaries for a run-id that has been used "
help="Force-overwrite existing models and summaries for a run ID that has been used "
help="The directory name for model and summary statistics",
help="The run identifier for model and summary statistics.",
)
argparser.add_argument(
"--initialize-from",
metavar="RUN_ID",
default=None,
help="Specify a previously saved run ID from which to initialize the model from. "
"This can be used, for instance, to fine-tune an existing model on a new environment. ",
)
argparser.add_argument(
"--save-freq", default=50000, type=int, help="Frequency at which to save model"

dest="inference",
action="store_true",
help="Run in Python inference mode (don't train). Use with --resume to load a model trained with an "
"existing run-id.",
"existing run ID.",
)
argparser.add_argument(
"--base-port",

seed: int = parser.get_default("seed")
env_path: Optional[str] = parser.get_default("env_path")
run_id: str = parser.get_default("run_id")
initialize_from: str = parser.get_default("initialize_from")
load_model: bool = parser.get_default("load_model")
resume: bool = parser.get_default("resume")
force: bool = parser.get_default("force")

"""
with hierarchical_timer("run_training.setup"):
model_path = f"./models/{options.run_id}"
maybe_init_path = (
f"./models/{options.initialize_from}" if options.initialize_from else None
)
summaries_dir = "./summaries"
port = options.base_port

],
)
handle_existing_directories(
model_path, summaries_dir, options.resume, options.force
model_path, summaries_dir, options.resume, options.force, maybe_init_path
)
tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
gauge_write = GaugeWriter()

not options.inference,
options.resume,
run_seed,
maybe_init_path,
maybe_meta_curriculum,
options.multi_gpu,
)

10
ml-agents/mlagents/trainers/policy/nn_policy.py


from typing import Any, Dict, Optional, List
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils

@timed
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
:param batched_step_result: BatchedStepResult object containing inputs.
:param decision_requests: DecisionSteps object containing inputs.
self.batch_size_ph: batched_step_result.n_agents(),
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:

)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out

4
ml-agents/mlagents/trainers/policy/policy.py


from abc import ABC, abstractmethod
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.action_info import ActionInfo

self, batched_step_result: BatchedStepResult, worker_id: int = 0
self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
pass

85
ml-agents/mlagents/trainers/policy/tf_policy.py


from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils

if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.model_path = trainer_parameters["model_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()
self.sess = tf.Session(

init = tf.global_variables_initializer()
self.sess.run(init)
def _load_graph(self):
def _load_graph(self, model_path: str, reset_global_steps: bool = False) -> None:
logger.info("Loading Model for brain {}".format(self.brain.brain_name))
ckpt = tf.train.get_checkpoint_state(self.model_path)
logger.info(
"Loading model for brain {} from {}.".format(
self.brain.brain_name, model_path
)
)
ckpt = tf.train.get_checkpoint_state(model_path)
"--run-id. and that the previous run you are resuming from had the same "
"behavior names.".format(self.model_path)
"--run-id and that the previous run you are loading from had the same "
"behavior names.".format(model_path)
)
try:
self.saver.restore(self.sess, ckpt.model_checkpoint_path)
except tf.errors.NotFoundError:
raise UnityPolicyException(
"The model {0} was found but could not be loaded. Make "
"sure the model is from the same version of ML-Agents, has the same behavior parameters, "
"and is using the same trainer configuration as the current run.".format(
model_path
)
)
if reset_global_steps:
logger.info(
"Starting training from step 0 and saving to {}.".format(
self.model_path
)
)
else:
logger.info(
"Resuming training from step {}.".format(self.get_current_step())
self.saver.restore(self.sess, ckpt.model_checkpoint_path)
if self.load:
self._load_graph()
# If there is an initialize path, load from that. Else, load from the set model path.
# If load is set to True, don't reset steps to 0. Else, do. This allows a user to,
# e.g., resume from an initialize path.
reset_steps = not self.load
if self.initialize_path is not None:
self._load_graph(self.initialize_path, reset_global_steps=reset_steps)
elif self.load:
self._load_graph(self.model_path, reset_global_steps=reset_steps)
else:
self._initialize_graph()

self.assign_ops.append(tf.assign(var, assign_ph))
def load_weights(self, values):
if len(self.assign_ops) == 0:
logger.warning(
"Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
)
with self.graph.as_default():
feed_dict = {}
for assign_ph, value in zip(self.assign_phs, values):

def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
:param batched_step_result: BatchedStepResult input to network.
:param decision_requests: DecisionSteps input to network.
self, batched_step_result: BatchedStepResult, worker_id: int = 0
self, decision_requests: DecisionSteps, worker_id: int = 0
:param batched_step_result: A dictionary of brain names and BatchedStepResult from environment.
:param decision_requests: A dictionary of brain names and DecisionSteps from environment.
the BatchedStepResult came from. Used to construct a globally unique id for each agent.
the DecisionSteps came from. Used to construct a globally unique id for each agent.
if batched_step_result.n_agents() == 0:
if len(decision_requests) == 0:
for agent_id in batched_step_result.agent_id
for agent_id in decision_requests.agent_id
batched_step_result, global_agent_ids
decision_requests, global_agent_ids
)
self.save_memories(global_agent_ids, run_out.get("memory_out"))

outputs=run_out,
agent_ids=batched_step_result.agent_id,
agent_ids=decision_requests.agent_id,
)
def update(self, mini_batch, num_sequences):

feed_dict[self.vector_in] = vec_vis_obs.vector_observations
if not self.use_continuous_act:
mask = np.ones(
(
batched_step_result.n_agents(),
np.sum(self.brain.vector_action_space_size),
),
(len(batched_step_result), np.sum(self.brain.vector_action_space_size)),
dtype=np.float32,
)
if batched_step_result.action_mask is not None:

"""
step = self.sess.run(self.global_step)
return step
def _set_step(self, step: int) -> int:
    """
    Sets current model step to step without creating additional ops.

    The target is reached by reading the current step and applying the
    (possibly negative) difference through ``increment_step``.

    :param step: Step to set the current model step to.
    :return: The value returned by ``increment_step`` for the applied
        difference (described upstream as the step the model was set to).
    """
    current_step = self.get_current_step()
    # Increment a positive or negative number of steps.
    return self.increment_step(step - current_step)
def increment_step(self, n_steps):
"""

11
ml-agents/mlagents/trainers/ppo/trainer.py


from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = get_logger(__name__)

self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
"""
Creates a PPO policy to trainers list of policies.
:param brain_parameters: specifications for policy construction

return policy
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
:param name_behavior_id: Behavior ID that the policy should belong to.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with parsed_behavior_id.
"""
if self.policy:

9
ml-agents/mlagents/trainers/sac/trainer.py


from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = get_logger(__name__)

self.update_sac_policy()
self.update_reward_signals()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
policy = NNPolicy(
self.seed,
brain_parameters,

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.

26
ml-agents/mlagents/trainers/simple_env_manager.py


from typing import Dict, List
from mlagents_envs.base_env import BaseEnv, AgentGroup
from mlagents_envs.base_env import BaseEnv, BehaviorName
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
class SimpleEnvManager(EnvManager):

return [step_info]
def _reset_env(
self, config: Dict[AgentGroup, float] = None
self, config: Dict[BehaviorName, float] = None
) -> List[EnvironmentStep]: # type: ignore
if config is not None:
for k, v in config.items():

return [self.previous_step]
@property
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
for brain_name in self.env.get_agent_groups():
result[brain_name] = group_spec_to_brain_parameters(
brain_name, self.env.get_agent_group_spec(brain_name)
for brain_name in self.env.get_behavior_names():
result[brain_name] = behavior_spec_to_brain_parameters(
brain_name, self.env.get_behavior_spec(brain_name)
def get_properties(self) -> Dict[AgentGroup, float]:
def get_properties(self) -> Dict[BehaviorName, float]:
return self.shared_float_properties.get_property_dict_copy()
def close(self):

def _take_step(self, last_step: EnvironmentStep) -> Dict[AgentGroup, ActionInfo]:
def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]:
for brain_name, step_info in last_step.current_all_step_result.items():
for brain_name, step_tuple in last_step.current_all_step_result.items():
step_info,
step_tuple[0],
0, # As there is only one worker, we assign the worker_id to 0.
)
return all_action_info

for brain_name in self.env.get_agent_groups():
all_step_result[brain_name] = self.env.get_step_result(brain_name)
for brain_name in self.env.get_behavior_names():
all_step_result[brain_name] = self.env.get_steps(brain_name)
return all_step_result

18
ml-agents/mlagents/trainers/stats.py


class StatsPropertyType(Enum):
    """
    Tags naming which property a value belongs to when it is handed to a
    stats writer.
    """

    HYPERPARAMETERS = "hyperparameters"
    # The visible writer handler asserts the accompanying value is a bool.
    SELF_PLAY = "selfplay"
    # The visible writer handler asserts the accompanying value is an int;
    # it is printed as the team in the self-play ELO log line.
    SELF_PLAY_TEAM = "selfplayteam"
class StatsWriter(abc.ABC):

)
if self.self_play and "Self-play/ELO" in values:
elo_stats = values["Self-play/ELO"]
mean_opponent_elo = values["Self-play/Mean Opponent ELO"]
std_opponent_elo = values["Self-play/Std Opponent ELO"]
logger.info(
"{} Team {}: ELO: {:0.3f}. "
"Mean Opponent ELO: {:0.3f}. "
"Std Opponent ELO: {:0.3f}. ".format(
category,
self.self_play_team,
elo_stats.mean,
mean_opponent_elo.mean,
std_opponent_elo.mean,
)
)
logger.info("{} ELO: {:0.3f}. ".format(category, elo_stats.mean))
else:
logger.info(
"{}: Step: {}. No episode was completed since last summary. {}".format(

elif property_type == StatsPropertyType.SELF_PLAY:
assert isinstance(value, bool)
self.self_play = value
elif property_type == StatsPropertyType.SELF_PLAY_TEAM:
assert isinstance(value, int)
self.self_play_team = value
def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""

24
ml-agents/mlagents/trainers/subprocess_env_manager.py


from multiprocessing import Process, Pipe, Queue
from multiprocessing.connection import Connection
from queue import Empty as EmptyQueueException
from mlagents_envs.base_env import BaseEnv, AgentGroup
from mlagents_envs.base_env import BaseEnv, BehaviorName
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents_envs.timers import (

StatsAggregationMethod,
)
from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
logger = get_logger(__name__)

def _generate_all_results() -> AllStepResult:
all_step_result: AllStepResult = {}
for brain_name in env.get_agent_groups():
all_step_result[brain_name] = env.get_step_result(brain_name)
for brain_name in env.get_behavior_names():
all_step_result[brain_name] = env.get_steps(brain_name)
for brain_name in env.get_agent_groups():
result[brain_name] = group_spec_to_brain_parameters(
brain_name, env.get_agent_group_spec(brain_name)
for brain_name in env.get_behavior_names():
result[brain_name] = behavior_spec_to_brain_parameters(
brain_name, env.get_behavior_spec(brain_name)
)
return result

return list(map(lambda ew: ew.previous_step, self.env_workers))
@property
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
def get_properties(self) -> Dict[AgentGroup, float]:
def get_properties(self) -> Dict[BehaviorName, float]:
self.env_workers[0].send(EnvironmentCommand.GET_PROPERTIES)
return self.env_workers[0].recv().payload

return step_infos
@timed
def _take_step(self, last_step: EnvironmentStep) -> Dict[AgentGroup, ActionInfo]:
def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]:
for brain_name, batch_step_result in last_step.current_all_step_result.items():
for brain_name, step_tuple in last_step.current_all_step_result.items():
batch_step_result, last_step.worker_id
step_tuple[0], last_step.worker_id
)
return all_action_info

42
ml-agents/mlagents/trainers/tests/mock_brain.py


from unittest import mock
from typing import List
from typing import List, Tuple
from mlagents_envs.base_env import BatchedStepResult
from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,
BehaviorSpec,
ActionType,
)
def create_mock_brainparams(

return mock_brain()
def create_mock_batchedstep(
def create_mock_steps(
num_agents: int = 1,
num_vector_observations: int = 0,
num_vis_observations: int = 0,

) -> BatchedStepResult:
) -> Tuple[DecisionSteps, TerminalSteps]:
Creates a mock BatchedStepResult with observations. Imitates constant
vector/visual observations, rewards, dones, and agents.
Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
Imitates constant vector/visual observations, rewards, dones, and agents.
:int num_agents: Number of "agents" to imitate.
:int num_vector_observations: Number of "observations" in your observation space

:bool done: Whether all the agents in the batch are done
"""
if action_shape is None:
action_shape = [2]

]
reward = np.array(num_agents * [1.0], dtype=np.float32)
done = np.array(num_agents * [done], dtype=np.bool)
return BatchedStepResult(obs_list, reward, done, max_step, agent_id, action_mask)
behavior_spec = BehaviorSpec(
[(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)],
ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
action_shape if discrete else action_shape[0],
)
if done:
return (
DecisionSteps.empty(behavior_spec),
TerminalSteps(obs_list, reward, max_step, agent_id),
)
else:
return (
DecisionSteps(obs_list, reward, agent_id, action_mask),
TerminalSteps.empty(behavior_spec),
)
def create_batchedstep_from_brainparams(
def create_steps_from_brainparams(
) -> BatchedStepResult:
return create_mock_batchedstep(
) -> Tuple[DecisionSteps, TerminalSteps]:
return create_mock_steps(
num_agents=num_agents,
num_vector_observations=brain_params.vector_observation_space_size,
num_vis_observations=brain_params.number_visual_observations,

137
ml-agents/mlagents/trainers/tests/simple_test_envs.py


from mlagents_envs.base_env import (
BaseEnv,
AgentGroupSpec,
BatchedStepResult,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
from mlagents_envs.tests.test_rpc_utils import proto_from_batched_step_result_and_action
from mlagents_envs.tests.test_rpc_utils import proto_from_steps_and_action
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)

self.vis_obs_size = vis_obs_size
self.vec_obs_size = vec_obs_size
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.group_spec = AgentGroupSpec(
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(),
action_type,
tuple(2 for _ in range(action_size)) if use_discrete else action_size,

self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
self.random = random.Random(str(self.group_spec))
self.random = random.Random(str(self.behavior_spec))
self.step_result: Dict[str, BatchedStepResult] = {}
self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self.agent_id: Dict[str, int] = {}
self.step_size = step_size # defines the difficulty of the test

obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs
def get_agent_groups(self):
def get_behavior_names(self):
def get_agent_group_spec(self, name):
return self.group_spec
def get_behavior_spec(self, behavior_name):
return self.behavior_spec
def set_action_for_agent(self, name, id, data):
def set_action_for_agent(self, behavior_name, agent_id, action):
def set_actions(self, name, data):
self.action[name] = data
def set_actions(self, behavior_name, action):
self.action[behavior_name] = action
def get_step_result(self, name):
return self.step_result[name]
def get_steps(self, behavior_name):
return self.step_result[behavior_name]
def _take_action(self, name: str) -> bool:
deltas = []

def _make_batched_step(
self, name: str, done: bool, reward: float
) -> BatchedStepResult:
) -> Tuple[DecisionSteps, TerminalSteps]:
m_done = np.array([done], dtype=np.bool)
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
terminal_step = TerminalSteps.empty(self.behavior_spec)
m_vector_obs,
m_reward,
m_done,
m_agent_id,
action_mask,
) = self._construct_reset_step(
m_vector_obs,
new_vector_obs,
m_reward,
m_done,
m_agent_id,
action_mask,
name,
new_reward,
new_done,
new_agent_id,
new_action_mask,
) = self._construct_reset_step(name)
decision_step = DecisionSteps(
new_vector_obs, new_reward, new_agent_id, new_action_mask
return BatchedStepResult(
m_vector_obs,
m_reward,
m_done,
np.zeros(m_done.shape, dtype=bool),
m_agent_id,
action_mask,
)
terminal_step = TerminalSteps(
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
)
return (decision_step, terminal_step)
self,
vector_obs: List[np.ndarray],
new_vector_obs: List[np.ndarray],
reward: np.ndarray,
done: np.ndarray,
agent_id: np.ndarray,
action_mask: List[np.ndarray],
name: str,
) -> Tuple[List[np.ndarray], np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
self, name: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
m_vector_obs = [
np.concatenate((old, new), axis=0)
for old, new in zip(vector_obs, new_vector_obs)
]
m_reward = np.concatenate((reward, new_reward), axis=0)
m_done = np.concatenate((done, new_done), axis=0)
m_agent_id = np.concatenate((agent_id, new_agent_id), axis=0)
if action_mask is not None:
action_mask = [
np.concatenate((old, new), axis=0)
for old, new in zip(action_mask, new_action_mask)
]
return m_vector_obs, m_reward, m_done, m_agent_id, action_mask
return new_reward, new_done, new_agent_id, new_action_mask
def step(self) -> None:
assert all(action is not None for action in self.action.values())

def _make_batched_step(
self, name: str, done: bool, reward: float
) -> BatchedStepResult:
) -> Tuple[DecisionSteps, TerminalSteps]:
m_done = np.array([done], dtype=np.bool)
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
terminal_step = TerminalSteps.empty(self.behavior_spec)
if done:
self._reset_agent(name)
recurrent_obs_val = (

(
m_vector_obs,
m_reward,
m_done,
m_agent_id,
action_mask,
) = self._construct_reset_step(
m_vector_obs,
new_vector_obs,
m_reward,
m_done,
m_agent_id,
action_mask,
name,
new_reward,
new_done,
new_agent_id,
new_action_mask,
) = self._construct_reset_step(name)
decision_step = DecisionSteps(
new_vector_obs, new_reward, new_agent_id, new_action_mask
return BatchedStepResult(
m_vector_obs,
m_reward,
m_done,
np.zeros(m_done.shape, dtype=bool),
m_agent_id,
action_mask,
)
terminal_step = TerminalSteps(
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
)
return (decision_step, terminal_step)
class RecordEnvironment(SimpleEnvironment):

def step(self) -> None:
super().step()
for name in self.names:
self.demonstration_protos[
name
] += proto_from_batched_step_result_and_action(
self.step_result[name], self.action[name]
self.demonstration_protos[name] += proto_from_steps_and_action(
self.step_result[name][0], self.step_result[name][1], self.action[name]
)
self.demonstration_protos[name] = self.demonstration_protos[name][
-self.n_demos :

54
ml-agents/mlagents/trainers/tests/test_agent_processor.py


"pre_action": [0.1, 0.1],
"log_probs": [0.1, 0.1],
}
mock_step = mb.create_mock_batchedstep(
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,
num_vector_observations=8,
action_shape=[2],

action=[0.1, 0.1],
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_step.agent_id,
agent_ids=mock_decision_steps.agent_id,
processor.add_experiences(mock_step, 0, ActionInfo.empty())
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
)
processor.add_experiences(mock_step, 0, fake_action_info)
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, fake_action_info
)
# Assert that two trajectories have been added to the Trainer
assert len(tqueue.put.call_args_list) == 2

# Assert that the AgentProcessor is empty
assert len(processor.experience_buffers[0]) == 0
# Test empty BatchedStepResult
mock_step = mb.create_mock_batchedstep(
# Test empty steps
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo([], [], {}, [])
)
# Assert that the AgentProcessor is still empty
assert len(processor.experience_buffers[0]) == 0

"pre_action": [0.1],
"log_probs": [0.1],
}
mock_step = mb.create_mock_batchedstep(
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
mock_done_step = mb.create_mock_batchedstep(
mock_done_decision_step, mock_done_terminal_step = mb.create_mock_steps(
num_agents=1,
num_vector_observations=8,
action_shape=[2],

action=[0.1],
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_step.agent_id,
agent_ids=mock_decision_step.agent_id,
processor.add_experiences(mock_step, 0, ActionInfo.empty())
processor.add_experiences(
mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
)
# Run 3 trajectories, with different workers (to simulate different agents)
add_calls = []

processor.add_experiences(mock_step, _ep, fake_action_info)
processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
processor.add_experiences(mock_done_step, _ep, fake_action_info)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
)
# Make sure we don't add experiences from the prior agents after the done
remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

assert len(processor.last_step_result.keys()) == 0
# check that steps with immediate dones don't add to dicts
processor.add_experiences(mock_done_step, 0, ActionInfo.empty())
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, 0, ActionInfo.empty()
)
assert len(processor.experience_buffers.keys()) == 0
assert len(processor.last_take_action_outputs.keys()) == 0
assert len(processor.episode_steps.keys()) == 0

"pre_action": [0.1],
"log_probs": [0.1],
}
mock_step = mb.create_mock_batchedstep(
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
num_vector_observations=8,
action_shape=[2],

action=[0.1],
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_step.agent_id,
agent_ids=mock_decision_step.agent_id,
processor.add_experiences(mock_step, 0, ActionInfo.empty())
processor.add_experiences(
mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
)
processor.add_experiences(mock_step, _ep, fake_action_info)
processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
# Make sure we don't add experiences from the prior agents after the done
# Call end episode

32
ml-agents/mlagents/trainers/tests/test_demo_loader.py


import io
from unittest import mock
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
write_delimited,
group_spec, pair_infos, total_expected = load_demonstration(
behavior_spec, pair_infos, total_expected = load_demonstration(
assert np.sum(group_spec.observation_shapes[0]) == 8
assert np.sum(behavior_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1)

def test_load_demo_dir():
path_prefix = os.path.dirname(os.path.abspath(__file__))
group_spec, pair_infos, total_expected = load_demonstration(
behavior_spec, pair_infos, total_expected = load_demonstration(
assert np.sum(group_spec.observation_shapes[0]) == 8
assert np.sum(behavior_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1)

assert get_demo_files(valid_fname) == [valid_fname]
# valid directory
assert get_demo_files(tmpdirname) == [valid_fname]
@mock.patch("mlagents.trainers.demo_loader.get_demo_files", return_value=["foo.demo"])
def test_unsupported_version_raises_error(mock_get_demo_files):
    """
    Loading a demonstration whose metadata carries an unsupported
    api_version must raise RuntimeError.

    ``get_demo_files`` is patched to return a single fake file name and
    ``open`` is patched to serve the serialized metadata, so no real file
    is read.
    """
    # Create a metadata proto with an unsupported version
    bad_metadata = DemonstrationMetaProto()
    bad_metadata.api_version = 1337

    # Write the metadata to a temporary buffer, which will get returned by open()
    buffer = io.BytesIO()
    write_delimited(buffer, bad_metadata)
    m = mock.mock_open(read_data=buffer.getvalue())

    # Make sure that we get a RuntimeError when trying to load this.
    with mock.patch("builtins.open", m), pytest.raises(RuntimeError):
        load_demonstration("foo")

48
ml-agents/mlagents/trainers/tests/test_ghost.py


import yaml
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

trainer_params = dummy_config
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
trainer.seed = 1
policy = trainer.create_policy(mock_brain)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
to_load_policy = trainer.create_policy(mock_brain)
to_load_policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
to_load_policy.create_tf_graph()
to_load_policy.init_load_weights()

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
policy = trainer.create_policy(brain_params_team0)
trainer.add_policy(brain_params_team0.brain_name, policy)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
)
policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
trainer.add_policy(parsed_behavior_id0, policy)
policy = trainer.create_policy(brain_params_team1)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
trainer.add_policy(parsed_behavior_id1, policy)
trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.subscribe_trajectory_queue(trajectory_queue1)

vector_action_space_type=0,
)
brain_name = BehaviorIdentifiers.from_name_behavior_id(
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
).brain_name
)
brain_name = parsed_behavior_id0.brain_name
brain_params_team1 = BrainParameters(
brain_name="test_brain?team=1",

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")
controller = GhostController(100)
trainer = GhostTrainer(
ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
)
policy = trainer.create_policy(brain_params_team0)
trainer.add_policy(brain_params_team0.brain_name, policy)
policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
trainer.add_policy(parsed_behavior_id0, policy)
policy = trainer.create_policy(brain_params_team1)
trainer.add_policy(brain_params_team1.brain_name, policy)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
trainer.add_policy(parsed_behavior_id1, policy)
policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.publish_policy_queue(policy_queue1)

2
ml-agents/mlagents/trainers/tests/test_learn.py


None,
)
handle_dir_mock.assert_called_once_with(
"./models/ppo", "./summaries", False, False
"./models/ppo", "./summaries", False, False, None
)
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py

55
ml-agents/mlagents/trainers/tests/test_nn_policy.py


import pytest
import os
from typing import Dict, Any
import numpy as np
from mlagents.tf_utils import tf

NUM_AGENTS = 12
def create_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
def create_policy_mock(
dummy_config: Dict[str, Any],
use_rnn: bool = False,
use_discrete: bool = True,
use_visual: bool = False,
load: bool = False,
seed: int = 0,
) -> NNPolicy:
mock_brain = mb.setup_mock_brain(
use_discrete,
use_visual,

trainer_parameters = dummy_config
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
policy = NNPolicy(seed, mock_brain, trainer_parameters, False, load)
def test_load_save(dummy_config, tmp_path):
    """
    Round-trip a policy through save, load, and initialize-from-path.

    1. Create a policy with model_path set to the first run directory,
       initialize it and save it at step 2000.
    2. Create a second policy (load=True, different seed) on the same
       model_path and check it produces the same output as the first.
    3. Point model_path at a second run directory and init_path at the
       first, create a third policy (load=False, different seed) and check
       it matches the second.

    Note that ``trainer_params`` aliases ``dummy_config``; the fixture dict
    is mutated in place.
    """
    path1 = os.path.join(tmp_path, "runid1")
    path2 = os.path.join(tmp_path, "runid2")
    trainer_params = dummy_config
    trainer_params["model_path"] = path1
    policy = create_policy_mock(trainer_params)
    policy.initialize_or_load()
    policy.save_model(2000)

    # Saving must have written something under the temporary directory.
    assert len(os.listdir(tmp_path)) > 0

    # Try load from this path
    policy2 = create_policy_mock(trainer_params, load=True, seed=1)
    policy2.initialize_or_load()
    _compare_two_policies(policy, policy2)

    # Try initialize from path 1
    trainer_params["model_path"] = path2
    trainer_params["init_path"] = path1
    policy3 = create_policy_mock(trainer_params, load=False, seed=2)
    policy3.initialize_or_load()
    _compare_two_policies(policy2, policy3)
def _compare_two_policies(policy1: NNPolicy, policy2: NNPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.

    A single-agent batch is built from ``policy1.brain`` and fed to both
    policies; only the "log_probs" entry of each result is compared, and
    it must match exactly.

    :param policy1: Policy whose brain parameters define the test input.
    :param policy2: Policy evaluated on the same input for comparison.
    """
    decision_step, _ = mb.create_steps_from_brainparams(policy1.brain, num_agents=1)
    run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
    run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))

    np.testing.assert_array_equal(run_out2["log_probs"], run_out1["log_probs"])
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])

policy = create_policy_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
decision_step, terminal_step = mb.create_steps_from_brainparams(
policy.brain, num_agents=NUM_AGENTS
)
run_out = policy.evaluate(step, list(step.agent_id))
run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
else:

24
ml-agents/mlagents/trainers/tests/test_policy.py


from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.action_info import ActionInfo
from unittest.mock import MagicMock
import numpy as np

test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
# Doesn't really matter what this is
dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
no_agent_step = BatchedStepResult.empty(dummy_groupspec)
dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
no_agent_step = DecisionSteps.empty(dummy_groupspec)
result = policy.get_action(no_agent_step)
assert result == ActionInfo.empty()

policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
step_with_agents = BatchedStepResult(
[],
np.array([], dtype=np.float32),
np.array([False], dtype=np.bool),
np.array([], dtype=np.bool),
np.array([0]),
None,
step_with_agents = DecisionSteps(
[], np.array([], dtype=np.float32), np.array([0]), None
)
result = policy.get_action(step_with_agents, worker_id=0)
assert result == ActionInfo(None, None, {}, [0])

"value": np.array([1.1], dtype=np.float32),
}
policy.evaluate = MagicMock(return_value=policy_eval_out)
step_with_agents = BatchedStepResult(
[],
np.array([], dtype=np.float32),
np.array([False], dtype=np.bool),
np.array([], dtype=np.bool),
np.array([0]),
None,
step_with_agents = DecisionSteps(
[], np.array([], dtype=np.float32), np.array([0]), None
)
result = policy.get_action(step_with_agents)
expected = ActionInfo(

4
ml-agents/mlagents/trainers/tests/test_ppo.py


trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params)
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

6
ml-agents/mlagents/trainers/tests/test_sac.py


trainer_params["model_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)

# Wipe Trainer and try to load
trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True, 0, 0)
policy = trainer2.create_policy(mock_brain)
policy = trainer2.create_policy(mock_brain.brain_name, mock_brain)
trainer2.add_policy(mock_brain.brain_name, policy)
assert trainer2.update_buffer.num_experiences == buffer_len

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params)
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")

59
ml-agents/mlagents/trainers/tests/test_simple_rl.py


env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
override_vals = {"buffer_init_steps": 2000, "max_steps": 4000}
override_vals = {"buffer_init_steps": 2000, "max_steps": 10000}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.8)

override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},

override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},

)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
    """
    Train two differently named behaviors on opposing teams with self-play,
    always playing against the latest model, and check that training succeeds.
    """
    # The opponent gets its own behavior name, which makes the setup asymmetric.
    brain_name_opp = BRAIN_NAME + "Opp"
    behavior_ids = [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"]
    env = SimpleEnvironment(behavior_ids, use_discrete=use_discrete)

    self_play_settings = {
        "play_against_latest_model_ratio": 1.0,
        "save_steps": 5000,
        "swap_steps": 5000,
        "team_change": 2000,
    }
    override_vals = {"max_steps": 2000, "self_play": self_play_settings}

    config = generate_config(PPO_CONFIG, override_vals)
    # Both behaviors share the same trainer configuration.
    config[brain_name_opp] = config[BRAIN_NAME]
    _check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
    """
    Asymmetric self-play run in which exactly one side is expected to end up
    above the success threshold and the other below it.
    """
    # The opponent gets its own behavior name, which makes the setup asymmetric.
    brain_name_opp = BRAIN_NAME + "Opp"
    behavior_ids = [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"]
    env = SimpleEnvironment(behavior_ids, use_discrete=use_discrete)

    # This config should fail because the team that is not learning when both have
    # reached max step should be executing the initial, untrained policy.
    self_play_settings = {
        "play_against_latest_model_ratio": 0.0,
        "save_steps": 5000,
        "swap_steps": 5000,
        "team_change": 2000,
    }
    override_vals = {"max_steps": 2000, "self_play": self_play_settings}

    config = generate_config(PPO_CONFIG, override_vals)
    config[brain_name_opp] = config[BRAIN_NAME]
    # success_threshold=None: the reward check is done manually below instead.
    _check_environment_trains(env, config, success_threshold=None)

    processed_rewards = []
    for rewards in env.final_rewards.values():
        processed_rewards.append(default_reward_processor(rewards))

    success_threshold = 0.9
    some_above = any(reward > success_threshold for reward in processed_rewards)
    some_below = any(reward < success_threshold for reward in processed_rewards)
    assert some_above and some_below
@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):

step_size=0.2,
)
override_vals = {
"max_steps": 1000,
"max_steps": 500,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"reward_signals": {

7
ml-agents/mlagents/trainers/tests/test_stats.py


category = "category1"
console_writer = ConsoleWriter()
console_writer.add_property(category, StatsPropertyType.SELF_PLAY, True)
console_writer.add_property(category, StatsPropertyType.SELF_PLAY_TEAM, 1)
statssummary1 = StatsSummary(mean=1.0, std=1.0, num=1)
console_writer.write_stats(
category,

"Self-play/ELO": statssummary1,
"Self-play/Mean Opponent ELO": statssummary1,
"Self-play/Std Opponent ELO": statssummary1,
},
10,
)

)
self.assertIn(
"category1 Team 1: ELO: 1.000. Mean Opponent ELO: 1.000. Std Opponent ELO: 1.000.",
cm.output[1],
)

5
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


agent_manager_mock = mock.Mock()
env_manager.set_agent_manager(brain_name, agent_manager_mock)
step_info_dict = {brain_name: Mock()}
step_info_dict = {brain_name: (Mock(), Mock())}
env_stats = {
"averaged": (1.0, StatsAggregationMethod.AVERAGE),
"most_recent": (2.0, StatsAggregationMethod.MOST_RECENT),

env_manager._step.assert_called_once()
agent_manager_mock.add_experiences.assert_called_once_with(
step_info.current_all_step_result[brain_name],
step_info.current_all_step_result[brain_name][0],
step_info.current_all_step_result[brain_name][1],
0,
step_info.brain_name_to_action_info[brain_name],
)

12
ml-agents/mlagents/trainers/tests/test_trainer_util.py


trainer_util.handle_existing_directories(model_path, summary_path, True, False)
# Test try to train w/ force - should work
trainer_util.handle_existing_directories(model_path, summary_path, False, True)
# Test initialize option
init_path = os.path.join(tmp_path, "runid2")
with pytest.raises(UnityTrainerException):
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)
os.mkdir(init_path)
# Should pass since the directory exists now.
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)

9
ml-agents/mlagents/trainers/trainer/trainer.py


from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = get_logger(__name__)

pass
@abc.abstractmethod
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
"""
Creates policy
"""

def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
"""

11
ml-agents/mlagents/trainers/trainer_controller.py


self, env_manager: EnvManager, name_behavior_id: str
) -> None:
brain_name = BehaviorIdentifiers.from_name_behavior_id(
name_behavior_id
).brain_name
parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
brain_name = parsed_behavior_id.brain_name
try:
trainer = self.trainers[brain_name]
except KeyError:

policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(name_behavior_id, policy)
policy = trainer.create_policy(
parsed_behavior_id, env_manager.external_brains[name_behavior_id]
)
trainer.add_policy(parsed_behavior_id, policy)
agent_manager = AgentManager(
policy,

33
ml-agents/mlagents/trainers/trainer_util.py


from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
logger = get_logger(__name__)

train_model: bool,
load_model: bool,
seed: int,
init_path: str = None,
meta_curriculum: MetaCurriculum = None,
multi_gpu: bool = False,
):

self.model_path = model_path
self.init_path = init_path
self.keep_checkpoints = keep_checkpoints
self.train_model = train_model
self.load_model = load_model

self.ghost_controller = GhostController()
def generate(self, brain_name: str) -> Trainer:
return initialize_trainer(

self.keep_checkpoints,
self.train_model,
self.load_model,
self.ghost_controller,
self.init_path,
self.meta_curriculum,
self.multi_gpu,
)

keep_checkpoints: int,
train_model: bool,
load_model: bool,
ghost_controller: GhostController,
init_path: str = None,
meta_curriculum: MetaCurriculum = None,
multi_gpu: bool = False,
) -> Trainer:

:param keep_checkpoints: How many model checkpoints to keep
:param train_model: Whether to train the model (vs. run inference)
:param load_model: Whether to load the model or randomly initialize
:param ghost_controller: The object that coordinates ghost trainers
:param init_path: Path from which to load model, if different from model_path.
:param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
:return:
"""

trainer_parameters["model_path"] = "{basedir}/{name}".format(
basedir=model_path, name=brain_name
)
if init_path is not None:
trainer_parameters["init_path"] = "{basedir}/{name}".format(
basedir=init_path, name=brain_name
)
trainer_parameters["keep_checkpoints"] = keep_checkpoints
if brain_name in trainer_config:
_brain_key: Any = brain_name

trainer = GhostTrainer(
trainer,
brain_name,
ghost_controller,
min_lesson_length,
trainer_parameters,
train_model,

def handle_existing_directories(
model_path: str, summary_path: str, resume: bool, force: bool
model_path: str, summary_path: str, resume: bool, force: bool, init_path: str = None
) -> None:
"""
Validates that if the run_id model exists, we do not overwrite it unless --force is specified.

if model_path_exists:
if not resume and not force:
raise UnityTrainerException(
"Previous data from this run-id was found. "
"Either specify a new run-id, use --resume to resume this run, "
"Previous data from this run ID was found. "
"Either specify a new run ID, use --resume to resume this run, "
"Previous data from this run-id was not found. "
"Previous data from this run ID was not found. "
# Verify init path if specified.
if init_path is not None:
if not os.path.isdir(init_path):
raise UnityTrainerException(
"Could not initialize from {}. "
"Make sure models have already been saved with that run ID.".format(
init_path
)
)

18
ml-agents/tests/yamato/standalone_build_tests.py


import sys
import argparse
def main():
def main(scene_path):
returncode = run_standalone_build(base_path, verbose=True)
executable_name = None
if scene_path is not None:
executable_name = scene_path.strip(".unity")
executable_name = executable_name.split("/")[-1]
executable_name = "testPlayer-" + executable_name
returncode = run_standalone_build(
base_path, output_path=executable_name, scene_path=scene_path
)
if returncode == 0:
print("Test run SUCCEEDED!")

if __name__ == "__main__":
main()
parser = argparse.ArgumentParser()
parser.add_argument("--scene", default=None)
args = parser.parse_args()
main(args.scene)

13
ml-agents/tests/yamato/training_int_tests.py


from .yamato_utils import (
get_base_path,
get_base_output_path,
run_standalone_build,
init_venv,
override_config_file,

if csharp_version is not None:
# We can't rely on the old C# code recognizing the commandline argument to set the output
# So rename testPlayer (containing the most recent build) to something else temporarily
full_player_path = os.path.join("Project", "testPlayer.app")
temp_player_path = os.path.join("Project", "temp_testPlayer.app")
final_player_path = os.path.join("Project", f"testPlayer_{csharp_version}.app")
artifact_path = get_base_output_path()
full_player_path = os.path.join(artifact_path, "testPlayer.app")
temp_player_path = os.path.join(artifact_path, "temp_testPlayer.app")
final_player_path = os.path.join(
artifact_path, f"testPlayer_{csharp_version}.app"
)
os.rename(full_player_path, temp_player_path)

)
mla_learn_cmd = (
f"mlagents-learn override.yaml --train --env=Project/{standalone_player_path} "
f"mlagents-learn override.yaml --train --env="
f"{os.path.join(get_base_output_path(), standalone_player_path)} "
f"--run-id={run_id} --no-graphics --env-args -logFile -"
) # noqa
res = subprocess.run(

58
ml-agents/tests/yamato/yamato_utils.py


import os
import shutil
from typing import List, Optional
def get_unity_executable_path():

return os.getcwd()
def get_base_output_path():
    """
    Return the folder that yamato jobs use for artifacts.

    This is the "artifacts" directory directly under the path returned by
    get_base_path().
    """
    artifacts_dir = os.path.join(get_base_path(), "artifacts")
    return artifacts_dir
base_path: str, verbose: bool = False, output_path: str = None
base_path: str,
verbose: bool = False,
output_path: str = None,
scene_path: str = None,
log_output_path: str = f"{get_base_output_path()}/standalone_build.txt",
Run BuildStandalonePlayerOSX test to produce a player. The location defaults to Project/testPlayer.
Run BuildStandalonePlayerOSX test to produce a player. The location defaults to
artifacts/standalone_build/testPlayer.
"""
unity_exe = get_unity_executable_path()
print(f"Running BuildStandalonePlayerOSX via {unity_exe}")

"-executeMethod",
"MLAgents.StandaloneBuildTest.BuildStandalonePlayerOSX",
]
if verbose:
test_args += ["-logfile", "-"]
os.makedirs(os.path.dirname(log_output_path), exist_ok=True)
subprocess.run(["touch", log_output_path])
test_args += ["-logfile", log_output_path]
output_path = os.path.join(get_base_output_path(), output_path)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
if scene_path is not None:
test_args += ["--mlagents-build-scene-path", scene_path]
# Copy the default build name into the artifacts folder.
if output_path is None and res.returncode == 0:
shutil.move(
os.path.join(base_path, "Project", "testPlayer.app"),
os.path.join(get_base_output_path(), "testPlayer.app"),
)
# Print if we fail or want verbosity.
if verbose or res.returncode != 0:
subprocess.run(["cat", log_output_path])
def init_venv(mlagents_python_version: str = None) -> str:
def init_venv(
mlagents_python_version: str = None, extra_packages: Optional[List[str]] = None
) -> str:
"""
Set up the virtual environment, and return the venv path.
:param mlagents_python_version: The version of mlagents python package to install.

]
if mlagents_python_version:
# install from pypi
pip_commands.append(f"mlagents=={mlagents_python_version}")
pip_commands += [
f"mlagents=={mlagents_python_version}",
f"gym-unity=={mlagents_python_version}",
]
pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents"]
pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents", "-e ./gym-unity"]
if extra_packages:
pip_commands += extra_packages
for cmd in pip_commands:
subprocess.check_call(
f"source {venv_path}/bin/activate; python -m pip install -q {cmd}",

"""
if csharp_version is None:
return
subprocess.check_call(f"rm -rf {csharp_dir}", shell=True)
subprocess.check_call(
f"git checkout {csharp_version} -- {csharp_dir}", shell=True
)

"""
subprocess.check_call("git reset HEAD .", shell=True)
subprocess.check_call("git checkout -- .", shell=True)
# Ensure the cache isn't polluted with old compiled assemblies.
subprocess.check_call(f"rm -rf Project/Library", shell=True)
def override_config_file(src_path, dest_path, **kwargs):

1
utils/make_readme_table.py


["0.14.0", "February 13, 2020"],
["0.14.1", "February 26, 2020"],
["0.15.0", "March 18, 2020"],
["0.15.1", "March 30, 2020"],
]
MAX_DAYS = 150 # do not print releases older than this many days

32
.yamato/gym-interface-test.yml


# Yamato job template for the gym interface test.
# One job is generated per entry in test_editors.
test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_gym_interface_{{ editor.version }}:
name: Test Mac Gym Interface {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
# Create the virtual environment, then run the gym script against a prebuilt player.
- python -u -m ml-agents.tests.yamato.setup_venv
# NOTE(review): presumably artifacts/testPlayer-Basic is produced by the standalone
# build job listed under dependencies - confirm the executable name matches.
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_gym.py --env=artifacts/testPlayer-Basic
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
# NOTE(review): "gym-unity/**" is not listed here even though this job exercises
# the gym wrapper - confirm whether that omission is intentional.
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/gym-interface-test.yml"
except:
# Markdown files are excluded from the change trigger.
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}

32
.yamato/python-ll-api-test.yml


# Yamato job template for the Python low-level API test.
# One job is generated per entry in test_editors.
test_editors:
- version: 2019.3
---
{% for editor in test_editors %}
test_mac_ll_api_{{ editor.version }}:
name: Test Mac LL-API {{ editor.version }}
agent:
type: Unity::VM::osx
image: ml-agents/ml-agents-bokken-mac:0.1.4-492264
flavor: b1.small
variables:
UNITY_VERSION: {{ editor.version }}
commands:
- pip install pyyaml
# Create the virtual environment, then run the low-level API script with it.
- python -u -m ml-agents.tests.yamato.setup_venv
# NOTE(review): no --env argument is passed, so the script's own default
# environment path is used - confirm it points at the built player.
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/python-ll-api-test.yml"
except:
# Markdown files are excluded from the change trigger.
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
{% endfor %}

129
com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs


#if UNITY_INCLUDE_TESTS
using System.Collections;
using System.Collections.Generic;
using MLAgents;
using MLAgents.Policies;
using MLAgents.Sensors;
using NUnit.Framework;
using UnityEngine;
using UnityEngine.TestTools;
namespace Tests
{
/// <summary>
/// Agent used by the runtime API test; it records how often Heuristic() is invoked.
/// </summary>
public class PublicApiAgent : Agent
{
    /// <summary>Number of Heuristic() calls observed so far.</summary>
    public int numHeuristicCalls;

    /// <summary>
    /// Counts the call and then defers to the base class implementation.
    /// </summary>
    public override float[] Heuristic()
    {
        numHeuristicCalls += 1;
        return base.Heuristic();
    }
}

// Simple SensorComponent that sets up a StackingSensor
/// <summary>
/// SensorComponent that wraps the sensor of another component in a StackingSensor.
/// </summary>
public class StackingComponent : SensorComponent
{
    /// <summary>Component whose sensor gets wrapped.</summary>
    public SensorComponent wrappedComponent;

    /// <summary>Stack count handed to the StackingSensor.</summary>
    public int numStacks;

    /// <summary>
    /// Creates the wrapped component's sensor and returns a StackingSensor around it.
    /// </summary>
    public override ISensor CreateSensor()
    {
        return new StackingSensor(wrappedComponent.CreateSensor(), numStacks);
    }

    /// <summary>
    /// Returns a copy of the wrapped component's shape with every entry
    /// multiplied by numStacks.
    /// </summary>
    public override int[] GetObservationShape()
    {
        // Clone so the array returned by the wrapped component is not modified.
        var stackedShape = (int[])wrappedComponent.GetObservationShape().Clone();
        for (var dim = 0; dim < stackedShape.Length; ++dim)
        {
            stackedShape[dim] = stackedShape[dim] * numStacks;
        }
        return stackedShape;
    }
}
/// <summary>
/// Builds an agent entirely through the public runtime API and steps the Academy manually.
/// </summary>
public class RuntimeApiTest
{
    [SetUp]
    public static void Setup()
    {
        // The test calls Academy.Instance.EnvironmentStep() itself.
        Academy.Instance.AutomaticSteppingEnabled = false;
    }

    [UnityTest]
    public IEnumerator RuntimeApiTestWithEnumeratorPasses()
    {
        var agentObject = new GameObject();

        // Configure the behavior before any Agent component exists.
        var behaviorParams = agentObject.AddComponent<BehaviorParameters>();
        behaviorParams.brainParameters.vectorObservationSize = 3;
        behaviorParams.brainParameters.numStackedVectorObservations = 2;
        behaviorParams.brainParameters.vectorActionDescriptions = new[] { "TestActionA", "TestActionB" };
        behaviorParams.brainParameters.vectorActionSize = new[] { 2, 2 };
        behaviorParams.brainParameters.vectorActionSpaceType = SpaceType.Discrete;
        behaviorParams.behaviorName = "TestBehavior";
        behaviorParams.TeamId = 42;
        behaviorParams.useChildSensors = true;

        // An Agent can't be created as InferenceOnly without a model, so use Default here.
        behaviorParams.behaviorType = BehaviorType.Default;

        var rayComponent = agentObject.AddComponent<RayPerceptionSensorComponent3D>();
        rayComponent.sensorName = "ray3d";
        rayComponent.detectableTags = new List<string> { "Player", "Respawn" };
        rayComponent.raysPerDirection = 3;

        // Wrap the ray component in a StackingComponent. Not necessarily practical;
        // this only checks that it can be done.
        var stackingComponent = agentObject.AddComponent<StackingComponent>();
        stackingComponent.wrappedComponent = rayComponent;
        stackingComponent.numStacks = 3;

        // No sensor has been created at this point.
        Assert.IsNull(rayComponent.raySensor);

        // Switch to HeuristicOnly so decisions come from Heuristic().
        // NOTE(review): the original comment said this checks setting the behavior type
        // after the agent is initialized, but the Agent component is only added below.
        behaviorParams.behaviorType = BehaviorType.HeuristicOnly;

        // The Agent has to be added after everything else is set up.
        var agent = agentObject.AddComponent<PublicApiAgent>();

        // The DecisionRequester has to be added after the Agent.
        var requester = agentObject.AddComponent<DecisionRequester>();
        requester.DecisionPeriod = 2;
        requester.TakeActionsBetweenDecisions = true;

        // By now the ray sensor is expected to exist.
        Assert.IsNotNull(rayComponent.raySensor);

        // Pick whichever inference device is not currently configured.
        InferenceDevice otherDevice;
        if (behaviorParams.inferenceDevice == InferenceDevice.CPU)
        {
            otherDevice = InferenceDevice.GPU;
        }
        else
        {
            otherDevice = InferenceDevice.CPU;
        }
        agent.SetModel(behaviorParams.behaviorName, behaviorParams.model, otherDevice);

        agent.AddReward(1.0f);

        // Skip a frame.
        yield return null;

        Academy.Instance.EnvironmentStep();

        var actions = agent.GetAction();
        // The default Heuristic implementation should return zero actions.
        var expectedActions = new[] {0.0f, 0.0f};
        Assert.AreEqual(expectedActions, actions);
        Assert.AreEqual(1, agent.numHeuristicCalls);

        // With DecisionPeriod = 2, the second step is expected not to call Heuristic() again...
        Academy.Instance.EnvironmentStep();
        Assert.AreEqual(1, agent.numHeuristicCalls);

        // ...and the third step is expected to call it once more.
        Academy.Instance.EnvironmentStep();
        Assert.AreEqual(2, agent.numHeuristicCalls);
    }
}
}
#endif

11
com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs.meta


fileFormatVersion: 2
guid: 17878576e4ed14b09875e37394e5ad90
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

25
com.unity.ml-agents/Tests/Runtime/Unity.ML-Agents.Runtime.Tests.asmdef


{
"name": "Tests",
"references": [
"Unity.ML-Agents",
"Barracuda",
"Unity.ML-Agents.CommunicatorObjects",
"Unity.ML-Agents.Editor"
],
"optionalUnityReferences": [
"TestAssemblies"
],
"includePlatforms": [],
"excludePlatforms": [],
"allowUnsafeCode": false,
"overrideReferences": true,
"precompiledReferences": [
"System.IO.Abstractions.dll",
"System.IO.Abstractions.TestingHelpers.dll",
"Google.Protobuf.dll"
],
"autoReferenced": false,
"defineConstraints": [
"UNITY_INCLUDE_TESTS"
]
}

7
com.unity.ml-agents/Tests/Runtime/Unity.ML-Agents.Runtime.Tests.asmdef.meta


fileFormatVersion: 2
guid: d29014db7ebcd4cf4a14f537fbf02110
AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

92
ml-agents/mlagents/trainers/ghost/controller.py


from mlagents_envs.logging_util import get_logger
from typing import Deque, Dict
from collections import deque
from mlagents.trainers.ghost.trainer import GhostTrainer
logger = get_logger(__name__)
class GhostController:
    """
    Keeps track of which team is currently learning among the subscribed GhostTrainers.

    Team ids waiting for their turn are held in a queue; GhostTrainers subscribe to the
    controller and query it for the current learning team. The controller is meant to
    be a unique object: there should be only one per training run.
    """

    def __init__(self, maxlen: int = 10):
        """
        Create a GhostController.
        :param maxlen: Maximum number of team ids held in the waiting queue.
        """
        # Team ids waiting to become the learning team.
        # NOTE(review): a deque with maxlen discards its oldest entry when it is full,
        # so subscribing more teams than maxlen would silently drop one - confirm.
        self._queue: Deque[int] = deque(maxlen=maxlen)
        # -1 means no team has subscribed yet.
        self._learning_team: int = -1
        # Dict from team id to GhostTrainer for ELO calculation
        self._ghost_trainers: Dict[int, GhostTrainer] = {}

    @property
    def get_learning_team(self) -> int:
        """
        Returns the current learning team.
        :return: The learning team id
        """
        return self._learning_team

    def subscribe_team_id(self, team_id: int, trainer: GhostTrainer) -> None:
        """
        Register a team id and its trainer unless the team id is already known.
        The first team to subscribe becomes the learning team; later ones are queued.
        The trainer is kept so the controller can query ELO ratings later.
        :param team_id: The team_id of an agent managed by this GhostTrainer
        :param trainer: A GhostTrainer that manages this team_id.
        """
        if team_id in self._ghost_trainers:
            return
        self._ghost_trainers[team_id] = trainer
        if self._learning_team < 0:
            self._learning_team = team_id
        else:
            self._queue.append(team_id)

    def change_training_team(self, step: int) -> None:
        """
        Move the current learning team to the back of the queue and promote the
        team at the front of the queue to be the new learning team.
        :param step: The step of the trainer for debugging
        """
        self._queue.append(self._learning_team)
        self._learning_team = self._queue.popleft()
        message = "Learning team {} swapped on step {}".format(
            self._learning_team, step
        )
        logger.debug(message)

    # Adapted from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
    # https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
    # ELO calculation
    # TODO : Generalize this to more than two teams
    def compute_elo_rating_changes(self, rating: float, result: float) -> float:
        """
        Compute the ELO change for the learning team and forward it to the opponents.
        The opponent rating is read from the GhostTrainers of the non-learning teams
        (the last one iterated wins); it stays 0.0 if there is none. The same change
        is then passed to change_opponent_elo on every non-learning trainer.
        :param rating: Rating of the learning team.
        :param result: Win, loss, or draw from the perspective of the learning team.
        :return: The change in ELO.
        """
        opponents = [
            trainer
            for team_id, trainer in self._ghost_trainers.items()
            if team_id != self._learning_team
        ]

        opponent_rating: float = 0.0
        for trainer in opponents:
            opponent_rating = trainer.get_opponent_elo()

        r1 = 10 ** (rating / 400)
        r2 = 10 ** (opponent_rating / 400)
        summed = r1 + r2
        expected_score = r1 / summed
        change = result - expected_score

        for trainer in opponents:
            trainer.change_opponent_elo(change)
        return change

21
ml-agents/tests/yamato/setup_venv.py


import argparse
from .yamato_utils import init_venv
def main():
    """
    Parse the command line and set up the virtual environment through init_venv.

    Options:
      --mlagents-version  Forwarded to init_venv as mlagents_python_version
                          (default: None).
      --extra-packages    Comma-separated list of additional packages to install.
                          Surrounding whitespace is trimmed and empty entries
                          (e.g. from a trailing comma or an empty value) are ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mlagents-version", default=None)
    parser.add_argument("--extra-packages", default=None)
    args = parser.parse_args()

    extra_packages = []
    if args.extra_packages is not None:
        # init_venv issues one "pip install" per entry, so a blank entry would
        # produce a pip call without any requirement and fail the setup.
        extra_packages = [
            package.strip()
            for package in args.extra_packages.split(",")
            if package.strip()
        ]

    init_venv(
        mlagents_python_version=args.mlagents_version, extra_packages=extra_packages
    )


if __name__ == "__main__":
    main()

41
ml-agents/tests/yamato/scripts/run_gym.py


import argparse
from gym_unity.envs import UnityEnv
def main(env_name):
    """
    Run the gym test using the specified environment: launch it, print its
    description, then play 10 episodes with randomly sampled actions.
    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Show the environment parameters.
        print(str(env))

        # Reset once and, for 1-D observation spaces, show the first observation.
        initial_observations = env.reset()
        is_vector_obs = len(env.observation_space.shape) == 1
        if is_vector_obs:
            print("Agent observations look like: \n{}".format(initial_observations))

        for _episode in range(10):
            env.reset()
            episode_rewards = 0
            done = False
            while not done:
                sampled_action = env.action_space.sample()
                obs, reward, done, _ = env.step(sampled_action)
                episode_rewards += reward
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        # Always shut the environment down, even if an episode raised.
        env.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="Project/testPlayer")
    args = parser.parse_args()
    main(args.env)

94
ml-agents/tests/yamato/scripts/run_llapi.py


import argparse
import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import (
EngineConfigurationChannel,
)
def main(env_name):
    """
    Run the low-level API test using the specified environment.

    The environment is launched without graphics, basic information about the
    first behavior is printed, and 10 episodes are played with random actions
    while the reward of a single tracked agent is accumulated. An episode ends
    when the tracked agent shows up in the terminal steps. The environment is
    always closed on exit.

    :param env_name: Name of the Unity environment binary to launch
    """
    engine_configuration_channel = EngineConfigurationChannel()
    env = UnityEnvironment(
        file_name=env_name,
        side_channels=[engine_configuration_channel],
        no_graphics=True,
        args=["-logFile", "-"],
    )

    try:
        # Reset the environment
        env.reset()

        # Set the default brain to work with
        group_name = env.get_behavior_names()[0]
        group_spec = env.get_behavior_spec(group_name)

        # Set the time scale of the engine
        engine_configuration_channel.set_configuration_parameters(time_scale=3.0)

        # Get the state of the agents
        decision_steps, terminal_steps = env.get_steps(group_name)

        # Examine the number of observations per Agent
        print("Number of observations : ", len(group_spec.observation_shapes))

        # Is there a visual observation ?
        vis_obs = any(len(shape) == 3 for shape in group_spec.observation_shapes)
        print("Is there a visual observation ?", vis_obs)

        # Examine the state space for the first observation for the first agent
        print(
            "First Agent observation looks like: \n{}".format(decision_steps.obs[0][0])
        )

        for _episode in range(10):
            env.reset()
            decision_steps, terminal_steps = env.get_steps(group_name)
            done = False
            episode_rewards = 0
            tracked_agent = -1
            while not done:
                if group_spec.is_action_continuous():
                    # One row of random continuous actions per requesting agent
                    action = np.random.randn(
                        len(decision_steps), group_spec.action_size
                    )
                elif group_spec.is_action_discrete():
                    # One random column per discrete branch, one row per agent
                    branch_size = group_spec.discrete_action_branches
                    action = np.column_stack(
                        [
                            np.random.randint(
                                0, branch_size[i], size=(len(decision_steps))
                            )
                            for i in range(len(branch_size))
                        ]
                    )
                else:
                    # Should never happen
                    action = None

                # Start tracking as soon as at least one agent requests a
                # decision. With a strict "> 1" an environment with a single
                # agent would never pick one, so `done` would never be set and
                # this loop would not terminate.
                if tracked_agent == -1 and len(decision_steps) >= 1:
                    tracked_agent = decision_steps.agent_id[0]

                env.set_actions(group_name, action)
                env.step()
                decision_steps, terminal_steps = env.get_steps(group_name)

                if tracked_agent in decision_steps:
                    episode_rewards += decision_steps[tracked_agent].reward
                if tracked_agent in terminal_steps:
                    # The tracked agent finished its episode
                    episode_rewards += terminal_steps[tracked_agent].reward
                    done = True
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        env.close()
if __name__ == "__main__":
    # Script entry point: the environment binary is selected with --env.
    cli = argparse.ArgumentParser()
    cli.add_argument("--env", default="artifacts/testPlayer")
    options = cli.parse_args()
    main(options.env)

429
com.unity.ml-agents/Tests/Runtime/SerializeTestScene.unity


%YAML 1.1
%TAG !u! tag:unity3d.com,2011:
--- !u!29 &1
OcclusionCullingSettings:
m_ObjectHideFlags: 0
serializedVersion: 2
m_OcclusionBakeSettings:
smallestOccluder: 5
smallestHole: 0.25
backfaceThreshold: 100
m_SceneGUID: 00000000000000000000000000000000
m_OcclusionCullingData: {fileID: 0}
--- !u!104 &2
RenderSettings:
m_ObjectHideFlags: 0
serializedVersion: 9
m_Fog: 0
m_FogColor: {r: 0.5, g: 0.5, b: 0.5, a: 1}
m_FogMode: 3
m_FogDensity: 0.01
m_LinearFogStart: 0
m_LinearFogEnd: 300
m_AmbientSkyColor: {r: 0.212, g: 0.227, b: 0.259, a: 1}
m_AmbientEquatorColor: {r: 0.114, g: 0.125, b: 0.133, a: 1}
m_AmbientGroundColor: {r: 0.047, g: 0.043, b: 0.035, a: 1}
m_AmbientIntensity: 1
m_AmbientMode: 0
m_SubtractiveShadowColor: {r: 0.42, g: 0.478, b: 0.627, a: 1}
m_SkyboxMaterial: {fileID: 10304, guid: 0000000000000000f000000000000000, type: 0}
m_HaloStrength: 0.5
m_FlareStrength: 1
m_FlareFadeSpeed: 3
m_HaloTexture: {fileID: 0}
m_SpotCookie: {fileID: 10001, guid: 0000000000000000e000000000000000, type: 0}
m_DefaultReflectionMode: 0
m_DefaultReflectionResolution: 128
m_ReflectionBounces: 1
m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.44657898, g: 0.49641287, b: 0.5748173, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:
m_ObjectHideFlags: 0
serializedVersion: 11
m_GIWorkflowMode: 0
m_GISettings:
serializedVersion: 2
m_BounceScale: 1
m_IndirectOutputScale: 1
m_AlbedoBoost: 1
m_EnvironmentLightingMode: 0
m_EnableBakedLightmaps: 1
m_EnableRealtimeLightmaps: 1
m_LightmapEditorSettings:
serializedVersion: 10
m_Resolution: 2
m_BakeResolution: 40
m_AtlasSize: 1024
m_AO: 0
m_AOMaxDistance: 1
m_CompAOExponent: 1
m_CompAOExponentDirect: 0
m_Padding: 2
m_LightmapParameters: {fileID: 0}
m_LightmapsBakeMode: 1
m_TextureCompression: 1
m_FinalGather: 0
m_FinalGatherFiltering: 1
m_FinalGatherRayCount: 256
m_ReflectionCompression: 2
m_MixedBakeMode: 2
m_BakeBackend: 1
m_PVRSampling: 1
m_PVRDirectSampleCount: 32
m_PVRSampleCount: 500
m_PVRBounces: 2
m_PVRFilterTypeDirect: 0
m_PVRFilterTypeIndirect: 0
m_PVRFilterTypeAO: 0
m_PVRFilteringMode: 1
m_PVRCulling: 1
m_PVRFilteringGaussRadiusDirect: 1
m_PVRFilteringGaussRadiusIndirect: 5
m_PVRFilteringGaussRadiusAO: 2
m_PVRFilteringAtrousPositionSigmaDirect: 0.5
m_PVRFilteringAtrousPositionSigmaIndirect: 2
m_PVRFilteringAtrousPositionSigmaAO: 1
m_ShowResolutionOverlay: 1
m_LightingDataAsset: {fileID: 0}
m_UseShadowmask: 1
--- !u!196 &4
NavMeshSettings:
serializedVersion: 2
m_ObjectHideFlags: 0
m_BuildSettings:
serializedVersion: 2
agentTypeID: 0
agentRadius: 0.5
agentHeight: 2
agentSlope: 45
agentClimb: 0.4
ledgeDropHeight: 0
maxJumpAcrossDistance: 0
minRegionArea: 2
manualCellSize: 0
cellSize: 0.16666667
manualTileSize: 0
tileSize: 256
accuratePlacement: 0
debug:
m_Flags: 0
m_NavMeshData: {fileID: 0}
--- !u!1 &106586301
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- component: {fileID: 106586304}
- component: {fileID: 106586303}
- component: {fileID: 106586302}
m_Layer: 0
m_Name: Agent
m_TagString: Untagged
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
--- !u!114 &106586302
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 106586301}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: c3d607733e457478885f15ee89725709, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 5000
hasUpgradedFromAgentParameters: 1
maxStep: 5000
--- !u!114 &106586303
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 106586301}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 5d1c4e0b1822b495aa52bc52839ecb30, type: 3}
m_Name:
m_EditorClassIdentifier:
m_BrainParameters:
vectorObservationSize: 1
numStackedVectorObservations: 1
vectorActionSize: 01000000
vectorActionDescriptions: []
vectorActionSpaceType: 0
m_Model: {fileID: 0}
m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: My Behavior
m_TeamID: 0
m_UseChildSensors: 1
--- !u!4 &106586304
Transform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 106586301}
m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
m_LocalPosition: {x: 0, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children:
- {fileID: 1471486645}
m_Father: {fileID: 0}
m_RootOrder: 2
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!1 &185701317
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- component: {fileID: 185701319}
- component: {fileID: 185701318}
m_Layer: 0
m_Name: Directional Light
m_TagString: Untagged
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
--- !u!108 &185701318
Light:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 185701317}
m_Enabled: 1
serializedVersion: 8
m_Type: 1
m_Color: {r: 1, g: 0.95686275, b: 0.8392157, a: 1}
m_Intensity: 1
m_Range: 10
m_SpotAngle: 30
m_CookieSize: 10
m_Shadows:
m_Type: 2
m_Resolution: -1
m_CustomResolution: -1
m_Strength: 1
m_Bias: 0.05
m_NormalBias: 0.4
m_NearPlane: 0.2
m_Cookie: {fileID: 0}
m_DrawHalo: 0
m_Flare: {fileID: 0}
m_RenderMode: 0
m_CullingMask:
serializedVersion: 2
m_Bits: 4294967295
m_Lightmapping: 4
m_LightShadowCasterMode: 0
m_AreaSize: {x: 1, y: 1}
m_BounceIntensity: 1
m_ColorTemperature: 6570
m_UseColorTemperature: 0
m_ShadowRadius: 0
m_ShadowAngle: 0
--- !u!4 &185701319
Transform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 185701317}
m_LocalRotation: {x: 0.40821788, y: -0.23456968, z: 0.10938163, w: 0.8754261}
m_LocalPosition: {x: 0, y: 3, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children: []
m_Father: {fileID: 0}
m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 50, y: -30, z: 0}
--- !u!1 &804630118
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- component: {fileID: 804630121}
- component: {fileID: 804630120}
- component: {fileID: 804630119}
m_Layer: 0
m_Name: Main Camera
m_TagString: MainCamera
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
--- !u!81 &804630119
AudioListener:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 804630118}
m_Enabled: 1
--- !u!20 &804630120
Camera:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 804630118}
m_Enabled: 1
serializedVersion: 2
m_ClearFlags: 1
m_BackGroundColor: {r: 0.19215687, g: 0.3019608, b: 0.4745098, a: 0}
m_projectionMatrixMode: 1
m_SensorSize: {x: 36, y: 24}
m_LensShift: {x: 0, y: 0}
m_GateFitMode: 2
m_FocalLength: 50
m_NormalizedViewPortRect:
serializedVersion: 2
x: 0
y: 0
width: 1
height: 1
near clip plane: 0.3
far clip plane: 1000
field of view: 60
orthographic: 0
orthographic size: 5
m_Depth: -1
m_CullingMask:
serializedVersion: 2
m_Bits: 4294967295
m_RenderingPath: -1
m_TargetTexture: {fileID: 0}
m_TargetDisplay: 0
m_TargetEye: 3
m_HDR: 1
m_AllowMSAA: 1
m_AllowDynamicResolution: 0
m_ForceIntoRT: 0
m_OcclusionCulling: 1
m_StereoConvergence: 10
m_StereoSeparation: 0.022
--- !u!4 &804630121
Transform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 804630118}
m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
m_LocalPosition: {x: 0, y: 1, z: -10}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children: []
m_Father: {fileID: 0}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!1 &1471486644
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- component: {fileID: 1471486645}
- component: {fileID: 1471486648}
- component: {fileID: 1471486647}
- component: {fileID: 1471486646}
m_Layer: 0
m_Name: Cube
m_TagString: Untagged
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
--- !u!4 &1471486645
Transform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1471486644}
m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
m_LocalPosition: {x: 0, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children: []
m_Father: {fileID: 106586304}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!65 &1471486646
BoxCollider:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1471486644}
m_Material: {fileID: 0}
m_IsTrigger: 0
m_Enabled: 1
serializedVersion: 2
m_Size: {x: 1, y: 1, z: 1}
m_Center: {x: 0, y: 0, z: 0}
--- !u!23 &1471486647
MeshRenderer:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1471486644}
m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1
m_RenderingLayerMask: 1
m_RendererPriority: 0
m_Materials:
- {fileID: 10303, guid: 0000000000000000f000000000000000, type: 0}
m_StaticBatchInfo:
firstSubMesh: 0
subMeshCount: 0
m_StaticBatchRoot: {fileID: 0}
m_ProbeAnchor: {fileID: 0}
m_LightProbeVolumeOverride: {fileID: 0}
m_ScaleInLightmap: 1
m_PreserveUVs: 0
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5
m_AutoUVMaxAngle: 89
m_LightmapParameters: {fileID: 0}
m_SortingLayerID: 0
m_SortingLayer: 0
m_SortingOrder: 0
--- !u!33 &1471486648
MeshFilter:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1471486644}
m_Mesh: {fileID: 10202, guid: 0000000000000000e000000000000000, type: 0}

7
com.unity.ml-agents/Tests/Runtime/SerializeTestScene.unity.meta


fileFormatVersion: 2
guid: 60783bd849bd242eeb66243542762b23
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

部分文件因为文件数量过多而无法显示

正在加载...
取消
保存