
Merge branch 'master' into merge-release-0.13.0

/release-0.13.0
GitHub, 4 years ago
Current commit
d985dded
66 files changed, with 1302 insertions and 969 deletions. Changed files (changed line count in parentheses):
  1. .circleci/config.yml (2)
  2. .gitignore (1)
  3. .pre-commit-config.yaml (2)
  4. UnitySDK/Assets/ML-Agents/Editor/DemonstrationImporter.cs (4)
  5. UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs (2)
  6. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/StackingSensorTests.cs (2)
  7. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/VectorSensorTests.cs (2)
  8. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/WriterAdapterTests.cs (12)
  9. UnitySDK/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs (7)
  10. UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity (673)
  11. UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (9)
  12. UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs (19)
  13. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs (5)
  14. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs (2)
  15. UnitySDK/Assets/ML-Agents/Scripts/Policy/BarracudaPolicy.cs (4)
  16. UnitySDK/Assets/ML-Agents/Scripts/Policy/RemotePolicy.cs (4)
  17. UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensor.cs (9)
  18. UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensorComponent.cs (3)
  19. UnitySDK/Assets/ML-Agents/Scripts/Sensor/ISensor.cs (4)
  20. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RayPerceptionSensor.cs (2)
  21. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensor.cs (11)
  22. UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensorComponent.cs (3)
  23. UnitySDK/Assets/ML-Agents/Scripts/Sensor/SensorBase.cs (6)
  24. UnitySDK/Assets/ML-Agents/Scripts/Sensor/StackingSensor.cs (7)
  25. UnitySDK/Assets/ML-Agents/Scripts/Sensor/VectorSensor.cs (2)
  26. UnitySDK/Assets/ML-Agents/Scripts/Sensor/WriteAdapter.cs (53)
  27. config/sac_trainer_config.yaml (71)
  28. config/trainer_config.yaml (72)
  29. docs/Learning-Environment-Create-New.md (5)
  30. docs/Migrating.md (11)
  31. docs/Training-ML-Agents.md (4)
  32. gym-unity/gym_unity/tests/test_gym.py (2)
  33. ml-agents-envs/mlagents_envs/exception.py (8)
  34. ml-agents-envs/mlagents_envs/rpc_utils.py (43)
  35. ml-agents-envs/mlagents_envs/tests/test_envs.py (2)
  36. ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (43)
  37. ml-agents/mlagents/trainers/agent_processor.py (84)
  38. ml-agents/mlagents/trainers/brain.py (32)
  39. ml-agents/mlagents/trainers/curriculum.py (58)
  40. ml-agents/mlagents/trainers/learn.py (78)
  41. ml-agents/mlagents/trainers/meta_curriculum.py (118)
  42. ml-agents/mlagents/trainers/ppo/trainer.py (9)
  43. ml-agents/mlagents/trainers/rl_trainer.py (14)
  44. ml-agents/mlagents/trainers/sac/trainer.py (9)
  45. ml-agents/mlagents/trainers/stats.py (13)
  46. ml-agents/mlagents/trainers/tests/mock_brain.py (2)
  47. ml-agents/mlagents/trainers/tests/test_agent_processor.py (41)
  48. ml-agents/mlagents/trainers/tests/test_bcmodule.py (2)
  49. ml-agents/mlagents/trainers/tests/test_curriculum.py (32)
  50. ml-agents/mlagents/trainers/tests/test_learn.py (11)
  51. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (56)
  52. ml-agents/mlagents/trainers/tests/test_multigpu.py (2)
  53. ml-agents/mlagents/trainers/tests/test_ppo.py (38)
  54. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2)
  55. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (39)
  56. ml-agents/mlagents/trainers/tests/test_sac.py (12)
  57. ml-agents/mlagents/trainers/tests/test_stats.py (2)
  58. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  59. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (28)
  60. ml-agents/mlagents/trainers/tests/test_trainer_util.py (2)
  61. ml-agents/mlagents/trainers/trainer.py (198)
  62. ml-agents/mlagents/trainers/trainer_controller.py (161)
  63. ml-agents/mlagents/trainers/trainer_util.py (6)
  64. test_requirements.txt (1)
  65. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs (105)
  66. UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs.meta (3)

2
.circleci/config.yml


. venv/bin/activate
mkdir test-reports
pip freeze > test-reports/pip_versions.txt
pytest --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
pytest -n 2 --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
- run:
name: Verify there are no hidden/missing metafiles.

1
.gitignore


/UnitySDK/Assets/AssetStoreTools*
/UnitySDK/Assets/Plugins*
/UnitySDK/Assets/Demonstrations*
/UnitySDK/csharp_timers.json
# Tensorflow Model Info
/models

2
.pre-commit-config.yaml


.*_pb2_grpc.py
)$
# flake8-tidy-imports is used for banned-modules, not actually tidying
additional_dependencies: [flake8-comprehensions, flake8-tidy-imports, flake8-bugbear]
additional_dependencies: [flake8-comprehensions==3.1.4, flake8-tidy-imports==4.0.0, flake8-bugbear==20.1.2]
- id: trailing-whitespace
name: trailing-whitespace-markdown
types: [markdown]

4
UnitySDK/Assets/ML-Agents/Editor/DemonstrationImporter.cs


var texture = (Texture2D)
AssetDatabase.LoadAssetAtPath(k_IconPath, typeof(Texture2D));
#if UNITY_2017_3_OR_NEWER
#else
ctx.SetMainAsset(ctx.assetPath, demonstration);
#endif
}
catch
{

2
UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs


sensorName = n;
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return new[] { 0 };
}

2
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/StackingSensorTests.cs


ISensor wrapped = new VectorSensor(4);
ISensor sensor = new StackingSensor(wrapped, 4);
Assert.AreEqual("StackingSensor_size4_VectorSensor_size4", sensor.GetName());
Assert.AreEqual(sensor.GetFloatObservationShape(), new [] {16});
Assert.AreEqual(sensor.GetObservationShape(), new [] {16});
}
[Test]

2
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/VectorSensorTests.cs


Assert.AreEqual(fill, output[0]);
WriteAdapter writer = new WriteAdapter();
writer.SetTarget(output, 0);
writer.SetTarget(output, sensor.GetObservationShape(), 0);
// Make sure WriteAdapter didn't touch anything
Assert.AreEqual(fill, output[0]);

12
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/WriterAdapterTests.cs


{
WriteAdapter writer = new WriteAdapter();
var buffer = new[] { 0f, 0f, 0f };
var shape = new[] { 3 };
writer.SetTarget(buffer, 0);
writer.SetTarget(buffer, shape, 0);
// Elementwise writes
writer[0] = 1f;
writer[2] = 2f;

writer.SetTarget(buffer, 1);
writer.SetTarget(buffer, shape, 1);
writer.SetTarget(buffer, 0);
writer.SetTarget(buffer, shape, 0);
writer.SetTarget(buffer, 1);
writer.SetTarget(buffer, shape, 1);
writer.AddRange(new [] {6f, 7f});
Assert.AreEqual(new[] { 4f, 6f, 7f }, buffer);
}

valueType = TensorProxy.TensorType.FloatingPoint,
data = new Tensor(2, 3)
};
writer.SetTarget(t, 0, 0);
Assert.AreEqual(0f, t.data[0, 0]);
writer[0] = 1f;

valueType = TensorProxy.TensorType.FloatingPoint,
data = new Tensor(2, 2, 2, 3)
};
var shape = new[] { 2, 2, 3 };
writer.SetTarget(t, 0, 0);
writer[1, 0, 1] = 1f;

7
UnitySDK/Assets/ML-Agents/Editor/Tests/StandaloneBuildTest.cs


using System;
using UnityEditor;
using UnityEngine;
#if UNITY_2018_1_OR_NEWER
#endif
namespace MLAgents
{

{
string[] scenes = { "Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity" };
var buildResult = BuildPipeline.BuildPlayer(scenes, "testPlayer", BuildTarget.StandaloneOSX, BuildOptions.None);
#if UNITY_2018_1_OR_NEWER
var isOk = buildResult.summary.result == BuildResult.Succeeded;
var error = "";
foreach (var stepInfo in buildResult.steps)

}
}
}
#else
var error = buildResult;
var isOk = string.IsNullOrEmpty(error);
#endif
if (isOk)
{
EditorApplication.Exit(0);

673
UnitySDK/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity
The file diff is too large to display.

9
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs


Debug.Assert(!sensors[i].GetName().Equals(sensors[i + 1].GetName()), "Sensor names must be unique.");
}
#endif
// Create a buffer for writing vector sensor data too
// Create a buffer for writing uncompressed (i.e. float) sensor data to
int numFloatObservations = 0;
for (var i = 0; i < sensors.Count; i++)
{

var sensor = sensors[i];
if (sensor.GetCompressionType() == SensorCompressionType.None)
{
// only handles 1D
m_WriteAdapter.SetTarget(m_VectorSensorBuffer, floatsWritten);
m_WriteAdapter.SetTarget(m_VectorSensorBuffer, sensor.GetObservationShape(), floatsWritten);
Shape = sensor.GetFloatObservationShape(),
Shape = sensor.GetObservationShape(),
CompressionType = sensor.GetCompressionType()
};
m_Info.observations.Add(floatObs);

var compressedObs = new Observation
{
CompressedData = sensor.GetCompressedObservation(),
Shape = sensor.GetFloatObservationShape(),
Shape = sensor.GetObservationShape(),
CompressionType = sensor.GetCompressionType()
};
m_Info.observations.Add(compressedObs);

19
UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs


var result = m_Client.Exchange(WrapMessage(unityOutput, 200));
unityInput = m_Client.Exchange(WrapMessage(null, 200)).UnityInput;
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
#else
EditorApplication.playmodeStateChanged += HandleOnPlayModeChanged;
#endif
#endif
return result.UnityInput;
#else

#endregion
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>

}
}
#else
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>
private void HandleOnPlayModeChanged()
{
// This method is run whenever the playmode state is changed.
if (!EditorApplication.isPlayingOrWillChangePlaymode)
{
Close();
}
}
#endif
#endif
}
}

5
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs


// Write each sensor consecutively to the tensor
foreach (var sensorIndex in m_SensorIndices)
{
m_WriteAdapter.SetTarget(tensorProxy, agentIndex, tensorOffset);
m_WriteAdapter.SetTarget(tensorProxy, agentIndex, tensorOffset);
var numWritten = sensor.Write(m_WriteAdapter);
tensorOffset += numWritten;
}

var agentIndex = 0;
foreach (var agent in agents)
{
var sensor = agent.sensors[m_SensorIndex];
agent.sensors[m_SensorIndex].Write(m_WriteAdapter);
sensor.Write(m_WriteAdapter);
agentIndex++;
}
}

2
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs


for (var sensorIndex = 0; sensorIndex < agent.sensors.Count; sensorIndex++)
{
var sensor = agent.sensors[sensorIndex];
var shape = sensor.GetFloatObservationShape();
var shape = sensor.GetObservationShape();
// TODO generalize - we currently only have vector or visual, but can't handle "2D" observations
var isVectorSensor = (shape.Length == 1);
if (isVectorSensor)

4
UnitySDK/Assets/ML-Agents/Scripts/Policy/BarracudaPolicy.cs


// First agent, save the sensor sizes
foreach (var sensor in agent.sensors)
{
m_SensorShapes.Add(sensor.GetFloatObservationShape());
m_SensorShapes.Add(sensor.GetObservationShape());
}
}
else

for (var i = 0; i < m_SensorShapes.Count; i++)
{
var cachedShape = m_SensorShapes[i];
var sensorShape = agent.sensors[i].GetFloatObservationShape();
var sensorShape = agent.sensors[i].GetObservationShape();
Debug.Assert(cachedShape.Length == sensorShape.Length, "Sensor dimensions must match.");
for (var j = 0; j < cachedShape.Length; j++)
{

4
UnitySDK/Assets/ML-Agents/Scripts/Policy/RemotePolicy.cs


// First agent, save the sensor sizes
foreach (var sensor in agent.sensors)
{
m_SensorShapes.Add(sensor.GetFloatObservationShape());
m_SensorShapes.Add(sensor.GetObservationShape());
}
}
else

for (var i = 0; i < m_SensorShapes.Count; i++)
{
var cachedShape = m_SensorShapes[i];
var sensorShape = agent.sensors[i].GetFloatObservationShape();
var sensorShape = agent.sensors[i].GetObservationShape();
Debug.Assert(cachedShape.Length == sensorShape.Length, "Sensor dimensions must match.");
for (var j = 0; j < cachedShape.Length; j++)
{

9
UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensor.cs


bool m_Grayscale;
string m_Name;
int[] m_Shape;
SensorCompressionType m_CompressionType;
public CameraSensor(Camera camera, int width, int height, bool grayscale, string name)
public CameraSensor(Camera camera, int width, int height, bool grayscale, string name,
SensorCompressionType compression)
{
m_Camera = camera;
m_Width = width;

m_Shape = new[] { height, width, grayscale ? 1 : 3 };
m_CompressionType = compression;
}
public string GetName()

public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.PNG;
return m_CompressionType;
}
/// <summary>

3
UnitySDK/Assets/ML-Agents/Scripts/Sensor/CameraSensorComponent.cs


public int width = 84;
public int height = 84;
public bool grayscale;
public SensorCompressionType compression = SensorCompressionType.PNG;
return new CameraSensor(camera, width, height, grayscale, sensorName);
return new CameraSensor(camera, width, height, grayscale, sensorName, compression);
}
public override int[] GetObservationShape()

4
UnitySDK/Assets/ML-Agents/Scripts/Sensor/ISensor.cs


/// A sensor that returns an RGB image would return new [] {Width, Height, 3}
/// </summary>
/// <returns></returns>
int[] GetFloatObservationShape();
int[] GetObservationShape();
/// <summary>
/// Write the observation data directly to the WriteAdapter.

/// <returns></returns>
public static int ObservationSize(this ISensor sensor)
{
var shape = sensor.GetFloatObservationShape();
var shape = sensor.GetObservationShape();
int count = 1;
for (var i = 0; i < shape.Length; i++)
{

2
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RayPerceptionSensor.cs


{
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

11
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensor.cs


bool m_Grayscale;
string m_Name;
int[] m_Shape;
SensorCompressionType m_CompressionType;
public RenderTextureSensor(RenderTexture renderTexture, bool grayscale, string name)
public RenderTextureSensor(RenderTexture renderTexture, bool grayscale, string name,
SensorCompressionType compressionType)
{
m_RenderTexture = renderTexture;
var width = renderTexture != null ? renderTexture.width : 0;

m_Shape = new[] { height, width, grayscale ? 1 : 3 };
m_CompressionType = compressionType;
}
public string GetName()

public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.PNG;
return m_CompressionType;
/// Converts a RenderTexture and correspinding resolution to a 2D texture.
/// Converts a RenderTexture to a 2D texture.
/// </summary>
/// <returns>The 2D texture.</returns>
/// <param name="obsTexture">RenderTexture.</param>

3
UnitySDK/Assets/ML-Agents/Scripts/Sensor/RenderTextureSensorComponent.cs


public RenderTexture renderTexture;
public string sensorName = "RenderTextureSensor";
public bool grayscale;
public SensorCompressionType compression = SensorCompressionType.PNG;
return new RenderTextureSensor(renderTexture, grayscale, sensorName);
return new RenderTextureSensor(renderTexture, grayscale, sensorName, compression);
}
public override int[] GetObservationShape()

6
UnitySDK/Assets/ML-Agents/Scripts/Sensor/SensorBase.cs


{
/// <summary>
/// Write the observations to the output buffer. This size of the buffer will be product of the sizes returned
/// by GetFloatObservationShape().
/// by GetObservationShape().
public abstract int[] GetFloatObservationShape();
public abstract int[] GetObservationShape();
public abstract string GetName();

/// <param name="adapter"></param>
public virtual int Write(WriteAdapter adapter)
{
// TODO reuse buffer for similar agents, don't call GetFloatObservationShape()
// TODO reuse buffer for similar agents, don't call GetObservationShape()
var numFloats = this.ObservationSize();
float[] buffer = new float[numFloats];
WriteObservation(buffer);

7
UnitySDK/Assets/ML-Agents/Scripts/Sensor/StackingSensor.cs


m_Name = $"StackingSensor_size{numStackedObservations}_{wrapped.GetName()}";
var shape = wrapped.GetFloatObservationShape();
var shape = wrapped.GetObservationShape();
m_Shape = new int[shape.Length];
m_UnstackedObservationSize = wrapped.ObservationSize();

public int Write(WriteAdapter adapter)
{
// First, call the wrapped sensor's write method. Make sure to use our own adapter, not the passed one.
m_LocalAdapter.SetTarget(m_StackedObservations[m_CurrentIndex], 0);
var wrappedShape = m_WrappedSensor.GetObservationShape();
m_LocalAdapter.SetTarget(m_StackedObservations[m_CurrentIndex], wrappedShape, 0);
m_WrappedSensor.Write(m_LocalAdapter);
// Now write the saved observations (oldest first)

m_CurrentIndex = (m_CurrentIndex + 1) % m_NumStackedObservations;
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

2
UnitySDK/Assets/ML-Agents/Scripts/Sensor/VectorSensor.cs


Clear();
}
public int[] GetFloatObservationShape()
public int[] GetObservationShape()
{
return m_Shape;
}

53
UnitySDK/Assets/ML-Agents/Scripts/Sensor/WriteAdapter.cs


using System;
using Barracuda;
using MLAgents.InferenceBrain;
namespace MLAgents.Sensor

TensorProxy m_Proxy;
int m_Batch;
TensorShape m_TensorShape;
/// <param name="data"></param>
/// <param name="offset"></param>
public void SetTarget(IList<float> data, int offset)
/// <param name="data">Float array or list that will be written to.</param>
/// <param name="shape">Shape of the observations to be written.</param>
/// <param name="offset">Offset from the start of the float data to write to.</param>
public void SetTarget(IList<float> data, int[] shape, int offset)
m_Batch = -1;
m_Batch = 0;
if (shape.Length == 1)
{
m_TensorShape = new TensorShape(m_Batch, shape[0]);
}
else
{
m_TensorShape = new TensorShape(m_Batch, shape[0], shape[1], shape[2]);
}
/// <param name="tensorProxy"></param>
/// <param name="batchIndex"></param>
/// <param name="channelOffset"></param>
/// <param name="tensorProxy">Tensor proxy that will be written to.</param>
/// <param name="batchIndex">Batch index in the tensor proxy (i.e. the index of the Agent)</param>
/// <param name="channelOffset">Offset from the start of the channel to write to.</param>
public void SetTarget(TensorProxy tensorProxy, int batchIndex, int channelOffset)
{
m_Proxy = tensorProxy;

m_TensorShape = m_Proxy.data.shape;
}
/// <summary>

{
set
{
// Only TensorProxy supports 3D access
m_Proxy.data[m_Batch, h, w, ch + m_Offset] = value;
if (m_Data != null)
{
if (h < 0 || h >= m_TensorShape.height)
{
throw new IndexOutOfRangeException($"height value {h} must be in range [0, {m_TensorShape.height-1}]");
}
if (w < 0 || w >= m_TensorShape.width)
{
throw new IndexOutOfRangeException($"width value {w} must be in range [0, {m_TensorShape.width-1}]");
}
if (ch < 0 || ch >= m_TensorShape.channels)
{
throw new IndexOutOfRangeException($"channel value {ch} must be in range [0, {m_TensorShape.channels-1}]");
}
var index = m_TensorShape.Index(m_Batch, h, w, ch + m_Offset);
m_Data[index] = value;
}
else
{
m_Proxy.data[m_Batch, h, w, ch + m_Offset] = value;
}
}
}

71
config/sac_trainer_config.yaml


init_entcoef: 1.0
learning_rate: 3.0e-4
learning_rate_schedule: constant
max_steps: 5.0e4
max_steps: 5.0e5
memory_size: 256
normalize: false
num_update: 1

sequence_length: 64
summary_freq: 1000
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple

normalize: false
batch_size: 256
buffer_size: 500000
max_steps: 1.0e5
max_steps: 2.0e6
max_steps: 5.0e5
max_steps: 2.0e7
summary_freq: 1000
summary_freq: 20000
max_steps: 5.0e4
max_steps: 1.5e7
summary_freq: 2000
summary_freq: 60000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
num_layers: 2
init_entcoef: 0.1

max_steps: 5.0e5
max_steps: 5.0e6
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

max_steps: 5.0e5
max_steps: 5.0e6
summary_freq: 2000
summary_freq: 20000
time_horizon: 128
init_entcoef: 0.1
num_layers: 2

summary_freq: 2000
summary_freq: 30000
time_horizon: 128
batch_size: 128
buffer_init_steps: 10000

init_entcoef: 0.01
max_steps: 5.0e5
max_steps: 1.0e7
sequence_length: 16
tau: 0.01
use_recurrent: false

hidden_units: 256
buffer_init_steps: 1000
num_layers: 1
max_steps: 5.0e5
max_steps: 1.0e7
buffer_size: 500000
init_entcoef: 0.01
tau: 0.01

normalize: true
batch_size: 64
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
hidden_units: 64
init_entcoef: 0.5

batch_size: 256
summary_freq: 1000
summary_freq: 12000
max_steps: 2e5
max_steps: 4e6
CrawlerStatic:
normalize: true

buffer_size: 500000
buffer_init_steps: 2000
max_steps: 5e5
summary_freq: 3000
max_steps: 5e6
summary_freq: 30000
init_entcoef: 1.0
num_layers: 3
hidden_units: 512

time_horizon: 1000
batch_size: 256
buffer_size: 500000
summary_freq: 3000
summary_freq: 30000
max_steps: 1e6
max_steps: 1e7
hidden_units: 512
reward_signals:
extrinsic:

time_horizon: 1000
batch_size: 256
buffer_size: 500000
max_steps: 2e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 30000
num_layers: 4
train_interval: 2
hidden_units: 512

time_horizon: 1000
batch_size: 128
buffer_size: 500000
max_steps: 2e5
summary_freq: 3000
max_steps: 2e7
summary_freq: 60000
Hallway:
sequence_length: 32

init_entcoef: 0.1
max_steps: 5.0e5
max_steps: 1.0e7
summary_freq: 1000
time_horizon: 64
use_recurrent: true

memory_size: 256
gamma: 0.99
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
time_horizon: 64
use_recurrent: true

gamma: 0.99
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 3.0e6
summary_freq: 60000
time_horizon: 64
GridWorld:

init_entcoef: 0.5
buffer_init_steps: 1000
buffer_size: 50000
max_steps: 50000
summary_freq: 2000
max_steps: 500000
summary_freq: 20000
time_horizon: 5
reward_signals:
extrinsic:

72
config/trainer_config.yaml


lambd: 0.95
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e4
max_steps: 5.0e5
memory_size: 256
normalize: false
num_epoch: 3

summary_freq: 1000
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:

beta: 5.0e-3
batch_size: 1024
buffer_size: 10240
max_steps: 1.0e5
max_steps: 2.0e6
max_steps: 1.0e6
max_steps: 2.0e7
max_steps: 5.0e4
max_steps: 1.5e7
summary_freq: 2000
summary_freq: 60000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
max_steps: 1.0e6
max_steps: 3e7
summary_freq: 2000
summary_freq: 20000
max_steps: 5.0e5
max_steps: 5.0e6
learning_rate: 1e-3
batch_size: 128
num_epoch: 3

summary_freq: 2000
summary_freq: 20000
max_steps: 5.0e5
max_steps: 5.0e6
learning_rate: 1e-3
batch_size: 320
num_epoch: 3

summary_freq: 2000
summary_freq: 20000
summary_freq: 2000
summary_freq: 30000
time_horizon: 128
batch_size: 128
buffer_size: 2048

max_steps: 5.0e5
max_steps: 1.0e7
num_epoch: 3
reward_signals:
extrinsic:

hidden_units: 256
num_layers: 1
beta: 1.0e-2
max_steps: 5.0e5
max_steps: 1.0e7
num_epoch: 3
reward_signals:
extrinsic:

normalize: true
batch_size: 64
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
lambd: 0.99
beta: 0.001

batch_size: 1200
buffer_size: 12000
summary_freq: 1000
summary_freq: 12000
time_horizon: 1000
max_steps: 5.0e5
beta: 0.001

Tennis:
normalize: true
max_steps: 2e5
max_steps: 4e6
CrawlerStatic:
normalize: true

buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 1e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2024
buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 1e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2048
buffer_size: 20480
max_steps: 2e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 30000
num_layers: 3
hidden_units: 512
reward_signals:

time_horizon: 1000
batch_size: 2024
buffer_size: 20240
max_steps: 1e6
summary_freq: 3000
max_steps: 2e7
summary_freq: 60000
reward_signals:
extrinsic:
strength: 1.0

num_epoch: 3
buffer_size: 1024
batch_size: 128
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
summary_freq: 10000
time_horizon: 64
VisualHallway:

num_epoch: 3
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 1.0e7
summary_freq: 10000
time_horizon: 64
VisualPushBlock:

num_epoch: 3
buffer_size: 1024
batch_size: 64
max_steps: 5.0e5
summary_freq: 1000
max_steps: 3.0e6
summary_freq: 60000
time_horizon: 64
GridWorld:

hidden_units: 256
beta: 5.0e-3
buffer_size: 256
max_steps: 50000
summary_freq: 2000
max_steps: 500000
summary_freq: 20000
time_horizon: 5
reward_signals:
extrinsic:

5
docs/Learning-Environment-Create-New.md


3. In a file system window, navigate to the folder containing your cloned
ML-Agents repository.
4. Drag the `ML-Agents` folder from `UnitySDK/Assets` to the Unity
Editor Project window.
Editor Project window. If you see console errors about Barracuda, make sure
you've installed Barracuda from the Unity Package Manager. More information
can be found in the [installation instructions](Installation.md) under
**Package Installation**.
Your Unity **Project** window should contain the following assets:

11
docs/Migrating.md


# Migrating
## Migrating from 0.12 to 0.13
## Migrating from 0.13 to latest
### Important changes
* Trainer steps are now counted per-Agent, not per-environment as in previous versions. For instance, if you have 10 Agents in the scene, 20 environment steps now correspond to 200 steps as printed in the terminal and in Tensorboard.
### Steps to Migrate
* Multiply `max_steps` and `summary_freq` in your `trainer_config.yaml` by the number of Agents in the scene.
## Migrating from ML-Agents toolkit v0.12.0 to v0.13.0
### Important changes
* The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. This should only affect you if you're writing a custom trainer; if you use `mlagents-learn` for training, this should be a transparent change.
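
A rough worked example of the per-Agent step-count migration noted above (a sketch; the Agent count and starting values are illustrative, not taken from this commit):

# Per-Agent step counting: scale the old per-environment values by the number of Agents.
num_agents = 12                                    # example: a scene with 12 Agents
old = {"max_steps": 5.0e4, "summary_freq": 1000}   # example pre-migration values
new = {key: value * num_agents for key, value in old.items()}
# -> {"max_steps": 6.0e5, "summary_freq": 12000}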

4
docs/Training-ML-Agents.md


the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
* `--lesson=<n>`: Specify which lesson to start with when performing curriculum
training. Defaults to 0.
* `--num-runs=<n>`: Sets the number of concurrent training sessions to perform.
Default is set to 1. Set to higher values when benchmarking performance and
multiple training sessions are desired. Training sessions are independent, and
do not improve learning performance.
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
collect experiences from when training. Defaults to 1.
* `--run-id=<path>`: Specifies an identifier for each training run. This

2
gym-unity/gym_unity/tests/test_gym.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

8
ml-agents-envs/mlagents_envs/exception.py


pass
class UnityObservationException(UnityException):
"""
Related to errors with receiving observations.
"""
pass
class UnityActionException(UnityException):
"""
Related to errors with sending actions.

43
ml-agents-envs/mlagents_envs/rpc_utils.py


from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
from mlagents_envs.exception import UnityObservationException
from mlagents_envs.communicator_objects.observation_pb2 import (
ObservationProto,
NONE as COMPRESSION_NONE,
)
from typing import cast, List, Tuple, Union, Collection
from typing import cast, List, Tuple, Union, Collection, Optional, Iterable
from PIL import Image
logger = logging.getLogger("mlagents_envs")

image = Image.open(io.BytesIO(image_bytearray))
# Normally Image loads lazily, this forces it to do loading in the timer scope.
image.load()
s = np.array(image) / 255.0
s = np.array(image, dtype=np.float32) / 255.0
if gray_scale:
s = np.mean(s, axis=2)
s = np.reshape(s, [s.shape[0], s.shape[1], 1])

@timed
def observation_to_np_array(
obs: ObservationProto, expected_shape: Optional[Iterable[int]] = None
) -> np.ndarray:
"""
Converts observation proto into numpy array of the appropriate size.
:param obs: observation proto to be converted
:param expected_shape: optional shape information, used for sanity checks.
:return: processed numpy array of observation from environment
"""
if expected_shape is not None:
if list(obs.shape) != list(expected_shape):
raise UnityObservationException(
f"Observation did not have the expected shape - got {obs.shape} but expected {expected_shape}"
)
gray_scale = obs.shape[2] == 1
if obs.compression_type == COMPRESSION_NONE:
img = np.array(obs.float_data.data, dtype=np.float32)
img = np.reshape(img, obs.shape)
return img
else:
img = process_pixels(obs.compressed_data, gray_scale)
# Compare decompressed image size to observation shape and make sure they match
if list(obs.shape) != list(img.shape):
raise UnityObservationException(
f"Decompressed observation did not have the expected shape - "
f"decompressed had {img.shape} but expected {obs.shape}"
)
return img
@timed
def _process_visual_observation(
obs_index: int,
shape: Tuple[int, int, int],

if len(agent_info_list) == 0:
return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32)
gray_scale = shape[2] == 1
process_pixels(agent_obs.observations[obs_index].compressed_data, gray_scale)
observation_to_np_array(agent_obs.observations[obs_index], shape)
for agent_obs in agent_info_list
]
return np.array(batched_visual, dtype=np.float32)
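
A small usage sketch of the observation_to_np_array helper added above (assumes the imports shown in this file; the 84x84 RGB shape is just an example):

import numpy as np
from mlagents_envs.communicator_objects.observation_pb2 import ObservationProto, NONE
from mlagents_envs.rpc_utils import observation_to_np_array

# Build an uncompressed observation proto and round-trip it through the helper.
arr = np.random.rand(84, 84, 3).astype(np.float32)
obs = ObservationProto()
obs.float_data.data.extend(arr.flatten().tolist())
obs.compression_type = NONE
obs.shape.extend(arr.shape)

decoded = observation_to_np_array(obs, expected_shape=(84, 84, 3))
assert decoded.shape == (84, 84, 3)
# A mismatched expected_shape raises UnityObservationException instead of returning.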

2
ml-agents-envs/mlagents_envs/tests/test_envs.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

43
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


import io
import numpy as np
import pytest
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents_envs.communicator_objects.observation_pb2 import (
ObservationProto,

from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import numpy as np
import io
from mlagents_envs.exception import UnityObservationException
from mlagents_envs.rpc_utils import (
agent_group_spec_from_proto,
process_pixels,

return obs_proto
def generate_uncompressed_proto_obs(in_array: np.ndarray) -> ObservationProto:
obs_proto = ObservationProto()
obs_proto.float_data.data.extend(in_array.flatten().tolist())
obs_proto.compression_type = NONE
obs_proto.shape.extend(in_array.shape)
return obs_proto
in_array = np.random.rand(128, 128, 3)
in_array = np.random.rand(128, 64, 3)
assert out_array.shape == (128, 128, 3)
assert out_array.shape == (128, 64, 3)
in_array = np.random.rand(128, 128, 3)
in_array = np.random.rand(128, 64, 3)
assert out_array.shape == (128, 128, 1)
assert out_array.shape == (128, 64, 1)
assert np.mean(in_array.mean(axis=2, keepdims=True) - out_array) < 0.01
assert (in_array.mean(axis=2, keepdims=True) - out_array < 0.01).all()

def test_process_visual_observation():
in_array_1 = np.random.rand(128, 128, 3)
in_array_1 = np.random.rand(128, 64, 3)
in_array_2 = np.random.rand(128, 128, 3)
proto_obs_2 = generate_compressed_proto_obs(in_array_2)
in_array_2 = np.random.rand(128, 64, 3)
proto_obs_2 = generate_uncompressed_proto_obs(in_array_2)
arr = _process_visual_observation(0, (128, 128, 3), ap_list)
assert list(arr.shape) == [2, 128, 128, 3]
arr = _process_visual_observation(0, (128, 64, 3), ap_list)
assert list(arr.shape) == [2, 128, 64, 3]
def test_process_visual_observation_bad_shape():
in_array_1 = np.random.rand(128, 64, 3)
proto_obs_1 = generate_compressed_proto_obs(in_array_1)
ap1 = AgentInfoProto()
ap1.observations.extend([proto_obs_1])
ap_list = [ap1]
with pytest.raises(UnityObservationException):
_process_visual_observation(0, (128, 42, 3), ap_list)
def test_batched_step_result_from_proto():

84
ml-agents/mlagents/trainers/agent_processor.py


import sys
from typing import List, Dict
from collections import defaultdict, Counter
from typing import List, Dict, Deque, TypeVar, Generic
from collections import defaultdict, Counter, deque
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.policy import Policy
T = TypeVar("T")
class AgentProcessor:
"""

def __init__(
self,
trainer: Trainer,
policy: TFPolicy,
behavior_id: str,
stats_reporter: StatsReporter,

self.episode_steps: Counter = Counter()
self.episode_rewards: Dict[str, float] = defaultdict(float)
self.stats_reporter = stats_reporter
self.trainer = trainer
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.behavior_id = behavior_id
def add_experiences(

next_obs=next_obs,
behavior_id=self.behavior_id,
)
# This will eventually be replaced with a queue
self.trainer.process_trajectory(trajectory)
for traj_queue in self.trajectory_queues:
traj_queue.put(trajectory)
self.experience_buffers[agent_id] = []
if next_info.local_done[next_idx]:
self.stats_reporter.add_stat(

del self.episode_rewards[agent_id]
elif not next_info.local_done[next_idx]:
self.episode_steps[agent_id] += 1
def publish_trajectory_queue(
self, trajectory_queue: "AgentManagerQueue[Trajectory]"
) -> None:
"""
Adds a trajectory queue to the list of queues to publish to when this AgentProcessor
assembles a Trajectory
:param trajectory_queue: Trajectory queue to publish to.
"""
self.trajectory_queues.append(trajectory_queue)
class AgentManagerQueue(Generic[T]):
"""
Queue used by the AgentManager. Note that we make our own class here because in most implementations
deque is sufficient and faster. However, if we want to switch to multiprocessing, we'll need to change
out this implementation.
"""
class Empty(Exception):
"""
Exception for when the queue is empty.
"""
pass
def __init__(self, behavior_id: str):
"""
Initializes an AgentManagerQueue. Note that we can give it a behavior_id so that it can be identified
separately from an AgentManager.
"""
self.queue: Deque[T] = deque()
self.behavior_id = behavior_id
def empty(self) -> bool:
return len(self.queue) == 0
def get_nowait(self) -> T:
try:
return self.queue.popleft()
except IndexError:
raise self.Empty("The AgentManagerQueue is empty.")
def put(self, item: T) -> None:
self.queue.append(item)
class AgentManager(AgentProcessor):
"""
An AgentManager is an AgentProcessor that also holds a single trajectory and policy queue.
Note: this leaves room for adding AgentProcessors that publish multiple trajectory queues.
"""
def __init__(
self,
policy: TFPolicy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
):
super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length)
self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
self.behavior_id
)
self.policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue(
self.behavior_id
)
self.publish_trajectory_queue(self.trajectory_queue)
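
A minimal usage sketch of the AgentManagerQueue added above (names taken from the diff; the behavior id string is arbitrary):

from mlagents.trainers.agent_processor import AgentManagerQueue

queue = AgentManagerQueue(behavior_id="3DBallLearning")
queue.put(42)
assert not queue.empty()
assert queue.get_nowait() == 42
try:
    queue.get_nowait()           # nothing left to read
except AgentManagerQueue.Empty:
    pass                         # raised when the queue is empty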

32
ml-agents/mlagents/trainers/brain.py


import logging
import numpy as np
import io
from mlagents_envs.timers import hierarchical_timer, timed
from mlagents_envs.timers import timed
from mlagents_envs import rpc_utils
from PIL import Image
logger = logging.getLogger("mlagents.trainers")

@staticmethod
@timed
def process_pixels(image_bytes: bytes, gray_scale: bool) -> np.ndarray:
"""
Converts byte array observation image into numpy array, re-sizes it,
and optionally converts it to grey scale
:param gray_scale: Whether to convert the image to grayscale.
:param image_bytes: input byte array corresponding to image
:return: processed numpy array of observation from environment
"""
with hierarchical_timer("image_decompress"):
image_bytearray = bytearray(image_bytes)
image = Image.open(io.BytesIO(image_bytearray))
# Normally Image loads lazily, this forces it to do loading in the timer scope.
image.load()
s = np.array(image) / 255.0
if gray_scale:
s = np.mean(s, axis=2)
s = np.reshape(s, [s.shape[0], s.shape[1], 1])
return s
@staticmethod
@timed
def from_agent_proto(
worker_id: int,
agent_info_list: Collection[

vis_obs: List[np.ndarray] = []
for i in range(brain_params.number_visual_observations):
# TODO check compression type, handle uncompressed visuals
BrainInfo.process_pixels(
agent_obs[i].compressed_data,
brain_params.camera_resolutions[i].gray_scale,
rpc_utils.observation_to_np_array(
agent_obs[i], brain_params.camera_resolutions[i]
)
for agent_obs in visual_observation_protos
]

58
ml-agents/mlagents/trainers/curriculum.py


import os
import json
import math
from typing import Dict, Any, TextIO

logger = logging.getLogger("mlagents.trainers")
class Curriculum(object):
def __init__(self, location):
class Curriculum:
def __init__(self, brain_name: str, config: Dict):
:param location: Path to JSON defining curriculum.
:param brain_name: Name of the brain this Curriculum is associated with
:param config: Dictionary of fields needed to configure the Curriculum
# The name of the brain should be the basename of the file without the
# extension.
self._brain_name = os.path.basename(location).split(".")[0]
self.data = Curriculum.load_curriculum_file(location)
self.brain_name = brain_name
self.config = config
self.smoothing_value = 0
self.smoothing_value = 0.0
for key in [
"parameters",
"measure",

]:
if key not in self.data:
if key not in self.config:
"{0} does not contain a " "{1} field.".format(location, key)
f"{brain_name} curriculum config does not contain a {key} field."
self.measure = self.data["measure"]
self.min_lesson_length = self.data["min_lesson_length"]
self.max_lesson_num = len(self.data["thresholds"])
self.measure = self.config["measure"]
self.min_lesson_length = self.config["min_lesson_length"]
self.max_lesson_num = len(self.config["thresholds"])
parameters = self.data["parameters"]
parameters = self.config["parameters"]
"The parameter {0} in Curriculum {1} must have {2} values "
"but {3} were found".format(
key, location, self.max_lesson_num + 1, len(parameters[key])
)
f"The parameter {key} in {brain_name}'s curriculum must have {self.max_lesson_num + 1} values "
f"but {len(parameters[key])} were found"
)
@property

steps completed).
:return Whether the lesson was incremented.
"""
if not self.data or not measure_val or math.isnan(measure_val):
if not self.config or not measure_val or math.isnan(measure_val):
if self.data["signal_smoothing"]:
if self.config["signal_smoothing"]:
if measure_val > self.data["thresholds"][self.lesson_num]:
if measure_val > self.config["thresholds"][self.lesson_num]:
parameters = self.data["parameters"]
parameters = self.config["parameters"]
self._brain_name,
self.brain_name,
self.lesson_num,
", ".join([str(x) + " -> " + str(config[x]) for x in config]),
)

current lesson is returned.
:return: The configuration of the reset parameters.
"""
if not self.data:
if not self.config:
parameters = self.data["parameters"]
parameters = self.config["parameters"]
def load_curriculum_file(location: str) -> None:
def load_curriculum_file(config_path: str) -> Dict:
with open(location) as data_file:
with open(config_path) as data_file:
"The file {0} could not be found.".format(location)
"The file {0} could not be found.".format(config_path)
"There was an error decoding {}".format(location)
"There was an error decoding {}".format(config_path)
def _load_curriculum(fp: TextIO) -> None:
def _load_curriculum(fp: TextIO) -> Dict:
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:
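
A sketch of the new Curriculum construction path shown above (brain name plus an already-loaded config dict); the config values are illustrative, not from this commit:

from mlagents.trainers.curriculum import Curriculum

config = {
    "measure": "reward",
    "thresholds": [0.5, 0.75],
    "min_lesson_length": 100,
    "signal_smoothing": True,
    "parameters": {"difficulty": [1.0, 2.0, 3.0]},  # len(thresholds) + 1 values per parameter
}
curriculum = Curriculum("WallJumpLearning", config)

# The old path-based constructor is gone; loading from JSON now goes through
# Curriculum.load_curriculum_file(config_path), and the resulting dict is passed in as above.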

78
ml-agents/mlagents/trainers/learn.py


import logging
import argparse
from multiprocessing import Process, Queue
import os
import glob
import shutil

import mlagents_envs
from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.exception import TrainerError
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import load_config, TrainerFactory
from mlagents.trainers.stats import TensorboardWriter, CSVWriter, StatsReporter

class CommandLineOptions(NamedTuple):
debug: bool
num_runs: int
seed: int
env_path: str
run_id: str

help="The directory name for model and summary statistics",
)
parser.add_argument(
"--num-runs", default=1, type=int, help="Number of concurrent training sessions"
)
parser.add_argument(
"--save-freq", default=50000, type=int, help="Frequency at which to save model"
)
parser.add_argument(

return CommandLineOptions.from_argparse(args)
def run_training(
sub_id: int, run_seed: int, options: CommandLineOptions, process_queue: Queue
) -> None:
def run_training(run_seed: int, options: CommandLineOptions) -> None:
:param process_queue: Queue used to send signal back to main.
:param sub_id: Unique id for training session.
:param options: parsed command line arguments
:param run_seed: Random seed used for training.
:param run_options: Command line arguments for training.

curriculum_folder = options.curriculum_folder
# Recognize and use docker volume if one is passed as an argument
if not options.docker_target_name:
model_path = "./models/{run_id}-{sub_id}".format(
run_id=options.run_id, sub_id=sub_id
)
model_path = f"./models/{options.run_id}"
trainer_config_path = "/{docker_target_name}/{trainer_config_path}".format(
docker_target_name=options.docker_target_name,
trainer_config_path=trainer_config_path,
)
trainer_config_path = f"/{options.docker_target_name}/{trainer_config_path}"
curriculum_folder = "/{docker_target_name}/{curriculum_folder}".format(
docker_target_name=options.docker_target_name,
curriculum_folder=curriculum_folder,
)
model_path = "/{docker_target_name}/models/{run_id}-{sub_id}".format(
docker_target_name=options.docker_target_name,
run_id=options.run_id,
sub_id=sub_id,
)
summaries_dir = "/{docker_target_name}/summaries".format(
docker_target_name=options.docker_target_name
)
curriculum_folder = f"/{options.docker_target_name}/{curriculum_folder}"
model_path = f"/{options.docker_target_name}/models/{options.run_id}"
summaries_dir = f"/{options.docker_target_name}/summaries"
port = options.base_port + (sub_id * options.num_envs)
port = options.base_port
# Configure CSV, Tensorboard Writers and StatsReporter
# We assume reward and episode length are needed in the CSV.

trainer_factory,
model_path,
summaries_dir,
options.run_id + "-" + str(sub_id),
options.run_id,
options.save_freq,
maybe_meta_curriculum,
options.train_model,

)
# Signal that environment has been launched.
process_queue.put(True)
# Begin training
try:
tc.start_learning(env_manager)

return None
else:
meta_curriculum = MetaCurriculum(curriculum_folder)
meta_curriculum = MetaCurriculum.from_directory(curriculum_folder)
meta_curriculum.set_all_curriculums_to_lesson_num(lesson)
meta_curriculum.set_all_curricula_to_lesson_num(lesson)
return meta_curriculum

else:
# disable noisy warnings from tensorflow.
tf_utils.set_warnings_enabled(False)
if options.env_path is None and options.num_runs > 1:
raise TrainerError(
"It is not possible to launch more than one concurrent training session "
"when training from the editor."
)
jobs = []
if options.num_runs == 1:
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(0, run_seed, options, Queue())
else:
for i in range(options.num_runs):
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
process_queue = Queue()
p = Process(target=run_training, args=(i, run_seed, options, process_queue))
jobs.append(p)
p.start()
# Wait for signal that environment has successfully launched
while process_queue.get() is not True:
continue
# Wait for jobs to complete. Otherwise we'll have an extra
# unhandled KeyboardInterrupt if we end early.
try:
for job in jobs:
job.join()
except KeyboardInterrupt:
pass
if options.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(run_seed, options)
# For python debugger to directly run this script

118
ml-agents/mlagents/trainers/meta_curriculum.py


logger = logging.getLogger("mlagents.trainers")
class MetaCurriculum(object):
"""A MetaCurriculum holds curriculums. Each curriculum is associated to a
class MetaCurriculum:
"""A MetaCurriculum holds curricula. Each curriculum is associated to a
def __init__(self, curriculum_folder: str):
def __init__(self, curricula: Dict[str, Curriculum]):
Args:
curriculum_folder (str): The relative or absolute path of the
folder which holds the curriculums for this environment.
The folder should contain JSON files whose names are the
brains that the curriculums belong to.
default_reset_parameters (dict): The default reset parameters
of the environment.
:param curriculum_folder: Dictionary of brain_name to the
Curriculum for each brain.
self._brains_to_curriculums: Dict[str, Curriculum] = {}
self._brains_to_curricula: Dict[str, Curriculum] = {}
for brain_name, curriculum in curricula.items():
self._brains_to_curricula[brain_name] = curriculum
config_keys: Set[str] = set(curriculum.get_config().keys())
try:
for curriculum_filename in os.listdir(curriculum_folder):
# This process requires JSON files
brain_name, extension = os.path.splitext(curriculum_filename)
if extension.lower() != ".json":
continue
curriculum_filepath = os.path.join(
curriculum_folder, curriculum_filename
# Check if any two curricula use the same reset params.
if config_keys & used_reset_parameters:
logger.warning(
"Two or more curricula will "
"attempt to change the same reset "
"parameter. The result will be "
"non-deterministic."
curriculum = Curriculum(curriculum_filepath)
config_keys: Set[str] = set(curriculum.get_config().keys())
# Check if any two curriculums use the same reset params.
if config_keys & used_reset_parameters:
logger.warning(
"Two or more curriculums will "
"attempt to change the same reset "
"parameter. The result will be "
"non-deterministic."
)
used_reset_parameters.update(config_keys)
self._brains_to_curriculums[brain_name] = curriculum
except NotADirectoryError:
raise MetaCurriculumError(
curriculum_folder + " is not a "
"directory. Refer to the ML-Agents "
"curriculum learning docs."
)
used_reset_parameters.update(config_keys)
def brains_to_curriculums(self):
def brains_to_curricula(self):
return self._brains_to_curriculums
return self._brains_to_curricula
for brain_name, curriculum in self.brains_to_curriculums.items():
for brain_name, curriculum in self.brains_to_curricula.items():
lesson_nums[brain_name] = curriculum.lesson_num
return lesson_nums

for brain_name, lesson in lesson_nums.items():
self.brains_to_curriculums[brain_name].lesson_num = lesson
self.brains_to_curricula[brain_name].lesson_num = lesson
def _lesson_ready_to_increment(
self, brain_name: str, reward_buff_size: int

Whether the curriculum of the specified brain should attempt to
increment its lesson.
"""
if brain_name not in self.brains_to_curriculums:
if brain_name not in self.brains_to_curricula:
self.brains_to_curriculums[brain_name].min_lesson_length
self.brains_to_curricula[brain_name].min_lesson_length
"""Attempts to increments all the lessons of all the curriculums in this
"""Attempts to increments all the lessons of all the curricula in this
MetaCurriculum. Note that calling this method does not guarantee the
lesson of a curriculum will increment. The lesson of a curriculum will
only increment if the specified measure threshold defined in the

for brain_name, buff_size in reward_buff_sizes.items():
if self._lesson_ready_to_increment(brain_name, buff_size):
measure_val = measure_vals[brain_name]
ret[brain_name] = self.brains_to_curriculums[
ret[brain_name] = self.brains_to_curricula[
ret[brain_name] = self.brains_to_curriculums[
brain_name
].increment_lesson(measure_val)
ret[brain_name] = self.brains_to_curricula[brain_name].increment_lesson(
measure_val
)
def set_all_curriculums_to_lesson_num(self, lesson_num):
"""Sets all the curriculums in this meta curriculum to a specified
def set_all_curricula_to_lesson_num(self, lesson_num):
"""Sets all the curricula in this meta curriculum to a specified
lesson_num (int): The lesson number which all the curriculums will
lesson_num (int): The lesson number which all the curricula will
for _, curriculum in self.brains_to_curriculums.items():
for _, curriculum in self.brains_to_curricula.items():
"""Get the combined configuration of all curriculums in this
"""Get the combined configuration of all curricula in this
Returns:
A dict from parameter to value.
:return: A dict from parameter to value.
for _, curriculum in self.brains_to_curriculums.items():
for _, curriculum in self.brains_to_curricula.items():
@staticmethod
def from_directory(folder_path: str) -> "MetaCurriculum":
"""
Creates a MetaCurriculum given a folder full of curriculum config files.
:param folder_path: The path to the folder which holds the curriculum configs
for this environment. The folder should contain JSON files whose names
are the brains that the curricula belong to.
"""
try:
curricula = {}
for curriculum_filename in os.listdir(folder_path):
# This process requires JSON files
brain_name, extension = os.path.splitext(curriculum_filename)
if extension.lower() != ".json":
continue
curriculum_filepath = os.path.join(folder_path, curriculum_filename)
curriculum_config = Curriculum.load_curriculum_file(curriculum_filepath)
curricula[brain_name] = Curriculum(brain_name, curriculum_config)
return MetaCurriculum(curricula)
except NotADirectoryError:
raise MetaCurriculumError(
f"{folder_path} is not a directory. Refer to the ML-Agents "
"curriculum learning docs."
)
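
A sketch of the two ways to build a MetaCurriculum after this refactor (the folder path and brain names below are placeholders):

from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.meta_curriculum import MetaCurriculum

# 1) From a directory of per-brain JSON curriculum configs:
meta = MetaCurriculum.from_directory("config/curricula/wall_jump/")

# 2) From Curriculum objects built out of already-loaded config dicts:
config = Curriculum.load_curriculum_file("config/curricula/wall_jump/BigWallJump.json")
meta = MetaCurriculum({"BigWallJump": Curriculum("BigWallJump", config)})

print(meta.brains_to_curricula.keys())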

9
ml-agents/mlagents/trainers/ppo/trainer.py


"model_path",
"reward_signals",
]
self.check_param_keys()
self._check_param_keys()
def process_trajectory(self, trajectory: Trajectory) -> None:
def _process_trajectory(self, trajectory: Trajectory) -> None:
super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
# Add to episode_steps

agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self):
def _is_ready_update(self):
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run

def update_policy(self):
def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

14
ml-agents/mlagents/trainers/rl_trainer.py


RewardSignalResults = Dict[str, RewardSignalResult]
class RLTrainer(Trainer):
class RLTrainer(Trainer): # pylint: disable=abstract-method
Contains methods for adding BrainInfos to the Buffer.
"""
def __init__(self, *args, **kwargs):

def clear_update_buffer(self) -> None:
"""
Clear the buffers that have been built up during inference. If
we're not training, this should be called instead of update_policy.
Clear the buffers that have been built up during inference.
def advance(self) -> None:
"""
Steps the trainer, taking in trajectories and updates if ready
"""
super().advance()
if not self.is_training:
self.clear_update_buffer()

9
ml-agents/mlagents/trainers/sac/trainer.py


"vis_encode_type",
]
self.check_param_keys()
self._check_param_keys()
self.load = load
self.seed = seed
self.policy: SACPolicy = None # type: ignore

)
)
def process_trajectory(self, trajectory: Trajectory) -> None:
def _process_trajectory(self, trajectory: Trajectory) -> None:
super()._process_trajectory(trajectory)
last_step = trajectory.steps[-1]
agent_id = trajectory.agent_id # All the agents should have the same ID

agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self) -> bool:
def _is_ready_update(self) -> bool:
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run

)
@timed
def update_policy(self) -> None:
def _update_policy(self) -> None:
"""
If train_interval is met, update the SAC policy given the current reward signals.
If reward_signal_train_interval is met, update the reward signals from the buffer.

13
ml-agents/mlagents/trainers/stats.py


def add_stat(self, key: str, value: float) -> None:
"""
Add a float value stat to the StatsReporter.
:param category: The highest categorization of the statistic, e.g. behavior name.
def set_stat(self, key: str, value: float) -> None:
"""
Sets a stat value to a float. This is for values that we don't want to average, and just
want the latest.
:param key: The type of statistic, e.g. Environment/Reward.
:param value: the value of the statistic.
"""
StatsReporter.stats_dict[self.category][key] = [value]
:param category: The category which to write out the stats.
:param step: Training step which to write these stats as.
"""
values: Dict[str, StatsSummary] = {}

def write_text(self, text: str, step: int) -> None:
"""
Write out some text.
:param category: The highest categorization of the statistic, e.g. behavior name.
:param text: The text to write out.
:param step: Training step which to write these stats as.
"""

def get_stats_summaries(self, key: str) -> StatsSummary:
"""
Get the mean, std, and count of a particular statistic, since last write.
:param category: The highest categorization of the statistic, e.g. behavior name.
:param key: The type of statistic, e.g. Environment/Reward.
:returns: A StatsSummary NamedTuple containing (mean, std, count).
"""

2
ml-agents/mlagents/trainers/tests/mock_brain.py


import unittest.mock as mock
from unittest import mock
import numpy as np
from mlagents.trainers.brain import CameraResolution, BrainParameters

41
ml-agents/mlagents/trainers/tests/test_agent_processor.py


import unittest.mock as mock
from unittest import mock
from mlagents.trainers.agent_processor import AgentProcessor
from mlagents.trainers.agent_processor import (
AgentProcessor,
AgentManager,
AgentManagerQueue,
)
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.stats import StatsReporter

@pytest.mark.parametrize("num_vis_obs", [0, 1, 2], ids=["vec", "1 viz", "2 viz"])
def test_agentprocessor(num_vis_obs):
policy = create_mock_policy()
trainer = mock.Mock()
tqueue = mock.Mock()
trainer,
policy,
name_behavior_id,
max_trajectory_length=5,

num_vector_acts=2,
num_vis_observations=num_vis_obs,
)
processor.publish_trajectory_queue(tqueue)
assert len(trainer.process_trajectory.call_args_list) == 2
assert len(tqueue.put.call_args_list) == 2
trajectory = trainer.process_trajectory.call_args_list[0][0][0]
trajectory = tqueue.put.call_args_list[0][0][0]
def test_agent_manager():
policy = create_mock_policy()
name_behavior_id = "test_brain_name"
manager = AgentManager(
policy,
name_behavior_id,
max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
assert len(manager.trajectory_queues) == 1
assert isinstance(manager.trajectory_queues[0], AgentManagerQueue)
def test_agent_manager_queue():
queue = AgentManagerQueue(behavior_id="testbehavior")
trajectory = mock.Mock(spec=Trajectory)
assert queue.empty()
queue.put(trajectory)
assert not queue.empty()
queue_traj = queue.get_nowait()
assert isinstance(queue_traj, Trajectory)
assert queue.empty()

2
ml-agents/mlagents/trainers/tests/test_bcmodule.py


import unittest.mock as mock
from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb

32
ml-agents/mlagents/trainers/tests/test_curriculum.py


}
"""
dummy_curriculum_config = json.loads(dummy_curriculum_json_str)
bad_curriculum_json_str = """
{

"""
@pytest.fixture
def location():
return "TestBrain.json"
dummy_curriculum_config_path = "TestBrain.json"
@pytest.fixture

@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_init_curriculum_happy_path(mock_file, location, default_reset_parameters):
curriculum = Curriculum(location)
def test_init_curriculum_happy_path():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum._brain_name == "TestBrain"
assert curriculum.brain_name == "TestBrain"
def test_init_curriculum_bad_curriculum_raises_error(
mock_file, location, default_reset_parameters
):
def test_load_bad_curriculum_file_raises_error(mock_file):
Curriculum(location)
Curriculum(
"TestBrain", Curriculum.load_curriculum_file(dummy_curriculum_config_path)
)
@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_increment_lesson(mock_file, location, default_reset_parameters):
curriculum = Curriculum(location)
def test_increment_lesson():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum.lesson_num == 0
curriculum.lesson_num = 1

assert curriculum.lesson_num == 3
@patch("builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str)
def test_get_config(mock_file):
curriculum = Curriculum("TestBrain.json")
def test_get_parameters():
curriculum = Curriculum("TestBrain", dummy_curriculum_config)
assert curriculum.get_config() == {"param1": 0.7, "param2": 100, "param3": 0.2}
curriculum.lesson_num = 2

# Test json loading and error handling. These examples don't need to be valid config files.
def test_curriculum_load_good():
expected = {"x": 1}
value = json.dumps(expected)
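For readers following the constructor change being tested here: Curriculum now takes a brain name plus an already-loaded config dict, with Curriculum.load_curriculum_file available for reading the JSON from disk. A minimal sketch, assuming a config shaped like the standard ML-Agents curriculum format (the exact keys below are an assumption; only the lesson-0 parameter values match test_get_parameters above):

# Sketch of the dict-based Curriculum API (config shape is an assumption; values illustrative).
from mlagents.trainers.curriculum import Curriculum

config = {
    "measure": "reward",
    "thresholds": [10, 20, 50],
    "min_lesson_length": 100,
    "signal_smoothing": True,
    "parameters": {
        "param1": [0.7, 0.7, 0.8, 0.9],
        "param2": [100, 50, 20, 15],
        "param3": [0.2, 0.5, 0.5, 0.5],
    },
}
curriculum = Curriculum("TestBrain", config)       # previously: Curriculum("TestBrain.json")
assert curriculum.brain_name == "TestBrain"
print(curriculum.get_config())                     # parameters for the current lesson
curriculum.lesson_num = 1                          # lesson index can be set directly
# Loading from disk instead: Curriculum("TestBrain", Curriculum.load_curriculum_file("TestBrain.json"))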

11
ml-agents/mlagents/trainers/tests/test_learn.py


mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, 0, basic_options(), MagicMock())
learn.run_training(0, basic_options())
"./models/ppo-0",
"./models/ppo",
"ppo-0",
"ppo",
50000,
None,
False,

mock_init = MagicMock(return_value=None)
with patch.object(TrainerController, "__init__", mock_init):
with patch.object(TrainerController, "start_learning", MagicMock()):
learn.run_training(0, 0, options_with_docker_target, MagicMock())
learn.run_training(0, options_with_docker_target)
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo-0"
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo"
assert mock_init.call_args[0][2] == "/dockertarget/summaries"

"--lesson=3",
"--load",
"--run-id=myawesomerun",
"--num-runs=3",
"--save-freq=123456",
"--seed=7890",
"--train",

56
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


_check_environment_trains,
BRAIN_NAME,
)
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str
class MetaCurriculumTest(MetaCurriculum):
"""This class allows us to test MetaCurriculum objects without calling
MetaCurriculum's __init__ function.
"""
def __init__(self, brains_to_curriculums):
self._brains_to_curriculums = brains_to_curriculums
from mlagents.trainers.tests.test_curriculum import (
dummy_curriculum_json_str,
dummy_curriculum_config,
)
@pytest.fixture

@patch("mlagents.trainers.curriculum.Curriculum.get_config", return_value={})
@patch("mlagents.trainers.curriculum.Curriculum.__init__", return_value=None)
@patch(
"mlagents.trainers.curriculum.Curriculum.load_curriculum_file",
return_value=dummy_curriculum_config,
)
meta_curriculum = MetaCurriculum("test/")
meta_curriculum = MetaCurriculum.from_directory("test/")
assert len(meta_curriculum.brains_to_curriculums) == 2
assert len(meta_curriculum.brains_to_curricula) == 2
assert "Brain1" in meta_curriculum.brains_to_curriculums
assert "Brain2.test" in meta_curriculum.brains_to_curriculums
assert "Brain1" in meta_curriculum.brains_to_curricula
assert "Brain2.test" in meta_curriculum.brains_to_curricula
calls = [call("test/Brain1.json"), call("test/Brain2.test.json")]

@patch("os.listdir", side_effect=NotADirectoryError())
def test_init_meta_curriculum_bad_curriculum_folder_raises_error(listdir):
with pytest.raises(MetaCurriculumError):
MetaCurriculum("test/")
MetaCurriculum.from_directory("test/")
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.lesson_nums = {"Brain1": 1, "Brain2": 3}

@patch("mlagents.trainers.curriculum.Curriculum")
@patch("mlagents.trainers.curriculum.Curriculum")
def test_increment_lessons(curriculum_a, curriculum_b, measure_vals):
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.increment_lessons(measure_vals)

):
curriculum_a.min_lesson_length = 5
curriculum_b.min_lesson_length = 10
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.increment_lessons(measure_vals, reward_buff_sizes=reward_buff_sizes)

@patch("mlagents.trainers.curriculum.Curriculum")
@patch("mlagents.trainers.curriculum.Curriculum")
def test_set_all_curriculums_to_lesson_num(curriculum_a, curriculum_b):
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
meta_curriculum.set_all_curriculums_to_lesson_num(2)
meta_curriculum.set_all_curricula_to_lesson_num(2)
assert curriculum_a.lesson_num == 2
assert curriculum_b.lesson_num == 2

):
curriculum_a.get_config.return_value = default_reset_parameters
curriculum_b.get_config.return_value = default_reset_parameters
meta_curriculum = MetaCurriculumTest(
{"Brain1": curriculum_a, "Brain2": curriculum_b}
)
meta_curriculum = MetaCurriculum({"Brain1": curriculum_a, "Brain2": curriculum_b})
assert meta_curriculum.get_config() == default_reset_parameters

with patch(
"builtins.open", new_callable=mock_open, read_data=dummy_curriculum_json_str
):
curriculum = Curriculum("TestBrain.json")
mc = MetaCurriculumTest({curriculum_brain_name: curriculum})
curriculum_config = Curriculum.load_curriculum_file("TestBrain.json")
curriculum = Curriculum("TestBrain", curriculum_config)
mc = MetaCurriculum({curriculum_brain_name: curriculum})
_check_environment_trains(env, META_CURRICULUM_CONFIG, mc, -100.0)
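To summarize the renames this test file tracks, a rough sketch of the updated MetaCurriculum entry points; the directory path is illustrative, and the one-JSON-file-per-brain layout is inferred from the mocked load_curriculum_file calls above:

# Sketch of the renamed MetaCurriculum API (paths and measure values are illustrative).
from mlagents.trainers.meta_curriculum import MetaCurriculum

meta_curriculum = MetaCurriculum.from_directory("config/curricula/test/")  # was: MetaCurriculum("test/")
print(meta_curriculum.brains_to_curricula.keys())       # was: brains_to_curriculums
meta_curriculum.set_all_curricula_to_lesson_num(2)      # was: set_all_curriculums_to_lesson_num
meta_curriculum.increment_lessons({"Brain1": 0.2})      # measure values keyed by brain name
# A MetaCurriculum can also be built directly from a dict of Curriculum objects, as the tests above do.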

2
ml-agents/mlagents/trainers/tests/test_multigpu.py


import unittest.mock as mock
from unittest import mock
import pytest
from mlagents.tf_utils import tf

38
ml-agents/mlagents/trainers/tests/test_ppo.py


import unittest.mock as mock
from unittest import mock
import pytest
import numpy as np

from mlagents.trainers.models import EncoderType, LearningModel
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.mock_communicator import MockCommunicator
from mlagents.trainers.tests import mock_brain as mb

trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
)
policy_mock = mock.Mock()
policy_mock = mock.Mock(spec=PPOPolicy)
) # 10 hacked becausee this function is no longer called through trainer
) # 10 hacked because this function is no longer called through trainer
trainer.policy = policy_mock
trainer.add_policy("testbehavior", policy_mock)
trainer.increment_step(5)
print(trainer.policy.increment_step(5))
trainer._increment_step(5, "testbehavior")
policy_mock.increment_step.assert_called_with(5)
assert trainer.step == step_count

buffer["curiosity_value_estimates"] = buffer["rewards"]
trainer.update_buffer = buffer
trainer.update_policy()
trainer._update_policy()
trainer.update_policy()
trainer._update_policy()
trainer.update_policy()
trainer._update_policy()
def test_process_trajectory(dummy_config):

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(
brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
)
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,

action_space=2,
)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that trainer put trajectory in update buffer
assert trainer.update_buffer.num_experiences == 15

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that the stats are reset as episode is finished
for reward in trainer.collected_rewards.values():

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.process_trajectory(trajectory)
trainer._process_trajectory(trajectory)
# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trainer._process_trajectory(trajectory)
# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(
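The new pattern these PPO tests adopt: instead of calling process_trajectory on the trainer directly, trajectories are pushed onto an AgentManagerQueue that the trainer has subscribed to, and advance() does the rest. A sketch of that flow, assuming trainer, brain_params and trajectory are set up as in the test above:

# Sketch of the queue-driven trainer flow (trainer/brain_params/trajectory assumed from the test setup).
from mlagents.trainers.agent_processor import AgentManagerQueue

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)

trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

trajectory_queue.put(trajectory)    # replaces trainer.process_trajectory(trajectory)
trainer.advance()                   # drains the queue and updates the policy when ready
assert trainer.update_buffer.num_experiences == len(trajectory.steps)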

2
ml-agents/mlagents/trainers/tests/test_reward_signals.py


import unittest.mock as mock
from unittest import mock
import pytest
import yaml
import os

39
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


import unittest.mock as mock
from unittest import mock
import numpy as np
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer

"""
summary_path: "test/"
summary_freq: 1000
reward_signals:
extrinsic:
strength: 1.0

return mock_brain
def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = RLTrainer(mock_brainparams.brain_name, dummy_config(), True, 0)
return trainer
# Add concrete implementations of abstract methods
class FakeTrainer(RLTrainer):
def get_policy(self, name_behavior_id):
return mock.Mock()
def _is_ready_update(self):
return True
def _update_policy(self):
pass
def add_policy(self):
pass
def create_policy(self):
return mock.Mock()
def create_mock_all_brain_info(brain_info):
return {"MockBrain": brain_info}
def _process_trajectory(self, trajectory):
super()._process_trajectory(trajectory)
def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
return mock_policy
def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = FakeTrainer(mock_brainparams, dummy_config(), True, 0)
return trainer
def test_rl_trainer():

12
ml-agents/mlagents/trainers/tests/test_sac.py


import unittest.mock as mock
from unittest import mock
import pytest
import yaml

from mlagents.trainers.sac.models import SACModel
from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that trainer put trajectory in update buffer
assert trainer.update_buffer.num_experiences == 15

num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)
trajectory_queue.put(trajectory)
trainer.advance()
# Check that the stats are reset as episode is finished
for reward in trainer.collected_rewards.values():

2
ml-agents/mlagents/trainers/tests/test_stats.py


import unittest.mock as mock
from unittest import mock
import os
import pytest
import tempfile

2
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


import unittest.mock as mock
from unittest import mock
from unittest.mock import Mock, MagicMock
import unittest
from queue import Empty as EmptyQueue

28
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


import pytest
from mlagents.tf_utils import tf
from mlagents.trainers.trainer_controller import TrainerController, AgentManager
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.subprocess_env_manager import EnvironmentStep
from mlagents.trainers.sampler_class import SamplerManager

trainer_mock = MagicMock()
trainer_mock.get_step = 0
trainer_mock.get_max_steps = 5
trainer_mock.should_still_train = True
trainer_mock.parameters = {"some": "parameter"}
trainer_mock.write_tensorboard_text = MagicMock()

def take_step_sideeffect(env):
tc.trainers["testbrain"].get_step += 1
if (
not tc.trainers["testbrain"].get_step
<= tc.trainers["testbrain"].get_max_steps
):
tc.trainers["testbrain"].should_still_train = False
if tc.trainers["testbrain"].get_step > 10:
raise KeyboardInterrupt
return 1

trainer_mock.parameters = {"some": "parameter"}
trainer_mock.write_tensorboard_text = MagicMock()
processor_mock = MagicMock()
tc.managers = {"testbrain": AgentManager(processor=processor_mock)}
tc.managers = {"testbrain": MagicMock()}
return tc, trainer_mock

brain_info_dict = {brain_name: Mock()}
old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
trainer_mock.is_ready_update = MagicMock(return_value=True)
trainer_mock._is_ready_update = MagicMock(return_value=True)
env_mock = MagicMock()
env_mock.step.return_value = [new_step_info]

env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
processor_mock = tc.managers[brain_name].processor
processor_mock.add_experiences.assert_called_once_with(
manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
trainer_mock.update_policy.assert_called_once()
trainer_mock.increment_step.assert_called_once()
trainer_mock.advance.assert_called_once()
def test_take_step_if_not_training(trainer_controller_with_take_step_mocks):

old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
trainer_mock.is_ready_update = MagicMock(return_value=False)
trainer_mock._is_ready_update = MagicMock(return_value=False)
env_mock = MagicMock()
env_mock.step.return_value = [new_step_info]

tc.advance(env_mock)
env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
processor_mock = tc.managers[brain_name].processor
processor_mock.add_experiences.assert_called_once_with(
manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
new_step_info.previous_all_brain_info[brain_name],
new_step_info.current_all_brain_info[brain_name],
new_step_info.brain_name_to_action_info[brain_name].outputs,

2
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import io
from unittest.mock import patch
import mlagents.trainers.trainer_util as trainer_util
from mlagents.trainers import trainer_util
from mlagents.trainers.trainer_util import load_config, _load_config
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError

198
ml-agents/mlagents/trainers/trainer.py


# # Unity ML-Agents Toolkit
import logging
from typing import Dict, List, Deque, Any
import time
import abc
from mlagents.tf_utils import tf

from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.policy import Policy
from mlagents_envs.timers import hierarchical_timer
LOGGER = logging.getLogger("mlagents.trainers")

pass
class Trainer(object):
class Trainer(abc.ABC):
"""This class is the base class for the mlagents_envs.trainers"""
def __init__(

self.cumulative_returns_since_policy_update: List[float] = []
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[Policy]] = []
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.training_start_time = time.time()
self.summary_freq = self.trainer_parameters["summary_freq"]
self.next_update_step = self.summary_freq
def check_param_keys(self):
def _check_param_keys(self):
for k in self.param_keys:
if k not in self.trainer_parameters:
raise UnityTrainerException(

LOGGER.info("Could not write text summary for Tensorboard.")
pass
def dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
def _dict_to_str(self, param_dict: Dict[str, Any], num_tabs: int) -> str:
"""
Takes a parameter dictionary and converts it to a human-readable string.
Recurses if there are multiple levels of dict. Used to print out hyperparameters.

"\t"
+ " " * num_tabs
+ "{0}:\t{1}".format(
x, self.dict_to_str(param_dict[x], num_tabs + 1)
x, self._dict_to_str(param_dict[x], num_tabs + 1)
)
for x in param_dict
]

return """Hyperparameters for the {0} of brain {1}: \n{2}""".format(
self.__class__.__name__,
self.brain_name,
self.dict_to_str(self.trainer_parameters, 0),
self._dict_to_str(self.trainer_parameters, 0),
)
@property

return self.trainer_parameters
@property
def get_max_steps(self) -> float:
def get_max_steps(self) -> int:
return float(self.trainer_parameters["max_steps"])
return int(float(self.trainer_parameters["max_steps"]))
@property
def get_step(self) -> int:

return self.step
@property
def should_still_train(self) -> bool:
"""
Returns whether or not the trainer should train. A Trainer could
stop training if it wasn't training to begin with, or if max_steps
is reached.
"""
return self.is_training and self.get_step <= self.get_max_steps
@property
def reward_buffer(self) -> Deque[float]:
"""
Returns the reward buffer. The reward buffer contains the cumulative

"""
return self._reward_buffer
def increment_step(self, n_steps: int) -> None:
def _increment_step(self, n_steps: int, name_behavior_id: str) -> None:
self.next_update_step = self.step + (
self.summary_freq - self.step % self.summary_freq
)
p = self.get_policy(name_behavior_id)
if p:
p.increment_step(n_steps)
def save_model(self, name_behavior_id: str) -> None:
"""

"""
self.get_policy(name_behavior_id).export_model()
def write_summary(self, global_step: int, delta_train_start: float) -> None:
def _write_summary(self, step: int) -> None:
:param delta_train_start: Time elapsed since training started.
:param global_step: The number of steps the simulation has been going for
if (
global_step % self.trainer_parameters["summary_freq"] == 0
and global_step != 0
):
is_training = (
"Training."
if self.is_training and self.get_step <= self.get_max_steps
else "Not Training."
is_training = "Training." if self.should_still_train else "Not Training."
stats_summary = self.stats_reporter.get_stats_summaries(
"Environment/Cumulative Reward"
)
if stats_summary.num > 0:
LOGGER.info(
" {}: {}: Step: {}. "
"Time Elapsed: {:0.3f} s "
"Mean "
"Reward: {:0.3f}"
". Std of Reward: {:0.3f}. {}".format(
self.run_id,
self.brain_name,
step,
time.time() - self.training_start_time,
stats_summary.mean,
stats_summary.std,
is_training,
)
step = min(self.get_step, self.get_max_steps)
stats_summary = self.stats_reporter.get_stats_summaries(
"Environment/Cumulative Reward"
)
if stats_summary.num > 0:
LOGGER.info(
" {}: {}: Step: {}. "
"Time Elapsed: {:0.3f} s "
"Mean "
"Reward: {:0.3f}"
". Std of Reward: {:0.3f}. {}".format(
self.run_id,
self.brain_name,
step,
delta_train_start,
stats_summary.mean,
stats_summary.std,
is_training,
)
)
set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
else:
LOGGER.info(
" {}: {}: Step: {}. No episode was completed since last summary. {}".format(
self.run_id, self.brain_name, step, is_training
)
set_gauge(f"{self.brain_name}.mean_reward", stats_summary.mean)
else:
LOGGER.info(
" {}: {}: Step: {}. No episode was completed since last summary. {}".format(
self.run_id, self.brain_name, step, is_training
self.stats_reporter.write_stats(int(step))
)
self.stats_reporter.write_stats(int(step))
def process_trajectory(self, trajectory: Trajectory) -> None:
@abc.abstractmethod
def _process_trajectory(self, trajectory: Trajectory) -> None:
Processing involves calculating value and advantage targets for the model update step.
raise UnityTrainerException(
"The process_experiences method was not implemented."
)
self._maybe_write_summary(self.get_step + len(trajectory.steps))
self._increment_step(len(trajectory.steps), trajectory.behavior_id)
def _maybe_write_summary(self, step_after_process: int) -> None:
"""
If processing the trajectory will make the step exceed the next summary write,
write the summary. This logic ensures summaries are written on the update step and not in between.
:param step_after_process: the step count after processing the next trajectory.
"""
if step_after_process >= self.next_update_step and self.get_step != 0:
self._write_summary(self.next_update_step)
@abc.abstractmethod
raise UnityTrainerException("The end_episode method was not implemented.")
pass
@abc.abstractmethod
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
"""
Creates policy
"""
pass
@abc.abstractmethod
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
"""
Adds policy to trainer
"""
pass
@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer
"""
pass
def is_ready_update(self):
@abc.abstractmethod
def _is_ready_update(self):
raise UnityTrainerException("The is_ready_update method was not implemented.")
return False
def update_policy(self):
@abc.abstractmethod
def _update_policy(self):
raise UnityTrainerException("The update_model method was not implemented.")
pass
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
def advance(self) -> None:
Creates policy
Steps the trainer, taking in trajectories and updates if ready.
raise UnityTrainerException("The create_policy method was not implemented.")
with hierarchical_timer("process_trajectory"):
for traj_queue in self.trajectory_queues:
try:
t = traj_queue.get_nowait()
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
if self.should_still_train:
if self._is_ready_update():
with hierarchical_timer("_update_policy"):
self._update_policy()
for q in self.policy_queues:
# Get policies that correspond to the policy queue in question
q.put(self.get_policy(q.behavior_id))
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
Adds policy to trainer
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
:param queue: Policy queue to publish to.
raise UnityTrainerException("The add_policy method was not implemented")
self.policy_queues.append(policy_queue)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def subscribe_trajectory_queue(
self, trajectory_queue: AgentManagerQueue[Trajectory]
) -> None:
Gets policy from trainer
Adds a trajectory queue to the list of queues from which the trainer ingests Trajectories.
:param trajectory_queue: Trajectory queue to read from.
raise UnityTrainerException("The get_policy method was not implemented.")
def advance(self) -> None:
pass
self.trajectory_queues.append(trajectory_queue)
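One detail worth spelling out in the new _maybe_write_summary/_increment_step pair above: summaries are pinned to the next multiple of summary_freq rather than to whatever step the incoming trajectory happens to land on. A small worked example with illustrative numbers (plain arithmetic, no ML-Agents calls):

# Illustrative values; summary_freq comes from trainer_parameters["summary_freq"].
summary_freq = 1000
step = 950                  # trainer's current get_step
next_update_step = 1000     # next multiple of summary_freq to report at
trajectory_len = 80

step_after_process = step + trajectory_len              # 1030
if step_after_process >= next_update_step and step != 0:
    pass                                                # _write_summary(1000): reported at 1000, not 1030

step += trajectory_len                                   # step is now 1030
next_update_step = step + (summary_freq - step % summary_freq)   # 1030 + 970 = 2000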

161
ml-agents/mlagents/trainers/trainer_controller.py


import sys
import json
import logging
from typing import Dict, List, Optional, Set, NamedTuple
from typing import Dict, List, Optional, Set
from time import time
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.exception import (

from mlagents.trainers.trainer import Trainer
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.agent_processor import AgentProcessor
class AgentManager(NamedTuple):
processor: AgentProcessor
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
class TrainerController(object):

self.save_freq = save_freq
self.train_model = train
self.meta_curriculum = meta_curriculum
self.training_start_time = time()
self.sampler_manager = sampler_manager
self.resampling_interval = resampling_interval
np.random.seed(training_seed)

for (
brain_name,
curriculum,
) in self.meta_curriculum.brains_to_curriculums.items():
) in self.meta_curriculum.brains_to_curricula.items():
measure_val = (
self.trainers[brain_name].get_step
/ self.trainers[brain_name].get_max_steps
measure_val = self.trainers[brain_name].get_step / float(
self.trainers[brain_name].get_max_steps
)
brain_names_to_measure_vals[brain_name] = measure_val
elif curriculum.measure == "reward":

def _not_done_training(self) -> bool:
return (
any(t.get_step <= t.get_max_steps for k, t in self.trainers.items())
any(t.should_still_train for t in self.trainers.values())
def write_to_tensorboard(self, global_step: int) -> None:
for brain_name, trainer in self.trainers.items():
# Write training statistics to Tensorboard.
delta_train_start = time() - self.training_start_time
if (
self.meta_curriculum
and brain_name in self.meta_curriculum.brains_to_curriculums
):
lesson_num = self.meta_curriculum.brains_to_curriculums[
brain_name
].lesson_num
trainer.stats_reporter.add_stat("Environment/Lesson", lesson_num)
trainer.write_summary(global_step, delta_train_start)
def _create_trainer_and_manager(
self, env_manager: EnvManager, name_behavior_id: str
) -> None:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(name_behavior_id, policy)
env_manager.set_policy(name_behavior_id, policy)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
agent_manager = AgentManager(
policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
)
trainer.publish_policy_queue(agent_manager.policy_queue)
trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)
self.managers[name_behavior_id] = agent_manager
def start_learning(self, env_manager: EnvManager) -> None:
self._create_model_path(self.model_path)

external_brain_behavior_ids = set(env_manager.external_brains.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
for name_behavior_id in new_behavior_ids:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text(
"Hyperparameters", trainer.parameters
)
policy = trainer.create_policy(
env_manager.external_brains[name_behavior_id]
)
trainer.add_policy(name_behavior_id, policy)
env_manager.set_policy(name_behavior_id, policy)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
agent_manager = AgentManager(
processor=AgentProcessor(
trainer,
policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
)
)
self.managers[name_behavior_id] = agent_manager
self._create_trainer_and_manager(env_manager, name_behavior_id)
n_steps = self.advance(env_manager)
for _ in range(n_steps):
global_step += 1

self._save_model()
self.write_to_tensorboard(global_step)
# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model()

if meta_curriculum_reset or generalization_reset:
self.end_trainer_episodes(env, lessons_incremented)
@timed
def advance(self, env: EnvManager) -> int:
def _get_and_process_experiences(self, env: EnvManager) -> int:
# Get new policies if found
for brain_name in self.trainers.keys():
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
try:
_policy = self.managers[
name_behavior_id
].policy_queue.get_nowait()
env.set_policy(name_behavior_id, _policy)
except AgentManagerQueue.Empty:
pass
# Step the environment
# Add to AgentProcessor
for step_info in new_step_infos:
for name_behavior_id in step_info.name_behavior_ids:
if name_behavior_id not in self.managers:

)
)
continue
_processor = self.managers[name_behavior_id].processor
_processor.add_experiences(
_processor = self.managers[name_behavior_id].add_experiences(
return len(new_step_infos)
for brain_name, trainer in self.trainers.items():
if self.train_model and trainer.get_step <= trainer.get_max_steps:
n_steps = len(new_step_infos)
trainer.increment_step(n_steps)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
trainer.get_policy(name_behavior_id).increment_step(n_steps)
if trainer.is_ready_update():
# Perform gradient descent with experience buffer
with hierarchical_timer("update_policy"):
trainer.update_policy()
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
env.set_policy(
name_behavior_id, trainer.get_policy(name_behavior_id)
)
else:
# Avoid memory leak during inference
# Eventually this whole block will take place in advance()
# But currently this only calls clear_update_buffer() in RLTrainer
# and nothing in the base class
@timed
def advance(self, env: EnvManager) -> int:
# Get steps
num_steps = self._get_and_process_experiences(env)
# Report current lesson
if self.meta_curriculum:
for brain_name, curr in self.meta_curriculum.brains_to_curricula.items():
if brain_name in self.trainers:
self.trainers[brain_name].stats_reporter.set_stat(
"Environment/Lesson", curr.lesson_num
)
# Advance trainers. This can be done in a separate loop in the future.
with hierarchical_timer("trainer_advance"):
for trainer in self.trainers.values():
return len(new_step_infos)
return num_steps
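To make the producer/consumer wiring above concrete: _create_trainer_and_manager builds one AgentManager per behavior and cross-connects its queues with the trainer, roughly as in this sketch (argument order follows the code above; trainer, policy and name_behavior_id are assumed to already exist):

# Sketch of the queue wiring between TrainerController, AgentManager and Trainer (illustrative).
import sys
from mlagents.trainers.agent_processor import AgentManager

agent_manager = AgentManager(
    policy,                                              # from trainer.create_policy(...)
    name_behavior_id,
    trainer.stats_reporter,
    trainer.parameters.get("time_horizon", sys.maxsize),
)
trainer.publish_policy_queue(agent_manager.policy_queue)            # trainer -> manager: updated policies
trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue)  # manager -> trainer: finished trajectories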

6
ml-agents/mlagents/trainers/trainer_util.py


min_lesson_length = 1
if meta_curriculum:
if brain_name in meta_curriculum.brains_to_curriculums:
min_lesson_length = meta_curriculum.brains_to_curriculums[
if brain_name in meta_curriculum.brains_to_curricula:
min_lesson_length = meta_curriculum.brains_to_curricula[
f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
f"Brains with curricula: {meta_curriculum.brains_to_curricula.keys()}. "
)
trainer: Trainer = None # type: ignore # will be set to one of these, or raise

1
test_requirements.txt


# Test-only dependencies should go here, not in setup.py
pytest>4.0.0,<6.0.0
pytest-cov==2.6.1
pytest-xdist

105
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs


using NUnit.Framework;
using UnityEngine;
using MLAgents.Sensor;
namespace MLAgents.Tests
{
public class Float2DSensor : ISensor
{
public int Width { get; }
public int Height { get; }
string m_Name;
int[] m_Shape;
public float[,] floatData;
public Float2DSensor(int width, int height, string name)
{
Width = width;
Height = height;
m_Name = name;
m_Shape = new[] { height, width, 1 };
floatData = new float[Height, Width];
}
public Float2DSensor(float[,] floatData, string name)
{
this.floatData = floatData;
Height = floatData.GetLength(0);
Width = floatData.GetLength(1);
m_Name = name;
m_Shape = new[] { Height, Width, 1 };
}
public string GetName()
{
return m_Name;
}
public int[] GetObservationShape()
{
return m_Shape;
}
public byte[] GetCompressedObservation()
{
return null;
}
public int Write(WriteAdapter adapter)
{
using (TimerStack.Instance.Scoped("Float2DSensor.Write"))
{
for (var h = 0; h < Height; h++)
{
for (var w = 0; w < Width; w++)
{
adapter[h, w, 0] = floatData[h, w];
}
}
var numWritten = Height * Width;
return numWritten;
}
}
public void Update() { }
public SensorCompressionType GetCompressionType()
{
return SensorCompressionType.None;
}
}
public class FloatVisualSensorTests
{
[Test]
public void TestFloat2DSensorWrite()
{
var sensor = new Float2DSensor(3, 4, "floatsensor");
for (var h = 0; h < 4; h++)
{
for (var w = 0; w < 3; w++)
{
sensor.floatData[h, w] = 3 * h + w;
}
}
var output = new float[12];
var writer = new WriteAdapter();
writer.SetTarget(output, sensor.GetObservationShape(), 0);
sensor.Write(writer);
for (var i = 0; i < 9; i++)
{
Assert.AreEqual(i, output[i]);
}
}
[Test]
public void TestFloat2DSensorExternalData()
{
var data = new float[4, 3];
var sensor = new Float2DSensor(data, "floatsensor");
Assert.AreEqual(sensor.Height, 4);
Assert.AreEqual(sensor.Width, 3);
}
}
}

3
UnitySDK/Assets/ML-Agents/Editor/Tests/Sensor/FloatVisualSensorTests.cs.meta


fileFormatVersion: 2
guid: 49b7da14949a486b803e28ed32d91a09
timeCreated: 1578093005