Browse code

Merge master

/release_4_branch
Arthur Juliani, 4 years ago
Current commit
6bee0fd1
53 files changed: 1432 additions and 260 deletions
  1. Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs (25 lines changed)
  2. Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (33 lines changed)
  3. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (28 lines changed)
  4. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (44 lines changed)
  5. com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (85 lines changed)
  6. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (28 lines changed)
  7. com.unity.ml-agents.extensions/Tests/Editor/Sensors/PoseExtractorTests.cs (11 lines changed)
  8. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodyPoseExtractorTests.cs (1 line changed)
  9. com.unity.ml-agents.extensions/Tests/Editor/Unity.ML-Agents.Extensions.EditorTests.asmdef (8 lines changed)
  10. com.unity.ml-agents/CHANGELOG.md (13 lines changed)
  11. com.unity.ml-agents/Runtime/Academy.cs (69 lines changed)
  12. com.unity.ml-agents/Tests/Editor/AcademyTests.cs (32 lines changed)
  13. com.unity.ml-agents/Tests/Editor/Sensor/VectorSensorTests.cs (22 lines changed)
  14. docs/Training-ML-Agents.md (4 lines changed)
  15. gym-unity/gym_unity/__init__.py (4 lines changed)
  16. ml-agents-envs/mlagents_envs/__init__.py (4 lines changed)
  17. ml-agents/mlagents/model_serialization.py (20 lines changed)
  18. ml-agents/mlagents/trainers/__init__.py (4 lines changed)
  19. ml-agents/mlagents/trainers/ghost/trainer.py (17 lines changed)
  20. ml-agents/mlagents/trainers/learn.py (1 line changed)
  21. ml-agents/mlagents/trainers/policy/tf_policy.py (31 lines changed)
  22. ml-agents/mlagents/trainers/ppo/trainer.py (1 line changed)
  23. ml-agents/mlagents/trainers/sac/trainer.py (22 lines changed)
  24. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (12 lines changed)
  25. ml-agents/mlagents/trainers/tests/test_config_conversion.py (4 lines changed)
  26. ml-agents/mlagents/trainers/tests/test_nn_policy.py (7 lines changed)
  27. ml-agents/mlagents/trainers/tests/test_ppo.py (27 lines changed)
  28. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (43 lines changed)
  29. ml-agents/mlagents/trainers/tests/test_sac.py (31 lines changed)
  30. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (8 lines changed)
  31. ml-agents/mlagents/trainers/tests/test_training_status.py (64 lines changed)
  32. ml-agents/mlagents/trainers/trainer/rl_trainer.py (66 lines changed)
  33. ml-agents/mlagents/trainers/trainer/trainer.py (22 lines changed)
  34. ml-agents/mlagents/trainers/trainer_controller.py (15 lines changed)
  35. ml-agents/mlagents/trainers/trainer_util.py (5 lines changed)
  36. ml-agents/mlagents/trainers/training_status.py (2 lines changed)
  37. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (41 lines changed)
  38. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs.meta (11 lines changed)
  39. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (93 lines changed)
  40. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs.meta (11 lines changed)
  41. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (52 lines changed)
  42. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs.meta (11 lines changed)
  43. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodyPoseExtractorTests.cs (63 lines changed)
  44. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodyPoseExtractorTests.cs.meta (11 lines changed)
  45. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (113 lines changed)
  46. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs.meta (11 lines changed)
  47. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (113 lines changed)
  48. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs.meta (11 lines changed)
  49. com.unity.ml-agents/Runtime/SensorHelper.cs (66 lines changed)
  50. com.unity.ml-agents/Runtime/SensorHelper.cs.meta (11 lines changed)
  51. ml-agents/mlagents/trainers/policy/checkpoint_manager.py (98 lines changed)
  52. ml-agents/mlagents/trainers/tests/test_tf_policy.py (92 lines changed)
  53. ml-agents/mlagents/trainers/tests/test_policy.py (71 lines changed)

Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs (25 lines changed)


using System;
using Random = UnityEngine.Random;
[RequireComponent(typeof(JointDriveController))] // Required to set joint forces
public class CrawlerAgent : Agent

{
//Add body rotation delta relative to orientation cube
sensor.AddObservation(Quaternion.FromToRotation(body.forward, orientationCube.transform.forward));
//Add pos of target relative to orientation cube
sensor.AddObservation(orientationCube.transform.InverseTransformPoint(target.transform.position));

{
var movingTowardsDot = Vector3.Dot(orientationCube.transform.forward,
Vector3.ClampMagnitude(m_JdController.bodyPartsDict[body].rb.velocity, maximumWalkingSpeed));
if (float.IsNaN(movingTowardsDot))
{
throw new ArgumentException(
"NaN in movingTowardsDot.\n" +
$" orientationCube.transform.forward: {orientationCube.transform.forward}\n"+
$" body.velocity: {m_JdController.bodyPartsDict[body].rb.velocity}\n"+
$" maximumWalkingSpeed: {maximumWalkingSpeed}"
);
}
AddReward(0.03f * movingTowardsDot);
}

void RewardFunctionFacingTarget()
{
var facingReward = Vector3.Dot(orientationCube.transform.forward, body.forward);
if (float.IsNaN(facingReward))
{
throw new ArgumentException(
"NaN in movingTowardsDot.\n" +
$" orientationCube.transform.forward: {orientationCube.transform.forward}\n"+
$" body.forward: {body.forward}"
);
}
AddReward(0.01f * facingReward);
}
/// <summary>

Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (33 lines changed)


using System;
using MLAgentsExamples;
using UnityEngine;
using Unity.MLAgents;

using Random = UnityEngine.Random;
public class WalkerAgent : Agent
{

// a. Velocity alignment with goal direction.
var moveTowardsTargetReward = Vector3.Dot(cubeForward,
Vector3.ClampMagnitude(m_JdController.bodyPartsDict[hips].rb.velocity, maximumWalkingSpeed));
if (float.IsNaN(moveTowardsTargetReward))
{
throw new ArgumentException(
"NaN in moveTowardsTargetReward.\n" +
$" cubeForward: {cubeForward}\n"+
$" hips.velocity: {m_JdController.bodyPartsDict[hips].rb.velocity}\n"+
$" maximumWalkingSpeed: {maximumWalkingSpeed}"
);
}
if (float.IsNaN(lookAtTargetReward))
{
throw new ArgumentException(
"NaN in lookAtTargetReward.\n" +
$" cubeForward: {cubeForward}\n"+
$" head.forward: {head.forward}"
);
}
var headHeightOverFeetReward =
if (float.IsNaN(headHeightOverFeetReward))
{
throw new ArgumentException(
"NaN in headHeightOverFeetReward.\n" +
$" head.position: {head.position}\n"+
$" footL.position: {footL.position}\n"+
$" footR.position: {footR.position}"
);
}
AddReward(
+ 0.02f * moveTowardsTargetReward
+ 0.02f * lookAtTargetReward

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (28 lines changed)


namespace Unity.MLAgents.Extensions.Sensors
{
/// <summary>
/// Utility class to track a hierarchy of ArticulationBodies.
/// </summary>
public class ArticulationBodyPoseExtractor : PoseExtractor
{
ArticulationBody[] m_Bodies;

if (rootBody == null)
{
return;
}
if (!rootBody.isRoot)
{
Debug.Log("Must pass ArticulationBody.isRoot");

for (var i = 1; i < numBodies; i++)
{
var currentArticBody = m_Bodies[i];
// Component.GetComponentInParent will consider the provided object as well.
// So start looking from the parent.
var currentGameObject = currentArticBody.gameObject;
var parentGameObject = currentGameObject.transform.parent;
var parentArticBody = parentGameObject.GetComponentInParent<ArticulationBody>();
parentIndices[i] = bodyToIndex[parentArticBody];
/// <inheritdoc/>
protected override Vector3 GetLinearVelocityAt(int index)
{
return m_Bodies[index].velocity;
}
/// <inheritdoc/>
protected override Pose GetPoseAt(int index)
{
var body = m_Bodies[index];

}
}
}
#endif // UNITY_2020_1_OR_NEWER

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (44 lines changed)


namespace Unity.MLAgents.Extensions.Sensors
{
/// <summary>
/// Settings that define the observations generated for physics-based sensors.
/// </summary>
[Serializable]
public struct PhysicsSensorSettings
{

public bool UseModelSpaceTranslations;
/// <summary>
/// Whether to use model space (relative to the root body) rotations as observations.
/// </summary>
public bool UseModelSpaceRotations;

public bool UseLocalSpaceRotations;
/// <summary>
/// Whether to use model space (relative to the root body) linear velocities as observations.
/// </summary>
public bool UseModelSpaceLinearVelocity;
/// <summary>
/// Whether to use local space (relative to the parent body) linear velocities as observations.
/// </summary>
public bool UseLocalSpaceLinearVelocity;
/// <summary>
/// Creates a PhysicsSensorSettings with reasonable default values.
/// </summary>
/// <returns></returns>

/// </summary>
public bool UseModelSpace
{
get { return UseModelSpaceTranslations || UseModelSpaceRotations || UseModelSpaceLinearVelocity; }
}
/// <summary>

{
get { return UseLocalSpaceTranslations || UseLocalSpaceRotations || UseLocalSpaceLinearVelocity; }
}

obsPerTransform += UseModelSpaceRotations ? 4 : 0;
obsPerTransform += UseLocalSpaceTranslations ? 3 : 0;
obsPerTransform += UseLocalSpaceRotations ? 4 : 0;
obsPerTransform += UseModelSpaceLinearVelocity ? 3 : 0;
obsPerTransform += UseLocalSpaceLinearVelocity ? 3 : 0;
return numTransforms * obsPerTransform;
}
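A worked example of the size arithmetic above, as a hedged Python sketch (the function and flag names are illustrative, not the shipped C# API):

```python
# Illustrative sketch of the observation-size arithmetic above. Per enabled
# flag, each pose contributes: translation = 3 floats, rotation (quaternion)
# = 4 floats, linear velocity = 3 floats.
def observation_size(num_poses, model_translations=False, model_rotations=False,
                     local_translations=False, local_rotations=False,
                     model_linear_velocity=False, local_linear_velocity=False):
    per_pose = 0
    per_pose += 3 if model_translations else 0
    per_pose += 4 if model_rotations else 0
    per_pose += 3 if local_translations else 0
    per_pose += 4 if local_rotations else 0
    per_pose += 3 if model_linear_velocity else 0
    per_pose += 3 if local_linear_velocity else 0
    return num_poses * per_pose

# e.g. model-space translations + rotations + velocities over 4 poses:
assert observation_size(4, model_translations=True, model_rotations=True,
                        model_linear_velocity=True) == 40  # 4 * (3 + 4 + 3)
```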

var offset = baseOffset;
if (settings.UseModelSpace)
{
var poses = poseExtractor.ModelSpacePoses;
var vels = poseExtractor.ModelSpaceVelocities;
for (var i = 0; i < poseExtractor.NumPoses; i++)
{
    var pose = poses[i];
if(settings.UseModelSpaceTranslations)
{
writer.Add(pose.position, offset);

{
writer.Add(pose.rotation, offset);
offset += 4;
}
if (settings.UseModelSpaceLinearVelocity)
{
writer.Add(vels[i], offset);
offset += 3;
}
}
}

var poses = poseExtractor.LocalSpacePoses;
var vels = poseExtractor.LocalSpaceVelocities;
for (var i = 0; i < poseExtractor.NumPoses; i++)
{
    var pose = poses[i];
if(settings.UseLocalSpaceTranslations)
{
writer.Add(pose.position, offset);

{
writer.Add(pose.rotation, offset);
offset += 4;
}
if (settings.UseLocalSpaceLinearVelocity)
{
writer.Add(vels[i], offset);
offset += 3;
}
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (85 lines changed)


Pose[] m_ModelSpacePoses;
Pose[] m_LocalSpacePoses;
Vector3[] m_ModelSpaceLinearVelocities;
Vector3[] m_LocalSpaceLinearVelocities;
/// <summary>
/// Read access to the model space transforms.
/// </summary>

}
/// <summary>
/// Read access to the model space linear velocities.
/// </summary>
public IList<Vector3> ModelSpaceVelocities
{
get { return m_ModelSpaceLinearVelocities; }
}
/// <summary>
/// Read access to the local space linear velocities.
/// </summary>
public IList<Vector3> LocalSpaceVelocities
{
get { return m_LocalSpaceLinearVelocities; }
}
/// <summary>
/// Number of poses in the hierarchy (read-only).
/// </summary>
public int NumPoses
{

/// <summary>
/// Get the parent index of the body at the specified index.
/// </summary>
/// <param name="index"></param>
/// <returns></returns>
public int GetParentIndex(int index)
{
if (m_ParentIndices == null)
{
return -1;
}
return m_ParentIndices[index];
}
/// <summary>
/// Initialize with the mapping of parent indices.
/// The 0th element is assumed to be -1, indicating that it's the root.
/// </summary>

var numTransforms = parentIndices.Length;
m_ModelSpacePoses = new Pose[numTransforms];
m_LocalSpacePoses = new Pose[numTransforms];
m_ModelSpaceLinearVelocities = new Vector3[numTransforms];
m_LocalSpaceLinearVelocities = new Vector3[numTransforms];
}
/// <summary>

protected abstract Pose GetPoseAt(int index);
/// <summary>
/// Return the world space linear velocity of the i'th object.
/// </summary>
/// <param name="index"></param>
/// <returns></returns>
protected abstract Vector3 GetLinearVelocityAt(int index);
/// <summary>
/// Update the internal model space transform storage based on the underlying system.
/// </summary>
public void UpdateModelSpacePoses()

return;
}
var rootWorldTransform = GetPoseAt(0);
var worldToModel = rootWorldTransform.Inverse();
var rootLinearVel = GetLinearVelocityAt(0);
var currentWorldSpacePose = GetPoseAt(i);
var currentModelSpacePose = worldToModel.Multiply(currentWorldSpacePose);
m_ModelSpacePoses[i] = currentModelSpacePose;
var currentBodyLinearVel = GetLinearVelocityAt(i);
var relativeVelocity = currentBodyLinearVel - rootLinearVel;
m_ModelSpaceLinearVelocities[i] = worldToModel.rotation * relativeVelocity;
}
}

var invParent = parentTransform.Inverse();
var currentTransform = GetPoseAt(i);
m_LocalSpacePoses[i] = invParent.Multiply(currentTransform);
var parentLinearVel = GetLinearVelocityAt(m_ParentIndices[i]);
var currentLinearVel = GetLinearVelocityAt(i);
m_LocalSpaceLinearVelocities[i] = invParent.rotation * (currentLinearVel - parentLinearVel);
}
else
{
    // The root body has no parent, so its local-space velocity is zero.
    m_LocalSpaceLinearVelocities[i] = Vector3.zero;
internal void DrawModelSpace(Vector3 offset)
{
UpdateLocalSpacePoses();
UpdateModelSpacePoses();

}
}
/// <summary>
/// Extension methods for the Pose struct, in order to improve the readability of some math.
/// </summary>
public static class PoseExtensions
{
/// <summary>

public static Pose Multiply(this Pose pose, Pose rhs)
{
return rhs.GetTransformedBy(pose);
}
/// <summary>
/// Transform the vector by the pose. Conceptually this is equivalent to treating the Pose
/// as a 4x4 matrix and multiplying the augmented vector.
/// See https://en.wikipedia.org/wiki/Affine_transformation#Augmented_matrix for more details.
/// </summary>
/// <param name="pose"></param>
/// <param name="rhs"></param>
/// <returns></returns>
public static Vector3 Multiply(this Pose pose, Vector3 rhs)
{
return pose.rotation * rhs + pose.position;
}
// TODO optimize inv(A)*B?
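An editorial sketch of the model-space velocity math introduced above: each body's velocity is taken relative to the root and rotated into the root's frame. A minimal numpy version (illustrative; rotations as 3x3 matrices rather than Unity quaternions):

```python
import numpy as np

def model_space_velocity(root_rotation, root_velocity, body_velocity):
    """Velocity of a body relative to the root, expressed in the root's frame.

    root_rotation: 3x3 world-from-root rotation matrix.
    root_velocity, body_velocity: world-space 3-vectors.
    """
    relative = np.asarray(body_velocity, dtype=float) - np.asarray(root_velocity, dtype=float)
    # The inverse of a rotation matrix is its transpose (worldToModel.rotation above).
    return np.asarray(root_rotation).T @ relative

# A body moving with the root at equal velocity has zero model-space velocity:
assert np.allclose(model_space_velocity(np.eye(3), [1, 0, 0], [1, 0, 0]), 0)
```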

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (28 lines changed)


/// Initialize given a root RigidBody.
/// </summary>
/// <param name="rootBody"></param>
public RigidBodyPoseExtractor(Rigidbody rootBody, GameObject rootGameObject = null)
{
    Rigidbody[] rbs;
if (rootGameObject == null)
{
rbs = rootBody.GetComponentsInChildren<Rigidbody>();
}
else
{
rbs = rootGameObject.GetComponentsInChildren<Rigidbody>();
}
var bodyToIndex = new Dictionary<Rigidbody, int>(rbs.Length);
var parentIndices = new int[rbs.Length];

SetParentIndices(parentIndices);
}
/// <summary>
/// Get the pose of the i'th RigidBody.
/// </summary>
/// <param name="index"></param>
/// <returns></returns>
/// <inheritdoc/>
protected override Vector3 GetLinearVelocityAt(int index)
{
return m_Bodies[index].velocity;
}
/// <inheritdoc/>
}
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/PoseExtractorTests.cs (11 lines changed)


return Pose.identity;
}
protected override Vector3 GetLinearVelocityAt(int index)
{
return Vector3.zero;
}
public void Init(int[] parentIndices)
{
SetParentIndices(parentIndices);

position = translation
};
}
protected override Vector3 GetLinearVelocityAt(int index)
{
return Vector3.zero;
}
}
[Test]

com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodyPoseExtractorTests.cs (1 line changed)


using System.Collections.Generic;
using UnityEngine;
using NUnit.Framework;
using Unity.MLAgents.Extensions.Sensors;

com.unity.ml-agents.extensions/Tests/Editor/Unity.ML-Agents.Extensions.EditorTests.asmdef (8 lines changed)


"name": "Unity.ML-Agents.Extensions.EditorTests",
"references": [
"Unity.ML-Agents.Extensions.Editor",
"Unity.ML-Agents.Extensions"
"Unity.ML-Agents.Extensions",
"Unity.ML-Agents"
],
"optionalUnityReferences": [
"TestAssemblies"

],
"excludePlatforms": []
"excludePlatforms": [],
"defineConstraints": [
"UNITY_INCLUDE_TESTS"
]
}

com.unity.ml-agents/CHANGELOG.md (13 lines changed)


and this project adheres to
[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Major Changes
### Minor Changes
### Bug Fixes
#### com.unity.ml-agents (C#)
- Academy.EnvironmentStep() will now throw an exception if it is called
recursively (for example, by an Agent's CollectObservations method).
Previously, this would result in an infinite loop and cause the editor to hang.
(#4226)
## [1.2.0-preview] - 2020-07-15
### Major Changes

com.unity.ml-agents/Runtime/Academy.cs (69 lines changed)


// Flag used to keep track of the first time the Academy is reset.
bool m_HadFirstReset;
// Whether the Academy is in the middle of a step. This is used to detect an Academy
// step called by user code that is itself invoked by the Academy.
bool m_IsStepping;
// Random seed used for inference.
int m_InferenceSeed;

/// </summary>
public void EnvironmentStep()
{
    // Check whether we're already in the middle of a step.
    // This shouldn't happen generally, but could happen if user code (e.g. CollectObservations)
    // that is called by EnvironmentStep() also calls EnvironmentStep(). This would result
    // in an infinite loop and/or stack overflow, so stop it before it happens.
    if (m_IsStepping)
    {
        throw new UnityAgentsException(
            "Academy.EnvironmentStep() called recursively. " +
            "This might happen if you call EnvironmentStep() from custom code such as " +
            "CollectObservations() or OnActionReceived()."
        );
    }

    m_IsStepping = true;
    try
    {
        if (!m_HadFirstReset)
        {
            ForcedFullReset();
        }

        AgentPreStep?.Invoke(m_StepCount);

        m_StepCount += 1;
        m_TotalStepCount += 1;
        AgentIncrementStep?.Invoke();

        using (TimerStack.Instance.Scoped("AgentSendState"))
        {
            AgentSendState?.Invoke();
        }

        using (TimerStack.Instance.Scoped("DecideAction"))
        {
            DecideAction?.Invoke();
        }

        // If the communicator is not on, we need to clear the SideChannel sending queue
        if (!IsCommunicatorOn)
        {
            SideChannelManager.GetSideChannelMessage();
        }

        using (TimerStack.Instance.Scoped("AgentAct"))
        {
            AgentAct?.Invoke();
        }
    }
    finally
    {
        // Reset m_IsStepping when we're done (or if an exception occurred).
        m_IsStepping = false;
    }
}
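The change above is a standard reentrancy guard. A minimal Python sketch of the same pattern (illustrative, not part of ml-agents):

```python
class Stepper:
    """Minimal reentrancy guard, mirroring Academy.EnvironmentStep() above."""

    def __init__(self):
        self._is_stepping = False

    def step(self):
        if self._is_stepping:
            # Same failure Academy now reports instead of looping forever.
            raise RuntimeError("step() called recursively")
        self._is_stepping = True
        try:
            self.on_step()  # user callback; might wrongly call step() again
        finally:
            # Always clear the flag, even if on_step() raised.
            self._is_stepping = False

    def on_step(self):
        pass
```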

com.unity.ml-agents/Tests/Editor/AcademyTests.cs (32 lines changed)


using NUnit.Framework;
using Unity.MLAgents.Sensors;
using UnityEngine;
#if UNITY_2019_3_OR_NEWER
using System.Reflection;

Assert.AreEqual("com.unity.ml-agents", packageInfo.name);
Assert.AreEqual(Academy.k_PackageVersion, packageInfo.version);
#endif
}
class RecursiveAgent : Agent
{
int m_collectObsCount;
public override void CollectObservations(VectorSensor sensor)
{
m_collectObsCount++;
if (m_collectObsCount == 1)
{
// NEVER DO THIS IN REAL CODE!
Academy.Instance.EnvironmentStep();
}
}
}
[Test]
public void TestRecursiveStepThrows()
{
var gameObj = new GameObject();
var agent = gameObj.AddComponent<RecursiveAgent>();
agent.LazyInitialize();
agent.RequestDecision();
Assert.Throws<UnityAgentsException>(() =>
{
Academy.Instance.EnvironmentStep();
});
// Make sure the Academy reset to a good state and is still steppable.
Academy.Instance.EnvironmentStep();
}

com.unity.ml-agents/Tests/Editor/Sensor/VectorSensorTests.cs (22 lines changed)


namespace Unity.MLAgents.Tests
{
public static class SensorTestHelper
string errorMessage;
bool isOK = SensorHelper.CompareObservation(sensor, expected, out errorMessage);
Assert.IsTrue(isOK, errorMessage);
}
}

docs/Training-ML-Agents.md (4 lines changed)


blocks. See [Profiling in Python](Profiling-Python.md) for more information
on the timers generated.
These artifacts are updated throughout the training
process and finalized when training completes or is interrupted.
#### Stopping and Resuming Training

gym-unity/gym_unity/__init__.py (4 lines changed)


# Version of the library that will be used to upload to pypi
__version__ = "0.19.0.dev0"  # was "0.18.0"
__release_tag__ = None  # was "release_4"

ml-agents-envs/mlagents_envs/__init__.py (4 lines changed)


# Version of the library that will be used to upload to pypi
__version__ = "0.19.0.dev0"  # was "0.18.0"
__release_tag__ = None  # was "release_4"

ml-agents/mlagents/model_serialization.py (20 lines changed)


def export_policy_model(
    output_filepath: str,
    settings: SerializationSettings,
    graph: tf.Graph,
    sess: tf.Session,
) -> None:
    """
    Exports a TF graph for a Policy to .nn and/or .onnx format for Unity embedding.
    :param output_filepath: file path to output the model (without file suffix)
    :param settings: SerializationSettings describing how to export the model
    :param graph: Tensorflow Graph for the policy
    :param sess: Tensorflow session for the policy
    """
if not os.path.exists(settings.model_path):
os.makedirs(settings.model_path)
# Save frozen graph
frozen_graph_def_path = settings.model_path + "/frozen_graph_def.pb"
with gfile.GFile(frozen_graph_def_path, "wb") as f:

if settings.convert_to_barracuda:
    tf2bc.convert(frozen_graph_def_path, f"{output_filepath}.nn")
    logger.info(f"Exported {output_filepath}.nn")
# Save to onnx too (if we were able to import it)
if ONNX_EXPORT_ENABLED:

onnx_output_path = f"{output_filepath}.onnx"
with open(onnx_output_path, "wb") as f:
f.write(onnx_graph.SerializeToString())
logger.info(f"Converting to {onnx_output_path}")

ml-agents/mlagents/trainers/__init__.py (4 lines changed)


# Version of the library that will be used to upload to pypi
__version__ = "0.19.0.dev0"  # was "0.18.0"
__release_tag__ = None  # was "release_4"

ml-agents/mlagents/trainers/ghost/trainer.py (17 lines changed)


self.current_policy_snapshot: Dict[str, List[float]] = {}
self.snapshot_counter: int = 0
self.policies: Dict[str, TFPolicy] = {}
# wrapped_training_team and learning team need to be separate
# in the situation where new agents are created destroyed

"""
self.trainer.end_episode()
def save_model(self) -> None:
    """
    Forwarding call to the wrapped trainer's save_model.
    """
    self.trainer.save_model()
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec

ml-agents/mlagents/trainers/learn.py (1 line changed)


GlobalTrainingStatus.load_state(
os.path.join(run_logs_dir, "training_status.json")
)
# Configure CSV, Tensorboard Writers and StatsReporter
# We assume reward and episode length are needed in the CSV.
csv_writer = CSVWriter(

ml-agents/mlagents/trainers/policy/tf_policy.py (31 lines changed)


from typing import Any, Dict, List, Optional, Tuple
import abc
import os
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException

"""
return list(self.update_dict.keys())
def checkpoint(self, checkpoint_path: str, settings: SerializationSettings) -> None:
    """
    Checkpoints the policy on disk.
    :param checkpoint_path: filepath to write the checkpoint
    :param settings: SerializationSettings for exporting the model.
    """
    # Save the TF checkpoint and graph definition
    if self.saver:
        self.saver.save(self.sess, f"{checkpoint_path}.ckpt")
    # also save the policy so we have optimized model files for each checkpoint
    self.save(checkpoint_path, settings)
def save(self, output_filepath: str, settings: SerializationSettings) -> None:
"""
Saves the serialized model, given a path and SerializationSettings
This method will save the policy graph to the given filepath. The path
should be provided without an extension as multiple serialized model formats
may be generated as a result.
:param output_filepath: path (without suffix) for the model file(s)
:param settings: SerializationSettings for how to save the model.
"""
export_policy_model(output_filepath, settings, self.graph, self.sess)
def update_normalization(self, vector_obs: np.ndarray) -> None:
"""

ml-agents/mlagents/trainers/ppo/trainer.py (1 line changed)


if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

ml-agents/mlagents/trainers/sac/trainer.py (22 lines changed)


import os
import numpy as np
from mlagents.trainers.policy.checkpoint_manager import NNCheckpoint
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed

self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
def _checkpoint(self) -> NNCheckpoint:
    """
    Writes a checkpoint model to memory.
    Overrides the default to save the replay buffer.
    """
    ckpt = super()._checkpoint()
    if self.checkpoint_replay_buffer:
        self.save_replay_buffer()
    return ckpt

def save_model(self) -> None:
    """
    Saves the final training model to memory.
    Overrides the default to save the replay buffer.
    """
    super().save_model()
    if self.checkpoint_replay_buffer:
        self.save_replay_buffer()

) -> None:
"""
Adds policy to trainer.
:param brain_parameters: specifications for policy construction
"""
if self.policy:
logger.warning(

if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.policy = policy
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (12 lines changed)


from mlagents.trainers.tests.test_nn_policy import create_policy_mock
from mlagents.trainers.settings import TrainerSettings
from mlagents.tf_utils import tf
from mlagents.model_serialization import SerializationSettings
def test_barracuda_converter():

use_discrete=discrete,
use_visual=visual,
)
settings = SerializationSettings(policy.model_path, "MockBrain")
checkpoint_path = f"{tmpdir}/MockBrain-1"
policy.checkpoint(checkpoint_path, settings)
assert os.path.isfile(checkpoint_path + ".nn")
assert os.path.getsize(checkpoint_path + ".nn") > 100

ml-agents/mlagents/trainers/tests/test_config_conversion.py (4 lines changed)


if trainer_type == TrainerType.PPO:
trainer_config = PPO_CONFIG
trainer_settings_type = PPOSettings
else:
old_config = yaml.safe_load(trainer_config)
old_config[BRAIN_NAME]["use_recurrent"] = use_recurrent
new_config = convert_behaviors(old_config)

ml-agents/mlagents/trainers/tests/test_nn_policy.py (7 lines changed)


import tempfile
import numpy as np
from mlagents.model_serialization import SerializationSettings
from mlagents.tf_utils import tf

policy = create_policy_mock(trainer_params, model_path=path1)
policy.initialize_or_load()
policy._set_step(2000)
mock_brain_name = "MockBrain"
checkpoint_path = f"{policy.model_path}/{mock_brain_name}-2000"
serialization_settings = SerializationSettings(policy.model_path, mock_brain_name)
policy.checkpoint(checkpoint_path, serialization_settings)
assert len(os.listdir(tmp_path)) > 0

ml-agents/mlagents/trainers/tests/test_ppo.py (27 lines changed)


from mlagents.tf_utils import tf
import copy
import attr
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer

5 # 10 hacked because this function is no longer called through trainer
)
policy_mock.increment_step = mock.Mock(return_value=step_count)
trainer.add_policy("testbehavior", policy_mock)
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
trainer.add_policy(behavior_id, policy_mock)
trainer._increment_step(5, "testbehavior")
trainer._increment_step(5, trainer.brain_name)
policy_mock.increment_step.assert_called_with(5)
assert trainer.step == step_count
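The tests in this commit switch add_policy from raw strings to BehaviorIdentifiers. A sketch of the name parsing involved (illustrative; the real implementation lives in mlagents.trainers.behavior_id_utils):

```python
def parse_name_behavior_id(name_behavior_id: str):
    """Split a fully-qualified behavior id like "Walker?team=0" into
    (brain_name, team_id); ids without a team suffix default to team 0."""
    if "?team=" in name_behavior_id:
        brain_name, team = name_behavior_id.split("?team=")
        return brain_name, int(team)
    return name_behavior_id, 0

assert parse_name_behavior_id("Walker?team=0") == ("Walker", 0)
assert parse_name_behavior_id("testbehavior") == ("testbehavior", 0)
```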

dummy_config, curiosity_dummy_config, use_discrete # noqa: F811
):
mock_behavior_spec = mb.setup_test_behavior_specs(
use_discrete,
False,
vector_action_space=DISCRETE_ACTION_SPACE

# Test curiosity reward signal
trainer_params.reward_signals = curiosity_dummy_config
mock_brain_name = "MockBrain"
behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
policy = trainer.create_policy("test", mock_brain)
trainer.add_policy("test", policy)
policy = trainer.create_policy(behavior_id, mock_behavior_spec)
trainer.add_policy(behavior_id, policy)
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
# Mock out reward signal eval
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
buffer["extrinsic_returns"] = buffer["environment_rewards"]

vector_action_space=DISCRETE_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
)
mock_brain_name = "MockBrain"
behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
policy = trainer.create_policy("test_brain", behavior_spec)
trainer.add_policy("test_brain", policy)
policy = trainer.create_policy(behavior_id, behavior_spec)
trainer.add_policy(behavior_id, policy)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
time_horizon = 15

policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000
trainer.add_policy("test_policy", policy)
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
trainer.add_policy(behavior_id, policy)
assert trainer.get_policy("test_policy") == policy
# Make sure the summary steps were loaded properly

policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy("test_policy", policy)
trainer.add_policy(behavior_id, policy)
if __name__ == "__main__":

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (43 lines changed)


from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.checkpoint_manager import NNCheckpoint
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer
from mlagents.trainers.agent_processor import AgentManagerQueue

def _update_policy(self):
return self.update_policy
def add_policy(self, mock_behavior_id, mock_policy):
self.policies[mock_behavior_id] = mock_policy
def create_policy(self):
return mock.Mock()

assert len(arr) == 0
@mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
def test_advance(mocked_clear_update_buffer, mocked_save_model):
mock_policy = mock.Mock()
mock_policy.model_path = "mock_model_path"
trainer.add_policy("TestBrain", mock_policy)
trajectory_queue = AgentManagerQueue("testbrain")
policy_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

# Check that the buffer has been cleared
assert not trainer.should_still_train
assert mocked_clear_update_buffer.call_count > 0
assert mocked_save_model.call_count == 0
@mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
def test_summary_checkpoint(mock_write_summary, mock_save_model):
@mock.patch("mlagents.trainers.trainer.rl_trainer.NNCheckpointManager.add_checkpoint")
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
mock_policy = mock.Mock()
mock_policy.model_path = "mock_model_path"
trainer.add_policy("TestBrain", mock_policy)
trajectory_queue = AgentManagerQueue("testbrain")
policy_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

]
mock_write_summary.assert_has_calls(calls, any_order=True)
checkpoint_range = range(
checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
)
calls = [
mock.call(f"{mock_policy.model_path}/{trainer.brain_name}-{step}", mock.ANY)
for step in checkpoint_range
]
mock_policy.checkpoint.assert_has_calls(calls, any_order=True)
add_checkpoint_calls = [
mock.call(
trainer.brain_name,
NNCheckpoint(
step,
f"{mock_policy.model_path}/{trainer.brain_name}-{step}.nn",
None,
mock.ANY,
),
trainer.trainer_settings.keep_checkpoints,
)
for step in checkpoint_range
]
mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)

ml-agents/mlagents/trainers/tests/test_sac.py (31 lines changed)


import copy
from mlagents.tf_utils import tf
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.sac.optimizer import SACOptimizer

trainer_params = dummy_config
trainer_params.hyperparameters.save_replay_buffer = True
trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
policy = trainer.create_policy("test", mock_specs)
trainer.add_policy("test", policy)
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
policy = trainer.create_policy(behavior_id, mock_specs)
trainer.add_policy(behavior_id, policy)
trainer.save_model()
policy = trainer2.create_policy("test", mock_specs)
trainer2.add_policy("test", policy)
policy = trainer2.create_policy(behavior_id, mock_specs)
trainer2.add_policy(behavior_id, policy)
assert trainer2.update_buffer.num_experiences == buffer_len

trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000
trainer.add_policy("test", policy)
assert trainer.get_policy("test") == policy
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
trainer.add_policy(behavior_id, policy)
assert trainer.get_policy(behavior_id.behavior_id) == policy
# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000

with pytest.raises(RuntimeError):
trainer.add_policy("test", policy)
trainer.add_policy(behavior_id, policy)
def test_advance(dummy_config):

dummy_config.hyperparameters.reward_signal_steps_per_update = 20
dummy_config.hyperparameters.buffer_init_steps = 0
trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy("test", specs)
trainer.add_policy("test", policy)
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
policy = trainer.create_policy(behavior_id, specs)
trainer.add_policy(behavior_id, policy)
trajectory_queue = AgentManagerQueue("testbrain")
policy_queue = AgentManagerQueue("testbrain")

# Call add_policy and check that we update the correct number of times.
# This is to emulate a load from checkpoint.
policy = trainer.create_policy("test", specs)
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
policy = trainer.create_policy(behavior_id, specs)
trainer.add_policy("test", policy)
trainer.add_policy(behavior_id, policy)
trainer.optimizer.update = mock.Mock()
trainer.optimizer.update_reward_signals = mock.Mock()
trainer.optimizer.update_reward_signals.return_value = {}

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (8 lines changed)


tc.advance.side_effect = take_step_sideeffect
tc._save_models = MagicMock()
return tc, trainer_mock

tf_reset_graph.assert_called_once()
env_mock.reset.assert_called_once()
assert tc.advance.call_count == 11
tc._save_models.assert_not_called()
@patch.object(tf, "reset_default_graph")

tf_reset_graph.assert_called_once()
env_mock.reset.assert_called_once()
assert tc.advance.call_count == trainer_mock.get_max_steps + 1
tc._save_models.assert_called_once()
@pytest.fixture

ml-agents/mlagents/trainers/tests/test_training_status.py (64 lines changed)


import unittest
import json
from enum import Enum
import time
)
from mlagents.trainers.policy.checkpoint_manager import (
NNCheckpointManager,
NNCheckpoint,
)

)
assert unknown_category is None
assert unknown_key is None
def test_model_management(tmpdir):
results_path = os.path.join(tmpdir, "results")
brain_name = "Mock_brain"
final_model_path = os.path.join(results_path, brain_name)
test_checkpoint_list = [
{
"steps": 1,
"file_path": os.path.join(final_model_path, f"{brain_name}-1.nn"),
"reward": 1.312,
"creation_time": time.time(),
},
{
"steps": 2,
"file_path": os.path.join(final_model_path, f"{brain_name}-2.nn"),
"reward": 1.912,
"creation_time": time.time(),
},
{
"steps": 3,
"file_path": os.path.join(final_model_path, f"{brain_name}-3.nn"),
"reward": 2.312,
"creation_time": time.time(),
},
]
GlobalTrainingStatus.set_parameter_state(
brain_name, StatusType.CHECKPOINTS, test_checkpoint_list
)
new_checkpoint_4 = NNCheckpoint(
4, os.path.join(final_model_path, f"{brain_name}-4.nn"), 2.678, time.time()
)
NNCheckpointManager.add_checkpoint(brain_name, new_checkpoint_4, 4)
assert len(NNCheckpointManager.get_checkpoints(brain_name)) == 4
new_checkpoint_5 = NNCheckpoint(
5, os.path.join(final_model_path, f"{brain_name}-5.nn"), 3.122, time.time()
)
NNCheckpointManager.add_checkpoint(brain_name, new_checkpoint_5, 4)
assert len(NNCheckpointManager.get_checkpoints(brain_name)) == 4
final_model_path = f"{final_model_path}.nn"
final_model_time = time.time()
current_step = 6
final_model = NNCheckpoint(current_step, final_model_path, 3.294, final_model_time)
NNCheckpointManager.track_final_checkpoint(brain_name, final_model)
assert len(NNCheckpointManager.get_checkpoints(brain_name)) == 4
check_checkpoints = GlobalTrainingStatus.saved_state[brain_name][
StatusType.CHECKPOINTS.value
]
assert check_checkpoints is not None
final_model = GlobalTrainingStatus.saved_state[StatusType.FINAL_CHECKPOINT.value]
assert final_model is not None
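This test pins down the rolling-eviction behavior: with keep_checkpoints=4, adding a fifth checkpoint drops the oldest. A minimal sketch of that policy (illustrative; the real NNCheckpointManager also deletes evicted .nn files from disk):

```python
def add_checkpoint(checkpoints, new_checkpoint, keep_checkpoints):
    """Append new_checkpoint, evicting the oldest entries beyond the cap."""
    checkpoints.append(new_checkpoint)
    while len(checkpoints) > keep_checkpoints:
        checkpoints.pop(0)  # oldest first; the real manager removes its file too
    return checkpoints

history = [1, 2, 3]
add_checkpoint(history, 4, keep_checkpoints=4)
assert history == [1, 2, 3, 4]
add_checkpoint(history, 5, keep_checkpoints=4)
assert history == [2, 3, 4, 5]
```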
class StatsMetaDataTest(unittest.TestCase):

ml-agents/mlagents/trainers/trainer/rl_trainer.py (66 lines changed)


# # Unity ML-Agents Toolkit
import os
from typing import Dict, List, Optional
import attr
from mlagents.model_serialization import SerializationSettings
from mlagents.trainers.policy.checkpoint_manager import (
NNCheckpoint,
NNCheckpointManager,
)
from mlagents_envs.timers import timed
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer

"""
return False
def _policy_mean_reward(self) -> Optional[float]:
""" Returns the mean episode reward for the current policy. """
rewards = self.cumulative_returns_since_policy_update
if len(rewards) == 0:
return None
else:
return sum(rewards) / len(rewards)
@timed
def _checkpoint(self) -> NNCheckpoint:
"""
Checkpoints the policy associated with this trainer.
"""
n_policies = len(self.policies.keys())
if n_policies > 1:
logger.warning(
"Trainer has multiple policies, but default behavior only saves the first."
)
policy = list(self.policies.values())[0]
model_path = policy.model_path
settings = SerializationSettings(model_path, self.brain_name)
checkpoint_path = os.path.join(model_path, f"{self.brain_name}-{self.step}")
policy.checkpoint(checkpoint_path, settings)
new_checkpoint = NNCheckpoint(
int(self.step),
f"{checkpoint_path}.nn",
self._policy_mean_reward(),
time.time(),
)
NNCheckpointManager.add_checkpoint(
self.brain_name, new_checkpoint, self.trainer_settings.keep_checkpoints
)
return new_checkpoint
def save_model(self) -> None:
"""
Saves the policy associated with this trainer.
"""
n_policies = len(self.policies.keys())
if n_policies > 1:
logger.warning(
"Trainer has multiple policies, but default behavior only saves the first."
)
policy = list(self.policies.values())[0]
settings = SerializationSettings(policy.model_path, self.brain_name)
model_checkpoint = self._checkpoint()
final_checkpoint = attr.evolve(
model_checkpoint, file_path=f"{policy.model_path}.nn"
)
policy.save(policy.model_path, settings)
NNCheckpointManager.track_final_checkpoint(self.brain_name, final_checkpoint)
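Taken together, _checkpoint() and save_model() above produce a predictable on-disk layout; a small sketch (paths illustrative):

```python
model_path, brain_name, step = "results/run1/MyBehavior", "MyBehavior", 50000

# Each _checkpoint() writes a step-stamped pair under the model directory:
ckpt_base = f"{model_path}/{brain_name}-{step}"  # + ".ckpt" and ".nn"
# save_model() then records the final, un-stamped model next to it:
final_model = f"{model_path}.nn"

assert ckpt_base == "results/run1/MyBehavior/MyBehavior-50000"
assert final_model == "results/run1/MyBehavior.nn"
```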
@abc.abstractmethod
def _update_policy(self) -> bool:
"""