浏览代码

Merge branch 'master' into develop-sampler-refactor

/sampler-refactor-copy
Andrew Cohen 5 年前
当前提交
c0f7052b
共有 82 个文件被更改,包括 9955 次插入4080 次删除
  1. 2
      .pylintrc
  2. 3
      Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/JointDriveController.cs
  3. 7
      Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs
  4. 162
      Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs
  5. 2
      Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn.meta
  6. 1001
      Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn
  7. 5
      Project/Assets/ML-Agents/Examples/WallJump/Materials/TransparentWall.mat
  8. 2
      Project/ProjectSettings/DynamicsManager.asset
  9. 18
      com.unity.ml-agents/CHANGELOG.md
  10. 10
      com.unity.ml-agents/Runtime/Academy.cs
  11. 8
      com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs
  12. 12
      com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs
  13. 4
      com.unity.ml-agents/Runtime/Sensors/Reflection/EnumReflectionSensor.cs
  14. 2
      com.unity.ml-agents/package.json
  15. 2
      config/ppo/SoccerTwos.yaml
  16. 4
      config/ppo/StrikersVsGoalie.yaml
  17. 6
      config/ppo/Tennis.yaml
  18. 2
      config/ppo/WalkerStatic.yaml
  19. 2
      config/sac/WalkerDynamic.yaml
  20. 274
      docs/Learning-Environment-Design-Agents.md
  21. 10
      docs/Learning-Environment-Design.md
  22. 21
      docs/Learning-Environment-Examples.md
  23. 12
      docs/Migrating.md
  24. 3
      docs/Training-Configuration-File.md
  25. 7
      docs/Training-ML-Agents.md
  26. 3
      docs/Using-Tensorboard.md
  27. 999
      docs/images/walker.png
  28. 2
      gym-unity/gym_unity/__init__.py
  29. 2
      markdown-link-check.full.json
  30. 2
      ml-agents-envs/mlagents_envs/__init__.py
  31. 12
      ml-agents-envs/mlagents_envs/registry/binary_utils.py
  32. 32
      ml-agents/mlagents/model_serialization.py
  33. 2
      ml-agents/mlagents/trainers/__init__.py
  34. 7
      ml-agents/mlagents/trainers/cli_utils.py
  35. 2
      ml-agents/mlagents/trainers/ghost/trainer.py
  36. 1
      ml-agents/mlagents/trainers/learn.py
  37. 21
      ml-agents/mlagents/trainers/ppo/trainer.py
  38. 1
      ml-agents/mlagents/trainers/sac/trainer.py
  39. 4
      ml-agents/mlagents/trainers/settings.py
  40. 8
      ml-agents/mlagents/trainers/tests/test_learn.py
  41. 1
      ml-agents/mlagents/trainers/tests/test_ppo.py
  42. 49
      ml-agents/mlagents/trainers/tests/test_rl_trainer.py
  43. 1
      ml-agents/mlagents/trainers/tests/test_sac.py
  44. 2
      ml-agents/mlagents/trainers/tests/test_simple_rl.py
  45. 2
      ml-agents/mlagents/trainers/tests/test_trainer_controller.py
  46. 38
      ml-agents/mlagents/trainers/trainer/rl_trainer.py
  47. 4
      ml-agents/mlagents/trainers/trainer/trainer.py
  48. 22
      ml-agents/mlagents/trainers/trainer_controller.py
  49. 4
      utils/validate_meta_files.py
  50. 297
      Project/Assets/ML-Agents/Examples/SharedAssets/Prefabs/OrientationCube.prefab
  51. 7
      Project/Assets/ML-Agents/Examples/SharedAssets/Prefabs/OrientationCube.prefab.meta
  52. 27
      Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/DirectionIndicator.cs
  53. 11
      Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/DirectionIndicator.cs.meta
  54. 1001
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalkerDyna.demo
  55. 10
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalkerDyna.demo.meta
  56. 1001
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalkerStat.demo
  57. 10
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalkerStat.demo.meta
  58. 841
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/DynamicPlatformWalker.prefab
  59. 7
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/DynamicPlatformWalker.prefab.meta
  60. 1001
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerRagdoll.prefab
  61. 7
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerRagdoll.prefab.meta
  62. 485
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerWithTargetPair.prefab
  63. 1001
      Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerDynamic.unity
  64. 7
      Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerDynamic.unity.meta
  65. 1001
      Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerStatic.unity
  66. 1001
      Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerStatic.nn
  67. 11
      Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerStatic.nn.meta
  68. 27
      config/ppo/WalkerDynamic.yaml
  69. 29
      config/sac/WalkerStatic.yaml
  70. 191
      ml-agents/mlagents/trainers/tests/test_config_conversion.py
  71. 137
      ml-agents/mlagents/trainers/upgrade_config.py
  72. 1001
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo
  73. 10
      Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo.meta
  74. 1001
      Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerPair.prefab
  75. 1001
      Project/Assets/ML-Agents/Examples/Walker/Scenes/Walker.unity
  76. 110
      config/upgrade_config.py
  77. 0
      /Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerWithTargetPair.prefab.meta
  78. 0
      /Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerStatic.unity.meta
  79. 0
      /Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn.meta
  80. 0
      /Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn
  81. 0
      /config/ppo/WalkerStatic.yaml
  82. 0
      /config/sac/WalkerDynamic.yaml

2
.pylintrc


[MASTER]
# Add files or directories to the blacklist. They should be base names, not
# Add files or directories to the ignore list. They should be base names, not
# paths.
ignore=CVS

3
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/JointDriveController.cs


[HideInInspector] public Dictionary<Transform, BodyPart> bodyPartsDict = new Dictionary<Transform, BodyPart>();
[HideInInspector] public List<BodyPart> bodyPartsList = new List<BodyPart>();
const float k_MaxAngularVelocity = 50.0f;
/// <summary>
/// Create BodyPart object and add it to dictionary.

startingPos = t.position,
startingRot = t.rotation
};
bp.rb.maxAngularVelocity = 100;
bp.rb.maxAngularVelocity = k_MaxAngularVelocity;
// Add & setup the ground contact script
bp.groundContact = t.GetComponent<GroundContact>();

7
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs


public class ProjectSettingsOverrides : MonoBehaviour
{
// Original values
bool m_OriginalReuseCollisionCallbacks;
[Tooltip("Increase or decrease the scene gravity. Use ~3x to make things less floaty")]
public float gravityMultiplier = 1.0f;

public int solverIterations = 6;
[Tooltip("Affects how accurately the Rigidbody joints and collision contacts are resolved. (default 1). Must be positive.")]
public int solverVelocityIterations = 1;
[Tooltip("Determines whether the garbage collector should reuse only a single instance of a Collision type for all collision callbacks. Reduces Garbage.")]
public bool reuseCollisionCallbacks = true;
public void Awake()
{

m_OriginalMaximumDeltaTime = Time.maximumDeltaTime;
m_OriginalSolverIterations = Physics.defaultSolverIterations;
m_OriginalSolverVelocityIterations = Physics.defaultSolverVelocityIterations;
m_OriginalReuseCollisionCallbacks = Physics.reuseCollisionCallbacks ;
// Override
Physics.gravity *= gravityMultiplier;

Physics.defaultSolverVelocityIterations = solverVelocityIterations;
Physics.reuseCollisionCallbacks = reuseCollisionCallbacks;
// Make sure the Academy singleton is initialized first, since it will create the SideChannels.
Academy.Instance.EnvironmentParameters.RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });

Time.maximumDeltaTime = m_OriginalMaximumDeltaTime;
Physics.defaultSolverIterations = m_OriginalSolverIterations;
Physics.defaultSolverVelocityIterations = m_OriginalSolverVelocityIterations;
Physics.reuseCollisionCallbacks = m_OriginalReuseCollisionCallbacks;
}
}
}

162
Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs


using MLAgentsExamples;
using BodyPart = Unity.MLAgentsExamples.BodyPart;
[Header("Specific to Walker")]
[Header("Target To Walk Towards")]
[Space(10)]
public Transform target;
public float maximumWalkingSpeed = 999; //The max walk velocity magnitude an agent will be rewarded for
Vector3 m_WalkDir; //Direction to the target
Quaternion m_WalkDirLookRot; //Will hold the rotation to our target
[Header("Target To Walk Towards")] [Space(10)]
public float targetSpawnRadius; //The radius in which a target can be randomly spawned.
public Transform target; //Target the agent will walk towards.
public Transform ground; //Ground gameobject. The height will be used for target spawning
public bool detectTargets; //Should this agent detect targets
public bool respawnTargetWhenTouched; //Should the target respawn to a different position when touched
Vector3 m_DirToTarget;
public Transform hips;
[Header("Body Parts")] [Space(10)] public Transform hips;
public Transform chest;
public Transform spine;
public Transform head;

public Transform armR;
public Transform forearmR;
public Transform handR;
JointDriveController m_JdController;
[Header("Orientation")] [Space(10)]
//This will be used as a stable reference point for observations
//Because ragdolls can move erratically, using a standalone reference point can significantly improve learning
public GameObject orientationCube;
Rigidbody m_HipsRb;
Rigidbody m_ChestRb;
Rigidbody m_SpineRb;
JointDriveController m_JdController;
UpdateOrientationCube();
m_JdController = GetComponent<JointDriveController>();
m_JdController.SetupBodyPart(hips);
m_JdController.SetupBodyPart(chest);

m_JdController.SetupBodyPart(forearmR);
m_JdController.SetupBodyPart(handR);
m_HipsRb = hips.GetComponent<Rigidbody>();
m_ChestRb = chest.GetComponent<Rigidbody>();
m_SpineRb = spine.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.EnvironmentParameters;
SetResetParameters();

/// </summary>
public void CollectObservationBodyPart(BodyPart bp, VectorSensor sensor)
{
var rb = bp.rb;
//GROUND CHECK
sensor.AddObservation(rb.velocity);
sensor.AddObservation(rb.angularVelocity);
var localPosRelToHips = hips.InverseTransformPoint(rb.position);
sensor.AddObservation(localPosRelToHips);
//Get velocities in the context of our orientation cube's space
//Note: You can get these velocities in world space as well but it may not train as well.
sensor.AddObservation(orientationCube.transform.InverseTransformDirection(bp.rb.velocity));
sensor.AddObservation(orientationCube.transform.InverseTransformDirection(bp.rb.angularVelocity));
if (bp.rb.transform != hips && bp.rb.transform != handL && bp.rb.transform != handR &&
bp.rb.transform != footL && bp.rb.transform != footR && bp.rb.transform != head)
//Get position relative to hips in the context of our orientation cube's space
sensor.AddObservation(orientationCube.transform.InverseTransformDirection(bp.rb.position - hips.position));
if (bp.rb.transform != hips && bp.rb.transform != handL && bp.rb.transform != handR)
sensor.AddObservation(bp.currentXNormalizedRot);
sensor.AddObservation(bp.currentYNormalizedRot);
sensor.AddObservation(bp.currentZNormalizedRot);
sensor.AddObservation(bp.rb.transform.localRotation);
sensor.AddObservation(bp.currentStrength / m_JdController.maxJointForceLimit);
}
}

/// </summary>
public override void CollectObservations(VectorSensor sensor)
{
m_JdController.GetCurrentJointForces();
sensor.AddObservation(Quaternion.FromToRotation(hips.forward, orientationCube.transform.forward));
sensor.AddObservation(Quaternion.FromToRotation(head.forward, orientationCube.transform.forward));
sensor.AddObservation(m_DirToTarget.normalized);
sensor.AddObservation(m_JdController.bodyPartsDict[hips].rb.position);
sensor.AddObservation(hips.forward);
sensor.AddObservation(hips.up);
sensor.AddObservation(orientationCube.transform.InverseTransformPoint(target.position));
foreach (var bodyPart in m_JdController.bodyPartsDict.Values)
foreach (var bodyPart in m_JdController.bodyPartsList)
{
CollectObservationBodyPart(bodyPart, sensor);
}

bpDict[shinR].SetJointTargetRotation(vectorAction[++i], 0, 0);
bpDict[footR].SetJointTargetRotation(vectorAction[++i], vectorAction[++i], vectorAction[++i]);
bpDict[footL].SetJointTargetRotation(vectorAction[++i], vectorAction[++i], vectorAction[++i]);
bpDict[armL].SetJointTargetRotation(vectorAction[++i], vectorAction[++i], 0);
bpDict[armR].SetJointTargetRotation(vectorAction[++i], vectorAction[++i], 0);

bpDict[forearmL].SetJointStrength(vectorAction[++i]);
bpDict[armR].SetJointStrength(vectorAction[++i]);
bpDict[forearmR].SetJointStrength(vectorAction[++i]);
}
void UpdateOrientationCube()
{
//FACING DIR
m_WalkDir = target.position - orientationCube.transform.position;
m_WalkDir.y = 0; //flatten dir on the y
m_WalkDirLookRot = Quaternion.LookRotation(m_WalkDir); //get our look rot to the target
//UPDATE ORIENTATION CUBE POS & ROT
orientationCube.transform.position = hips.position;
orientationCube.transform.rotation = m_WalkDirLookRot;
if (detectTargets)
{
foreach (var bodyPart in m_JdController.bodyPartsDict.Values)
{
if (bodyPart.targetContact && bodyPart.targetContact.touchingTarget)
{
TouchedTarget();
}
}
}
UpdateOrientationCube();
var moveTowardsTargetReward = Vector3.Dot(orientationCube.transform.forward,
Vector3.ClampMagnitude(m_JdController.bodyPartsDict[hips].rb.velocity, maximumWalkingSpeed));
var lookAtTargetReward = Vector3.Dot(orientationCube.transform.forward, head.forward);
// d. Discourage head movement.
m_DirToTarget = target.position - m_JdController.bodyPartsDict[hips].rb.position;
var headHeightOverFeetReward = (head.position.y - footL.position.y) + (head.position.y - footR.position.y);
+0.03f * Vector3.Dot(m_DirToTarget.normalized, m_JdController.bodyPartsDict[hips].rb.velocity)
+ 0.01f * Vector3.Dot(m_DirToTarget.normalized, hips.forward)
+ 0.02f * (head.position.y - hips.position.y)
- 0.01f * Vector3.Distance(m_JdController.bodyPartsDict[head].rb.velocity,
m_JdController.bodyPartsDict[hips].rb.velocity)
+0.02f * moveTowardsTargetReward
+ 0.01f * lookAtTargetReward
+ 0.01f * headHeightOverFeetReward
/// Agent touched the target
/// </summary>
public void TouchedTarget()
{
AddReward(1f);
if (respawnTargetWhenTouched)
{
MoveTargetToRandomPosition();
}
}
/// <summary>
/// Moves target to a random position within specified radius.
/// </summary>
public void MoveTargetToRandomPosition()
{
var newTargetPos = Random.insideUnitSphere * targetSpawnRadius;
newTargetPos.y = 5;
target.position = newTargetPos + ground.position;
}
/// <summary>
if (m_DirToTarget != Vector3.zero)
//Reset all of the body parts
foreach (var bodyPart in m_JdController.bodyPartsDict.Values)
transform.rotation = Quaternion.LookRotation(m_DirToTarget);
bodyPart.Reset(bodyPart);
foreach (var bodyPart in m_JdController.bodyPartsDict.Values)
//Random start rotation to help generalize
transform.rotation = Quaternion.Euler(0, Random.Range(0.0f, 360.0f), 0);
UpdateOrientationCube();
if (detectTargets && respawnTargetWhenTouched)
bodyPart.Reset(bodyPart);
MoveTargetToRandomPosition();
m_ChestRb.mass = m_ResetParams.GetWithDefault("chest_mass", 8);
m_SpineRb.mass = m_ResetParams.GetWithDefault("spine_mass", 10);
m_HipsRb.mass = m_ResetParams.GetWithDefault("hip_mass", 15);
m_JdController.bodyPartsDict[chest].rb.mass = m_ResetParams.GetWithDefault("chest_mass", 8);
m_JdController.bodyPartsDict[spine].rb.mass = m_ResetParams.GetWithDefault("spine_mass", 8);
m_JdController.bodyPartsDict[hips].rb.mass = m_ResetParams.GetWithDefault("hip_mass", 8);
}
private void OnDrawGizmosSelected()
{
if (Application.isPlaying)
{
Gizmos.color = Color.green;
Gizmos.matrix = orientationCube.transform.localToWorldMatrix;
Gizmos.DrawWireCube(Vector3.zero, orientationCube.transform.localScale);
Gizmos.DrawRay(Vector3.zero, Vector3.forward);
}
}
}

2
Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn.meta


fileFormatVersion: 2
guid: 4e86a19e012da43bfa5ab97ae8089b98
guid: 2cb15010f7cbe4dc59418a5858c87819
ScriptedImporter:
fileIDToRecycleName:
11400000: main obj

1001
Project/Assets/ML-Agents/Examples/Walker/TFModels/WalkerDynamic.nn
文件差异内容过多而无法显示
查看文件

5
Project/Assets/ML-Agents/Examples/WallJump/Materials/TransparentWall.mat


Material:
serializedVersion: 6
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_Name: TransparentWall
m_Shader: {fileID: 46, guid: 0000000000000000f000000000000000, type: 0}
m_ShaderKeywords: _ALPHABLEND_ON

2
Project/ProjectSettings/DynamicsManager.asset


m_LayerCollisionMatrix: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffebffffffddffffffeffffffff5fffffffbffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
m_AutoSimulation: 1
m_AutoSyncTransforms: 1
m_ReuseCollisionCallbacks: 0
m_ReuseCollisionCallbacks: 1
m_ClothInterCollisionSettingsToggle: 0
m_ContactPairsMode: 0
m_BroadphaseType: 0

18
com.unity.ml-agents/CHANGELOG.md


[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
### Bug Fixes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
## [1.1.0-preview] - 2020-06-10
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Added new Walker environments. Improved ragdoll stability/performance. (#4037)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- `beta` and `epsilon` in `PPO` are no longer decayed by default but follow the same schedule as learning rate. (#3940)
- `get_behavior_names()` and `get_behavior_spec()` on UnityEnvironment were replaced by the `behavior_specs` property. (#3946)

#### ml-agents / ml-agents-envs / gym-unity (Python)
- Unity Player logs are now written out to the results directory. (#3877)
- Run configuration YAML files are written out to the results directory at the end of the run. (#3815)
- The `--save-freq` CLI option has been removed, and replaced by a `checkpoint_interval` option in the trainer configuration YAML. (#4034)
- Fixed a bug in the onnx export that would cause constants needed for inference to not be visible to some versions of
the Barracuda importer. (#4073)
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)

10
com.unity.ml-agents/Runtime/Academy.cs


/// Unity package version of com.unity.ml-agents.
/// This must match the version string in package.json and is checked in a unit test.
/// </summary>
internal const string k_PackageVersion = "1.0.2-preview";
internal const string k_PackageVersion = "1.1.0-preview";
const int k_EditorTrainingPort = 5004;

port = port
}
);
Communicator.QuitCommandReceived += OnQuitCommandReceived;
Communicator.ResetCommandReceived += OnResetCommand;
}
if (Communicator != null)

"Will perform inference instead."
);
Communicator = null;
}
if (Communicator != null)
{
Communicator.QuitCommandReceived += OnQuitCommandReceived;
Communicator.ResetCommandReceived += OnResetCommand;
}
}

8
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


m_Client = new UnityToExternalProto.UnityToExternalProtoClient(channel);
var result = m_Client.Exchange(WrapMessage(unityOutput, 200));
unityInput = m_Client.Exchange(WrapMessage(null, 200)).UnityInput;
var inputMessage = m_Client.Exchange(WrapMessage(null, 200));
unityInput = inputMessage.UnityInput;
if (result.Header.Status != 200 || inputMessage.Header.Status != 200)
{
m_IsOpen = false;
QuitCommandReceived?.Invoke();
}
return result.UnityInput;
#else
throw new UnityAgentsException(

12
com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs


public enum ObservableAttributeOptions
{
/// <summary>
/// All ObservableAttributes on the Agent will be ignored. If there are no
/// ObservableAttributes on the Agent, this will result in the fastest
/// initialization time.
/// All ObservableAttributes on the Agent will be ignored. This is the
/// default behavior. If there are no ObservableAttributes on the
/// Agent, this will result in the fastest initialization time.
/// inherited are ignored. This is the default behavior, and a reasonable
/// tradeoff between performance and flexibility.
/// inherited are ignored. This is a reasonable tradeoff between
/// performance and flexibility.
/// </summary>
/// <remarks>This corresponds to setting the
/// [BindingFlags.DeclaredOnly](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.bindingflags?view=netcore-3.1)

/// <summary>
/// All members on the class will be examined. This can lead to slower
/// startup times
/// startup times.
/// </summary>
ExamineAll
}

4
com.unity.ml-agents/Runtime/Sensors/Reflection/EnumReflectionSensor.cs


namespace Unity.MLAgents.Sensors.Reflection
{
internal class EnumReflectionSensor: ReflectionSensorBase
internal class EnumReflectionSensor : ReflectionSensorBase
{
Array m_Values;
bool m_IsFlags;

var enumValue = (Enum)GetReflectedValue();
int i = 0;
foreach(var val in m_Values)
foreach (var val in m_Values)
{
if (m_IsFlags)
{

2
com.unity.ml-agents/package.json


{
"name": "com.unity.ml-agents",
"displayName": "ML Agents",
"version": "1.0.2-preview",
"version": "1.1.0-preview",
"unity": "2018.4",
"description": "Use state-of-the-art machine learning to create intelligent character behaviors in any Unity environment (games, robotics, film, etc.).",
"dependencies": {

2
config/ppo/SoccerTwos.yaml


self_play:
save_steps: 50000
team_change: 200000
swap_steps: 50000
swap_steps: 2000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

4
config/ppo/StrikersVsGoalie.yaml


self_play:
save_steps: 50000
team_change: 200000
swap_steps: 25000
swap_steps: 1000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

self_play:
save_steps: 50000
team_change: 200000
swap_steps: 100000
swap_steps: 4000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

6
config/ppo/Tennis.yaml


Tennis:
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2

self_play:
save_steps: 50000
team_change: 100000
swap_steps: 50000
swap_steps: 2000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

2
config/ppo/WalkerStatic.yaml


behaviors:
Walker:
WalkerStatic:
trainer_type: ppo
hyperparameters:
batch_size: 2048

2
config/sac/WalkerDynamic.yaml


behaviors:
Walker:
WalkerDynamic:
trainer_type: sac
hyperparameters:
learning_rate: 0.0003

274
docs/Learning-Environment-Design-Agents.md


- [Decisions](#decisions)
- [Observations and Sensors](#observations-and-sensors)
- [Generating Observations](#generating-observations)
- [Agent.CollectObservations()](#agentcollectobservations)
- [Observable Fields and Properties](#observable-fields-and-properties)
- [ISensor interface and SensorComponents](#isensor-interface-and-sensorcomponents)
- [Stacking](#stacking)
- [Vector Observation Summary & Best Practices](#vector-observation-summary--best-practices)
- [Visual Observations](#visual-observations)
- [Visual Observation Summary & Best Practices](#visual-observation-summary--best-practices)

write your own Policy. If the Agent has a `Model` file, its Policy will use the
neural network `Model` to take decisions.
When you create an Agent, you must extend the base Agent class. This includes
implementing the following methods:
When you create an Agent, you should usually extend the base Agent class. This
includes implementing the following methods:
including at the beginning of the simulation. The Ball3DAgent class uses this
function to reset the agent cube and ball to their starting positions. The
function randomizes the reset values so that the training generalizes to more
than a specific starting position and agent cube attitude.
- `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation
step. Responsible for collecting the Agent's observations of the environment.
Since the Behavior Parameters of the Agent are set with vector observation
space with a state size of 8, the `CollectObservations(VectorSensor sensor)`
must call `VectorSensor.AddObservation()` such that vector size adds up to 8.
including at the beginning of the simulation.
- `Agent.CollectObservations(VectorSensor sensor)` — Called every step that the Agent
requests a decision. This is one possible way for collecting the Agent's
observations of the environment; see [Generating Observations](#generating-observations)
below for more options.
take. Receives the action chosen by the Agent. The vector action spaces result
in a small change in the agent cube's rotation at each step. The
`OnActionReceived()` method assigns a reward to the Agent; in this example, an
Agent receives a small positive reward for each step it keeps the ball on the
agent cube's head and a larger, negative reward for dropping the ball. An
Agent's episode is also ended when it drops the ball so that it will reset
with a new ball for the next simulation step.
take. Receives the action chosen by the Agent. It is also common to assign a
reward in this method.
returns an array of floats. In the case of the Ball 3D Agent, the
`Heuristic()` method converts the keyboard inputs into actions.
writes to a provided array of floats.
As a concrete example, here is how the Ball3DAgent class implements these methods:
- `Agent.OnEpisodeBegin()` — Resets the agent cube and ball to their starting
positions. The function randomizes the reset values so that the training
generalizes to more than a specific starting position and agent cube
orientation.
- `Agent.CollectObservations(VectorSensor sensor)` — Adds information about the
orientation of the agent cube, the ball velocity, and the relative position
between the ball and the cube. Since the `CollectObservations()`
method calls `VectorSensor.AddObservation()` such that vector size adds up to 8,
the Behavior Parameters of the Agent are set with vector observation space
with a state size of 8.
- `Agent.OnActionReceived()` — The vector action spaces result
in a small change in the agent cube's rotation at each step. In this example,
an Agent receives a small positive reward for each step it keeps the ball on the
agent cube's head and a larger, negative reward for dropping the ball. An
Agent's episode is also ended when it drops the ball so that it will reset
with a new ball for the next simulation step.
- `Agent.Heuristic()` - Converts the keyboard inputs into actions.
## Decisions

should call `Agent.RequestDecision()` manually.
## Observations and Sensors
To make informed decisions, an agent must first make observations of the state
of the environment. The observations are collected by Sensors attached to the
agent GameObject. By default, agents come with a `VectorSensor` which allows
them to collect floating-point observations into a single array. There are
additional sensor components which can be attached to the agent GameObject which
collect their own observations, or modify other observations. These are:
- `CameraSensorComponent` - Allows image from `Camera` to be used as
observation.
- `RenderTextureSensorComponent` - Allows content of `RenderTexture` to be used
as observation.
- `RayPerceptionSensorComponent` - Allows information from set of ray-casts to
be used as observation.
In order for an agent to learn, the observations should include all the
information an agent needs to accomplish its task. Without sufficient and
relevant information, an agent may learn poorly or may not learn at all. A
reasonable approach for determining what information should be included is to
consider what you would need to calculate an analytical solution to the problem,
or what you would expect a human to be able to use to solve the problem.
### Vector Observations
### Generating Observations
ML-Agents provides multiple ways for an Agent to make observations:
1. Overriding the `Agent.CollectObservations()` method and passing the
observations to the provided `VectorSensor`.
1. Adding the `[Observable]` attribute to fields and properties on the Agent.
1. Implementing the `ISensor` interface, using a `SensorComponent` attached to
the Agent to create the `ISensor`.
Vector observations are best used for aspects of the environment which are
#### Agent.CollectObservations()
Agent.CollectObservations() is best used for aspects of the environment which are
In order for an agent to learn, the observations should include all the
information an agent needs to accomplish its task. Without sufficient and
relevant information, an agent may learn poorly or may not learn at all. A
reasonable approach for determining what information should be included is to
consider what you would need to calculate an analytical solution to the problem,
or what you would expect a human to be able to use to solve the problem.
The `VectorSensor.AddObservation` method provides a number of overloads for
adding common types of data to your observation vector. You can add Integers and
booleans directly to the observation vector, as well as some common Unity data
types such as `Vector2`, `Vector3`, and `Quaternion`.
state observation. As an experiment, you can remove the velocity components from
the observation and retrain the 3DBall agent. While it will learn to balance the
ball reasonably well, the performance of the agent without using velocity is
noticeably worse.
state observation.
private List<float> state = new List<float>();
// Orientation of the cube (2 floats)
sensor.AddObservation((ball.transform.position.x - gameObject.transform.position.x));
sensor.AddObservation((ball.transform.position.y - gameObject.transform.position.y));
sensor.AddObservation((ball.transform.position.z - gameObject.transform.position.z));
sensor.AddObservation(ball.transform.GetComponent<Rigidbody>().velocity.x);
sensor.AddObservation(ball.transform.GetComponent<Rigidbody>().velocity.y);
sensor.AddObservation(ball.transform.GetComponent<Rigidbody>().velocity.z);
// Relative position of the ball to the cube (3 floats)
sensor.AddObservation(ball.transform.position - gameObject.transform.position);
// Velocity of the ball (3 floats)
sensor.AddObservation(m_BallRb.velocity);
// 8 floats total
The feature vector must always contain the same number of elements and
observations must always be in the same position within the list. If the number
of observed entities in an environment can vary you can pad the feature vector
with zeros for any missing entities in a specific observation or you can limit
As an experiment, you can remove the velocity components from
the observation and retrain the 3DBall agent. While it will learn to balance the
ball reasonably well, the performance of the agent without using velocity is
noticeably worse.
The observations passed to `VectorSensor.AddObservation()` must always contain
the same number of elements must always be in the same order. If the number
of observed entities in an environment can vary, you can pad the calls
with zeros for any missing entities in a specific observation, or you can limit
every enemy agent in an environment, you could only observe the closest five.
every enemy in an environment, you could only observe the closest five.
When you set up an Agent's `Behavior Parameters` in the Unity Editor, set the
following properties to use a vector observation:
Additionally, when you set up an Agent's `Behavior Parameters` in the Unity
Editor, you must set the **Vector Observations > Space Size**
to equal the number of floats that are written by `CollectObservations()`.
- **Space Size** — The state size must match the length of your feature vector.
#### Observable Fields and Properties
Another approach is to define the relevant observations as fields or properties
on your Agent class, and annotate them with an `ObservableAttribute`. For
example, in the 3DBall example above, the rigid body velocity could be observed
by adding a property to the Agent:
```csharp
using Unity.MLAgents.Sensors.Reflection;
The observation feature vector is a list of floating point numbers, which means
you must convert any other data types to a float or a list of floats.
public class Ball3DAgent : Agent {
The `VectorSensor.AddObservation` method provides a number of overloads for
adding common types of data to your observation vector. You can add Integers and
booleans directly to the observation vector, as well as some common Unity data
types such as `Vector2`, `Vector3`, and `Quaternion`.
[Observable]
public Vector3 RigidBodyVelocity
{
get { return m_BallRb.velocity; }
}
}
```
`ObservableAttribute` currently supports most basic types (e.g. floats, ints,
bools), as well as `Vector2`, `Vector3`, `Vector4`, `Quaternion`, and enums.
The behavior of `ObservableAttribute`s are controlled by the "Observable Attribute
Handling" in the Agent's `Behavior Parameters`. The possible values for this are:
* **Ignore** (default) - All ObservableAttributes on the Agent will be ignored.
If there are no ObservableAttributes on the Agent, this will result in the
fastest initialization time.
* **Exclude Inherited** - Only members on the declared class will be examined;
members that are inherited are ignored. This is a reasonable tradeoff between
performance and flexibility.
* **Examine All** - All members on the class will be examined. This can lead to
slower startup times.
"Exclude Inherited" is generally sufficient, but if your Agent inherits from
another Agent implementation that has Observable members, you will need to use
"Examine All".
Internally, ObservableAttribute uses reflection to determine which members of
the Agent have ObservableAttributes, and also uses reflection to access the
fields or invoke the properties at runtime. This may be slower than using
CollectObservations or an ISensor, although this might not be enough to
noticeably affect performance.
**NOTE**: you do not need to adjust the Space Size in the Agent's
`Behavior Parameters` when you add `[Observable]` fields or properties to an
Agent, since their size can be computed before they are used.
#### ISensor interface and SensorComponents
The `ISensor` interface is generally intended for advanced users. The `Write()`
method is used to actually generate the observation, but some other methods
such as returning the shape of the observations must also be implemented.
The `SensorComponent` abstract class is used to create the actual `ISensor` at
runtime. It must be attached to the same `GameObject` as the `Agent`, or to a
child `GameObject`.
There are several SensorComponents provided in the API:
- `CameraSensorComponent` - Allows image from `Camera` to be used as
observation.
- `RenderTextureSensorComponent` - Allows content of `RenderTexture` to be used
as observation.
- `RayPerceptionSensorComponent` - Allows information from set of ray-casts to
be used as observation.
**NOTE**: you do not need to adjust the Space Size in the Agent's
`Behavior Parameters` when using an `ISensor` or `SensorComponent`.
Internally, both `Agent.CollectObservations` and the `[Observable]` attribute use
ISensors to write observations, although this is mostly abstracted from the user.
### Vector Observations
Both `Agent.CollectObservations()` and `ObservableAttribute`s produce vector
observations, which are represented as lists of `float`s. `ISensor`s can
produce both vector observations and visual observations, which are
multi-dimensional arrays of floats.
Below are some additional considerations when dealing with vector observations:
#### One-hot encoding categorical information

the feature vector. The following code example illustrates how to add them.
```csharp
enum CarriedItems { Sword, Shield, Bow, LastItem }
private List<float> state = new List<float>();
enum ItemType { Sword, Shield, Bow, LastItem }
for (int ci = 0; ci < (int)CarriedItems.LastItem; ci++)
for (int ci = 0; ci < (int)ItemType.LastItem; ci++)
{
sensor.AddObservation((int)currentItem == ci ? 1.0f : 0.0f);
}

to the previous one.
```csharp
enum CarriedItems { Sword, Shield, Bow, LastItem }
const int NUM_ITEM_TYPES = (int)CarriedItems.LastItem;
enum ItemType { Sword, Shield, Bow, LastItem }
const int NUM_ITEM_TYPES = (int)ItemType.LastItem;
public override void CollectObservations(VectorSensor sensor)
{

}
```
`ObservableAttribute` has built-in support for enums. Note that you don't need
the `LastItem` placeholder in this case:
```csharp
enum ItemType { Sword, Shield, Bow }
public class HeroAgent : Agent
{
[Observable]
ItemType m_CurrentItem;
}
```

angle, or, if the number of turns is significant, increase the maximum value
used in your normalization formula.
#### Stacking
Stacking refers to repeating observations from previous steps as part of a
larger observation. For example, consider an Agent that generates these
observations in four steps
```
step 1: [0.1]
step 2: [0.2]
step 3: [0.3]
step 4: [0.4]
```
If we use a stack size of 3, the observations would instead be:
```csharp
step 1: [0.1, 0.0, 0.0]
step 2: [0.2, 0.1, 0.0]
step 3: [0.3, 0.2, 0.1]
step 4: [0.4, 0.3, 0.2]
```
(The observations are padded with zeroes for the first `stackSize-1` steps).
This is a simple way to give an Agent limited "memory" without the complexity
of adding a recurrent neural network (RNN).
The steps for enabling stacking depends on how you generate observations:
* For Agent.CollectObservations(), set "Stacked Vectors" on the Agent's
`Behavior Parameters` to a value greater than 1.
* For ObservableAttribute, set the `numStackedObservations` parameter in the
constructor, e.g. `[Observable(numStackedObservations: 2)]`.
* For `ISensor`s, wrap them in a `StackingSensor` (which is also an `ISensor`).
Generally, this should happen in the `CreateSensor()` method of your
`SensorComponent`.
Note that stacking is currently only supported for vector observations; stacking
for visual observations is not supported.
#### Vector Observation Summary & Best Practices
- Vector Observations should include all variables relevant for allowing the

value in the agent GameObject's `Behavior Parameters` should be changed.
- Categorical variables such as type of object (Sword, Shield, Bow) should be
encoded in one-hot fashion (i.e. `3` -> `0, 0, 1`). This can be done
automatically using the `AddOneHotObservation()` method of the `VectorSensor`.
automatically using the `AddOneHotObservation()` method of the `VectorSensor`,
or using `[Observable]` on an enum field or property of the Agent.
- In general, all inputs should be normalized to be in the range 0 to +1 (or -1
to 1). For example, the `x` position information of an agent where the maximum
possible value is `maxValue` should be recorded as

Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)
{
EndEpisode();
EndEpisode();
}
```

10
docs/Learning-Environment-Design.md


1. Calls your Academy's `OnEnvironmentReset` delegate.
1. Calls the `OnEpisodeBegin()` function for each Agent in the scene.
1. Calls the `CollectObservations(VectorSensor sensor)` function for each Agent
in the scene.
1. Gathers information about the scene. This is done by calling the
`CollectObservations(VectorSensor sensor)` function for each Agent in the
scene, as well as updating their sensor and collecting the resulting
observations.
1. Uses each Agent's Policy to decide on the Agent's next action.
1. Calls the `OnActionReceived()` function for each Agent in the scene, passing
in the action chosen by the Agent's Policy.

in a football game or a car object in a vehicle simulation. Every Agent must
have appropriate `Behavior Parameters`.
To create an Agent, extend the Agent class and implement the essential
`CollectObservations(VectorSensor sensor)` and `OnActionReceived()` methods:
Generally, when creating an Agent, you should extend the Agent class and implement
the `CollectObservations(VectorSensor sensor)` and `OnActionReceived()` methods:
- `CollectObservations(VectorSensor sensor)` — Collects the Agent's observation
of its environment.

21
docs/Learning-Environment-Examples.md


![Walker](images/walker.png)
- Set-up: Physics-based Humanoids agents with 26 degrees of freedom. These DOFs
- Set-up: Physics-based Humanoid agents with 26 degrees of freedom. These DOFs
- Agents: The environment contains 11 independent agents with same Behavior
- `WalkerStatic` - Goal direction is always forward.
- `WalkerDynamic`- Goal direction is randomized.
- Agents: The environment contains 10 independent agents with same Behavior
- +0.03 times body velocity in the goal direction.
- +0.01 times head y position.
- +0.01 times body direction alignment with goal direction.
- -0.01 times head velocity difference from body velocity.
- +0.02 times body velocity in the goal direction. (run towards target)
- +0.01 times head direction alignment with goal direction. (face towards target)
- +0.005 times head y position - left foot y position. (encourage head height)
- +0.005 times head y position - right foot y position. (encourage head height)
- Vector Observation space: 215 variables corresponding to position, rotation,
- Vector Observation space: 236 variables corresponding to position, rotation,
rotations applicable to the joints.
rotations and strength applicable to the joints.
- Visual Observations: None
- Float Properties: Four
- gravity: Magnitude of gravity

- Default: 10
- Recommended Minimum: 3
- Recommended Maximum: 20
- Benchmark Mean Reward: 1000
- Benchmark Mean Reward for `WalkerStatic`: 1500
- Benchmark Mean Reward for `WalkerDynamic`: 700
## Pyramids

12
docs/Migrating.md


- `use_visual` and `allow_multiple_visual_obs` in the `UnityToGymWrapper` constructor
were replaced by `allow_multiple_obs` which allows one or more visual observations and
vector observations to be used simultaneously.
- `--save-freq` has been removed from the CLI and is now configurable in the trainer configuration
file.
- To upgrade your configuration files, an upgrade script has been provided. Run `python config/update_config.py
-h` to see the script usage.
- To upgrade your configuration files, an upgrade script has been provided. Run
`python -m mlagents.trainers.upgrade_config -h` to see the script usage. Note that you will have
had to upgrade to/install the current version of ML-Agents before running the script.
- If your training uses [curriculum](Training-ML-Agents.md#curriculum-learning), move those configurations under
the `Behavior Name` section.
- If your training uses [curriculum](Training-ML-Agents.md#curriculum-learning), move those configurations under a `curriculum` section.
- If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization), move
the contents of the sampler config to `parameter_randomization` in the main trainer configuration.
- If you are using `UnityEnvironment` directly, replace `max_step` with `interrupted`

from the constructor and add `allow_multiple_obs = True` if the environment contains either
both visual and vector observations or multiple visual observations.
- If you were setting `--save-freq` in the CLI, add a `checkpoint_interval` value in your
trainer configuration, and set it equal to `save-freq * n_agents_in_scene`.
## Migrating from 0.15 to Release 1

3
docs/Training-Configuration-File.md


| `summary_freq` | (default = `50000`) Number of experiences that needs to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. |
| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count. <br><br>Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the save-freq option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the checkpoint_interval option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
| `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. |
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |

7
docs/Training-ML-Agents.md


**NOTE:** The configuration file format has been changed from 0.17.0 and onwards. To convert
an old set of configuration files (trainer config, curriculum, and sampler files) to the new
format, a script has been provided. Run `python config/upgrade_config.py -h` in your console
to see the script's usage.
format, a script has been provided. Run `python -m mlagents.trainers.upgrade_config -h` in your
console to see the script's usage.
### Behavior Configurations

time_horizon: 64
summary_freq: 10000
keep_checkpoints: 5
checkpoint_interval: 50000
threaded: true
init_path: null

window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
swap_steps: 2000
team_change: 100000
```

3
docs/Using-Tensorboard.md


The TensorBoard window also provides options for how to display and smooth
graphs.
When you run the training program, `mlagents-learn`, you can use the
`--save-freq` option to specify how frequently to save the statistics.
## The ML-Agents Toolkit training statistics
The ML-Agents training program saves the following statistics:

999
docs/images/walker.png
文件差异内容过多而无法显示
查看文件

2
gym-unity/gym_unity/__init__.py


# Version of the library that will be used to upload to pypi
__version__ = "0.17.0.dev0"
__version__ = "0.18.0.dev0"
# Git tag that will be checked to determine whether to trigger upload to pypi
__release_tag__ = None

2
markdown-link-check.full.json


},
{
"pattern": "https://github.com/Unity-Technologies/ml-agents/(tree|archive)/release_[0-9]+.*",
"comment": "Whitelist future release tags"
"comment": "Allow future release tags"
}
]
}

2
ml-agents-envs/mlagents_envs/__init__.py


# Version of the library that will be used to upload to pypi
__version__ = "0.17.0.dev0"
__version__ = "0.18.0.dev0"
# Git tag that will be checked to determine whether to trigger upload to pypi
__release_tag__ = None

12
ml-agents-envs/mlagents_envs/registry/binary_utils.py


break
try:
download_and_extract_zip(url, name)
except IOError:
logger.debug(
f"Attempt {attempt + 1} / {NUMBER_ATTEMPTS} : Failed to download"
)
except Exception: # pylint: disable=W0702
if attempt + 1 < NUMBER_ATTEMPTS:
logger.warning(
f"Attempt {attempt + 1} / {NUMBER_ATTEMPTS}"
": Failed to download and extract binary."
)
else:
raise
path = get_local_binary_path_if_exists(name, url)
if path is None:

32
ml-agents/mlagents/model_serialization.py


from distutils.version import LooseVersion
try:
import onnx
from tf2onnx.tfonnx import process_tf_graph, tf_optimize
from tf2onnx import optimizer

) -> Any:
# This is basically https://github.com/onnx/tensorflow-onnx/blob/master/tf2onnx/convert.py
# Some constants in the graph need to be read by the inference system.
# These aren't used by the model anywhere, so trying to make sure they propagate
# through conversion and import is a losing battle. Instead, save them now,
# so that we can add them back later.
constant_values = {}
for n in frozen_graph_def.node:
if n.name in MODEL_CONSTANTS:
val = n.attr["value"].tensor.int_val[0]
constant_values[n.name] = val
inputs = _get_input_node_names(frozen_graph_def)
outputs = _get_output_node_names(frozen_graph_def)
logger.info(f"onnx export - inputs:{inputs} outputs:{outputs}")

onnx_graph = optimizer.optimize_graph(g)
model_proto = onnx_graph.make_model(settings.brain_name)
# Save the constant values back the graph initializer.
# This will ensure the importer gets them as global constants.
constant_nodes = []
for k, v in constant_values.items():
constant_node = _make_onnx_node_for_constant(k, v)
constant_nodes.append(constant_node)
model_proto.graph.initializer.extend(constant_nodes)
def _make_onnx_node_for_constant(name: str, value: int) -> Any:
    """Wrap a scalar int as a named 1x1x1x1 INT32 onnx TensorProto.

    The resulting proto is suitable for appending to a model graph's
    initializer list so that the constant survives conversion and can be
    read back by the importer.
    """
    return onnx.TensorProto(
        data_type=onnx.TensorProto.INT32,
        name=name,
        int32_data=[value],
        dims=[1, 1, 1, 1],
    )
def _get_input_node_names(frozen_graph_def: Any) -> List[str]:

def _get_output_node_names(frozen_graph_def: Any) -> List[str]:
"""
Get the list of output node names from the graph.
Also include constants, so that they will be readable by the
onnx importer.
output_names = node_names & POSSIBLE_OUTPUT_NODES
output_names = node_names & (POSSIBLE_OUTPUT_NODES | MODEL_CONSTANTS)
# Append the port
return [f"{n}:0" for n in output_names]

2
ml-agents/mlagents/trainers/__init__.py


# Version of the library that will be used to upload to pypi
__version__ = "0.17.0.dev0"
__version__ = "0.18.0.dev0"
# Git tag that will be checked to determine whether to trigger upload to pypi
__release_tag__ = None

7
ml-agents/mlagents/trainers/cli_utils.py


action=DetectDefault,
)
argparser.add_argument(
"--save-freq",
default=50000,
type=int,
help="How often (in steps) to save the model during training",
action=DetectDefault,
)
argparser.add_argument(
"--seed",
default=-1,
type=int,

2
ml-agents/mlagents/trainers/ghost/trainer.py


except AgentManagerQueue.Empty:
pass
self.next_summary_step = self.trainer.next_summary_step
self._next_summary_step = self.trainer._next_summary_step
self.trainer.advance()
if self.get_step - self.last_team_change > self.steps_to_train_team:
self.controller.change_training_team(self.get_step)

1
ml-agents/mlagents/trainers/learn.py


trainer_factory,
write_path,
checkpoint_settings.run_id,
checkpoint_settings.save_freq,
maybe_meta_curriculum,
not checkpoint_settings.inference,
run_seed,

21
ml-agents/mlagents/trainers/ppo/trainer.py


super(PPOTrainer, self).__init__(
brain_name, trainer_settings, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",
"beta",
"buffer_size",
"epsilon",
"hidden_units",
"lambd",
"learning_rate",
"max_steps",
"normalize",
"num_epoch",
"num_layers",
"time_horizon",
"sequence_length",
"summary_freq",
"use_recurrent",
"memory_size",
"output_path",
"reward_signals",
]
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)

self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly
self.step = policy.get_current_step()
self.next_summary_step = self._get_next_summary_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""

1
ml-agents/mlagents/trainers/sac/trainer.py


self.reward_signal_update_steps = int(
max(1, self.step / self.reward_signal_steps_per_update)
)
self.next_summary_step = self._get_next_summary_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""

4
ml-agents/mlagents/trainers/settings.py


# Assign team_change to about 4x save_steps
return self.save_steps * 5
swap_steps: int = 10000
swap_steps: int = 2000
window: int = 10
play_against_latest_model_ratio: float = 0.5
initial_elo: float = 1200.0

init_path: Optional[str] = None
output_path: str = "default"
keep_checkpoints: int = 5
checkpoint_interval: int = 500000
max_steps: int = 500000
time_horizon: int = 64
summary_freq: int = 50000

@attr.s(auto_attribs=True)
class CheckpointSettings:
save_freq: int = parser.get_default("save_freq")
run_id: str = parser.get_default("run_id")
initialize_from: str = parser.get_default("initialize_from")
load_model: bool = parser.get_default("load_model")

8
ml-agents/mlagents/trainers/tests/test_learn.py


seed: 9870
checkpoint_settings:
run_id: uselessrun
save_freq: 654321
debug: false
"""

trainer_factory_mock.return_value,
"results/ppo",
"ppo",
50000,
None,
True,
0,

assert opt.checkpoint_settings.resume is False
assert opt.checkpoint_settings.inference is False
assert opt.checkpoint_settings.run_id == "ppo"
assert opt.checkpoint_settings.save_freq == 50000
assert opt.env_settings.seed == -1
assert opt.env_settings.base_port == 5005
assert opt.env_settings.num_envs == 1

"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",
"--train",
"--base-port=4004",

assert opt.env_settings.env_path == "./myenvfile"
assert opt.parameter_randomization is None
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2

assert opt.env_settings.env_path == "./oldenvfile"
assert opt.parameter_randomization is None
assert opt.checkpoint_settings.run_id == "uselessrun"
assert opt.checkpoint_settings.save_freq == 654321
assert opt.env_settings.seed == 9870
assert opt.env_settings.base_port == 4001
assert opt.env_settings.num_envs == 4

"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",
"--train",
"--base-port=4004",

assert opt.env_settings.env_path == "./myenvfile"
assert opt.parameter_randomization is None
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2

1
ml-agents/mlagents/trainers/tests/test_ppo.py


# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000
assert trainer.next_summary_step > 2000
# Test incorrect class of policy
policy = mock.Mock()

49
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


def create_rl_trainer():
    """Build a FakeTrainer configured for fast unit tests.

    checkpoint_interval and summary_freq are deliberately tiny so that a
    handful of short trajectories is enough to trigger both checkpointing
    and summary writing during a test run.
    """
    mock_brainparams = create_mock_brain()
    # The older single-argument-line construction (max_steps only) was diff
    # residue shadowed by this call; only this construction is live code.
    trainer = FakeTrainer(
        mock_brainparams,
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        0,
    )
    # Mark the fake policy as updating so the trainer proceeds as in training.
    trainer.set_is_policy_updating(True)
    return trainer

# Check that the buffer has been cleared
assert not trainer.should_still_train
assert mocked_clear_update_buffer.call_count > 0
@mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
@mock.patch("mlagents.trainers.trainer.trainer.StatsReporter.write_stats")
def test_summary_checkpoint(mock_write_summary, mock_save_model):
    """Verify the trainer writes summaries and saves model checkpoints at the
    intervals configured in its TrainerSettings.

    Decorators apply bottom-up, so the write_stats patch is the first mock
    argument and the save_model patch is the second.
    """
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    # Intervals come from the settings used by create_rl_trainer().
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Feed several trajectories through the trainer.
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    # Check that save_model was invoked once per checkpoint interval reached.
    calls = [
        mock.call(trainer.brain_name)
        for step in range(
            checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
        )
    ]
    mock_save_model.assert_has_calls(calls, any_order=True)

1
ml-agents/mlagents/trainers/tests/test_sac.py