Browse Source

Merge branch 'master' into asymm-envs

/asymm-envs
Andrew Cohen 5 years ago
Commit
2c42f577
117 changed files with 6286 additions and 7721 deletions
  1. 5  .yamato/python-ll-api-test.yml
  2. 2  .yamato/standalone-build-test.yml
  3. 2  Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo.meta
  4. 2  Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo.meta
  5. 22  Project/Assets/ML-Agents/Examples/3DBall/Prefabs/3DBallHardNew.prefab
  6. 9  Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
  7. 2  Project/Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo.meta
  8. 936  Project/Assets/ML-Agents/Examples/Basic/Prefabs/Basic.prefab
  9. 2  Project/Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo.meta
  10. 928  Project/Assets/ML-Agents/Examples/Bouncer/Prefabs/Environment.prefab
  11. 11  Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs
  12. 2  Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo.meta
  13. 2  Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo.meta
  14. 22  Project/Assets/ML-Agents/Examples/Crawler/Prefabs/DynamicPlatform.prefab
  15. 13  Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab
  16. 2  Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo.meta
  17. 153  Project/Assets/ML-Agents/Examples/FoodCollector/Prefabs/FoodCollectorArea.prefab
  18. 14  Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs
  19. 2  Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo.meta
  20. 32  Project/Assets/ML-Agents/Examples/GridWorld/Prefabs/Area.prefab
  21. 12  Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs
  22. 2  Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo.meta
  23. 43  Project/Assets/ML-Agents/Examples/Hallway/Prefabs/SymbolFinderArea.prefab
  24. 18  Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs
  25. 2  Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta
  26. 64  Project/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab
  27. 18  Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs
  28. 2  Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo.meta
  29. 85  Project/Assets/ML-Agents/Examples/Pyramids/Prefabs/AreaPB.prefab
  30. 18  Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs
  31. 2  Project/Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo.meta
  32. 22  Project/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab
  33. 6  Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs
  34. 4  Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs
  35. 5  Project/Assets/ML-Agents/Examples/Soccer/Prefabs/StrikersVsGoalieField.prefab
  36. 38  Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs
  37. 1001  Project/Assets/ML-Agents/Examples/Soccer/TFModels/Goalie.nn
  38. 1001  Project/Assets/ML-Agents/Examples/Soccer/TFModels/Striker.nn
  39. 2  Project/Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo.meta
  40. 11  Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
  41. 2  Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo.meta
  42. 22  Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerPair.prefab
  43. 64  Project/Assets/ML-Agents/Examples/WallJump/Prefabs/WallJumpArea.prefab
  44. 14  Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs
  45. 3  Project/ProjectSettings/GraphicsSettings.asset
  46. 2  Project/ProjectSettings/ProjectVersion.txt
  47. 2  Project/ProjectSettings/UnityConnectSettings.asset
  48. 303  com.unity.ml-agents/CHANGELOG.md
  49. 78  com.unity.ml-agents/Editor/DemonstrationDrawer.cs
  50. 26  com.unity.ml-agents/Editor/DemonstrationImporter.cs
  51. 2  com.unity.ml-agents/Runtime/Academy.cs
  52. 96  com.unity.ml-agents/Runtime/Agent.cs
  53. 68  com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs
  54. 51  com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs
  55. 2  com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
  56. 8  com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs
  57. 22  com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs
  58. 35  com.unity.ml-agents/Runtime/Timer.cs
  59. 4  com.unity.ml-agents/Tests/Editor/BehaviorParameterTests.cs
  60. 6  com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs
  61. 43  com.unity.ml-agents/Tests/Editor/TimerTest.cs
  62. 8  com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs
  63. 2  com.unity.ml-agents/package.json
  64. 92  docs/FAQ.md
  65. 346  docs/Getting-Started.md
  66. 120  docs/Installation-Anaconda-Windows.md
  67. 8  docs/Learning-Environment-Create-New.md
  68. 509  docs/Learning-Environment-Design-Agents.md
  69. 670  docs/Learning-Environment-Examples.md
  70. 84  docs/Learning-Environment-Executable.md
  71. 39  docs/ML-Agents-Overview.md
  72. 5  docs/Migrating.md
  73. 100  docs/Readme.md
  74. 6  docs/Training-Imitation-Learning.md
  75. 368  docs/Training-ML-Agents.md
  76. 14  docs/Training-Self-Play.md
  77. 147  docs/Training-on-Amazon-Web-Service.md
  78. 145  docs/Training-on-Microsoft-Azure.md
  79. 36  docs/Using-Docker.md
  80. 82  docs/Using-Tensorboard.md
  81. 66  docs/Using-Virtual-Environment.md
  82. 150  docs/images/demo_component.png
  83. 257  docs/images/demo_inspector.png
  84. 999  docs/images/docker_build_settings.png
  85. 980  docs/images/gridworld.png
  86. 198  docs/images/learning_environment_basic.png
  87. 545  docs/images/learning_environment_example.png
  88. 219  docs/images/platform_prefab.png
  89. 604  docs/images/unity_package_json.png
  90. 999  docs/images/unity_package_manager_window.png
  91. 349  docs/images/visual-observation-rawimage.png
  92. 95  docs/images/visual-observation-rendertexture.png
  93. 107  docs/images/visual-observation.png
  94. 63  ml-agents-envs/mlagents_envs/environment.py
  95. 31  ml-agents-envs/mlagents_envs/tests/test_envs.py
  96. 95  ml-agents/mlagents/trainers/learn.py
  97. 22  ml-agents/tests/yamato/scripts/run_gym.py
  98. 29  ml-agents/tests/yamato/scripts/run_llapi.py
  99. 24  utils/validate_versions.py
  100. 22  com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs

5
.yamato/python-ll-api-test.yml


commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.setup_venv
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py --env=artifacts/testPlayer-Basic
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py --env=artifacts/testPlayer-WallJump
- ./venv/bin/python ml-agents/tests/yamato/scripts/run_llapi.py --env=artifacts/testPlayer-Bouncer
dependencies:
- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:

2
.yamato/standalone-build-test.yml


- pip install pyyaml
- python -u -m ml-agents.tests.yamato.standalone_build_tests
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Basic/Scenes/Basic.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/WallJump/Scenes/WallJump.unity
triggers:
cancel_old_ci: true
changes:

2
Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

2
Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

22
Project/Assets/ML-Agents/Examples/3DBall/Prefabs/3DBallHardNew.prefab


- component: {fileID: 114284317994838100}
- component: {fileID: 114466000339026140}
- component: {fileID: 8193279139064749781}
- component: {fileID: 7923264721978289873}
m_Layer: 0
m_Name: Agent
m_TagString: Untagged

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: 3DBallHard
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114466000339026140
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: edf26e11cf4ed42eaa3ffb7b91bb4676, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
ball: {fileID: 1142513601053358}
--- !u!114 &8193279139064749781

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &7923264721978289873
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1829721031899636}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1978072206102878
GameObject:
m_ObjectHideFlags: 0

9
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


SetResetParameters();
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[2];
action[0] = -Input.GetAxis("Horizontal");
action[1] = Input.GetAxis("Vertical");
return action;
actionsOut[0] = -Input.GetAxis("Horizontal");
actionsOut[1] = Input.GetAxis("Vertical");
}
public void SetBall()
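
This hunk shows the new Agent.Heuristic signature: instead of allocating and returning a float[], the override now fills the actionsOut buffer supplied by the caller. A minimal sketch of the updated continuous-control pattern, assuming a hypothetical ExampleBalanceAgent class (the axis mapping is taken from the diff above):

using MLAgents;
using UnityEngine;

public class ExampleBalanceAgent : Agent
{
    // The heuristic writes into the caller-provided buffer instead of
    // returning a newly allocated float[2].
    public override void Heuristic(float[] actionsOut)
    {
        actionsOut[0] = -Input.GetAxis("Horizontal");
        actionsOut[1] = Input.GetAxis("Vertical");
    }
}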

2
Project/Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

936
Project/Assets/ML-Agents/Examples/Basic/Prefabs/Basic.prefab
File diff is too large to display

2
Project/Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

928
Project/Assets/ML-Agents/Examples/Bouncer/Prefabs/Environment.prefab
File diff is too large to display

11
Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs


}
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[3];
action[0] = Input.GetAxis("Horizontal");
action[1] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
action[2] = Input.GetAxis("Vertical");
return action;
actionsOut[0] = Input.GetAxis("Horizontal");
actionsOut[1] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
actionsOut[2] = Input.GetAxis("Vertical");
}
void Update()

2
Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

2
Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

22
Project/Assets/ML-Agents/Examples/Crawler/Prefabs/DynamicPlatform.prefab


- component: {fileID: 114590693924030052}
- component: {fileID: 114423363226357902}
- component: {fileID: 8520694362683208207}
- component: {fileID: 1267665179144855710}
m_Layer: 0
m_Name: Crawler
m_TagString: Untagged

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: CrawlerDynamic
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114590693924030052
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 2f37c30a5e8d04117947188818902ef3, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
target: {fileID: 4490950947783742}
ground: {fileID: 4684408634944056}

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 0
TakeActionsBetweenDecisions: 0
--- !u!114 &1267665179144855710
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1515093357607024}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1520563409393552
GameObject:
m_ObjectHideFlags: 0

13
Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab


- component: {fileID: 114230237520033992}
- component: {fileID: 114375802757824636}
- component: {fileID: 8847231916954260663}
- component: {fileID: 6335439310911778343}
m_Layer: 0
m_Name: Crawler
m_TagString: Untagged

DecisionPeriod: 5
TakeActionsBetweenDecisions: 0
offsetStep: 0
--- !u!114 &6335439310911778343
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1492298671135358}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1492926997393242
GameObject:
m_ObjectHideFlags: 0

2
Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

153
Project/Assets/ML-Agents/Examples/FoodCollector/Prefabs/FoodCollectorArea.prefab


- component: {fileID: 114176228333253036}
- component: {fileID: 114725457980523372}
- component: {fileID: 8297075921230369060}
- component: {fileID: 1222199865870203693}
m_Layer: 0
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: FoodCollector
m_TeamID: 0
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114176228333253036
MonoBehaviour:

m_Script: {fileID: 11500000, guid: c66e6845309d241c78a6d77ee2567928, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1819751139121548}
turnSpeed: 300

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 50
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 50
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &8297075921230369060
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &1222199865870203693
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1464820575638702}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1482701732800114
GameObject:
m_ObjectHideFlags: 0

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: FoodCollector
m_TeamID: 0
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114711827726849508
MonoBehaviour:

m_Script: {fileID: 11500000, guid: c66e6845309d241c78a6d77ee2567928, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1819751139121548}
turnSpeed: 300

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 50
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 50
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &259154752087955944
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1528397385587768
GameObject:

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: FoodCollector
m_TeamID: 0
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114542632553128056
MonoBehaviour:

m_Script: {fileID: 11500000, guid: c66e6845309d241c78a6d77ee2567928, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1819751139121548}
turnSpeed: 300

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 50
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 50
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &5519119940433428255
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1617924810425504
GameObject:

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: FoodCollector
m_TeamID: 0
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114189751434580810
MonoBehaviour:

m_Script: {fileID: 11500000, guid: c66e6845309d241c78a6d77ee2567928, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1819751139121548}
turnSpeed: 300

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 50
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 50
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &5884750436653390196
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1688105343773098
GameObject:

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: FoodCollector
m_TeamID: 0
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114235147148547996
MonoBehaviour:

m_Script: {fileID: 11500000, guid: c66e6845309d241c78a6d77ee2567928, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1819751139121548}
turnSpeed: 300

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 50
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 50
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &4768752321433982785
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1729825611722018
GameObject:

14
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


MoveAgent(vectorAction);
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[4];
action[2] = 2f;
actionsOut[2] = 2f;
action[0] = 1f;
actionsOut[0] = 1f;
action[2] = 1f;
actionsOut[2] = 1f;
action[0] = 2f;
actionsOut[0] = 2f;
action[3] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
return action;
actionsOut[3] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
}
public override void OnEpisodeBegin()

2
Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

32
Project/Assets/ML-Agents/Examples/GridWorld/Prefabs/Area.prefab


- component: {fileID: 114935253044749092}
- component: {fileID: 114650561397225712}
- component: {fileID: 114889700908650620}
- component: {fileID: 7980686505185502968}
m_Layer: 8
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: GridWorld
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114650561397225712
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 857707f3f352541d5b858efca4479b95, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 100
area: {fileID: 114704252266302846}
timeBetweenDecisionsAtInference: 0.15

m_Script: {fileID: 11500000, guid: 282f342c2ab144bf38be65d4d0c4e07d, type: 3}
m_Name:
m_EditorClassIdentifier:
camera: {fileID: 20743940359151984}
sensorName: CameraSensor
width: 84
height: 64
grayscale: 0
compression: 1
m_Camera: {fileID: 20743940359151984}
m_SensorName: CameraSensor
m_Width: 84
m_Height: 64
m_Grayscale: 0
m_Compression: 1
--- !u!114 &7980686505185502968
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1488387672112076}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1625008366184734
GameObject:
m_ObjectHideFlags: 0

12
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs


}
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
actionsOut[0] = k_NoAction;
return new float[] { k_Right };
actionsOut[0] = k_Right;
return new float[] { k_Up };
actionsOut[0] = k_Up;
return new float[] { k_Left };
actionsOut[0] = k_Left;
return new float[] { k_Down };
actionsOut[0] = k_Down;
return new float[] { k_NoAction };
}
// to be implemented by the developer
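
For discrete-action agents such as GridAgent, the same migration replaces each "return new float[] { ... }" with an assignment into actionsOut. A hedged sketch of the resulting shape, assuming hypothetical constant values and key bindings (only the assignments to actionsOut[0] come from the diff above):

using MLAgents;
using UnityEngine;

public class ExampleGridAgent : Agent
{
    // Assumed action encoding for illustration; the real GridAgent defines its own constants.
    const int k_NoAction = 0;
    const int k_Up = 1;
    const int k_Down = 2;
    const int k_Left = 3;
    const int k_Right = 4;

    public override void Heuristic(float[] actionsOut)
    {
        // Default to "no action", then overwrite based on keyboard input.
        actionsOut[0] = k_NoAction;
        if (Input.GetKey(KeyCode.D))
        {
            actionsOut[0] = k_Right;
        }
        else if (Input.GetKey(KeyCode.W))
        {
            actionsOut[0] = k_Up;
        }
        else if (Input.GetKey(KeyCode.A))
        {
            actionsOut[0] = k_Left;
        }
        else if (Input.GetKey(KeyCode.S))
        {
            actionsOut[0] = k_Down;
        }
    }
}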

2
Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

43
Project/Assets/ML-Agents/Examples/Hallway/Prefabs/SymbolFinderArea.prefab


- component: {fileID: 114286701363010626}
- component: {fileID: 114388598785529460}
- component: {fileID: 1360037369662378601}
- component: {fileID: 3959905707628515947}
m_Layer: 0
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: Hallway
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114286701363010626
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: b446afae240924105b36d07e8d17a608, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 3000
ground: {fileID: 1510027348950282}
area: {fileID: 1745841960385024}

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 2
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 12
rayLayerMask:
m_RaysPerDirection: 2
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 12
m_RayLayerMask:
observationStacks: 3
m_ObservationStacks: 3
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &1360037369662378601
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 6
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &3959905707628515947
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1471560210313468}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1510027348950282
GameObject:
m_ObjectHideFlags: 0

18
Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs


}
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
actionsOut[0] = 0;
return new float[] { 3 };
actionsOut[0] = 3;
if (Input.GetKey(KeyCode.W))
else if (Input.GetKey(KeyCode.W))
return new float[] { 1 };
actionsOut[0] = 1;
if (Input.GetKey(KeyCode.A))
else if (Input.GetKey(KeyCode.A))
return new float[] { 4 };
actionsOut[0] = 4;
if (Input.GetKey(KeyCode.S))
else if (Input.GetKey(KeyCode.S))
return new float[] { 2 };
actionsOut[0] = 2;
return new float[] { 0 };
}
public override void OnEpisodeBegin()

2
Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

64
Project/Assets/ML-Agents/Examples/PushBlock/Prefabs/PushBlockArea.prefab


- component: {fileID: 114807072692257076}
- component: {fileID: 114451319691753174}
- component: {fileID: 8964598783836598940}
- component: {fileID: 4081319787948195948}
m_Layer: 0
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: PushBlock
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114505490781873732
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: dea8c4f2604b947e6b7b97750dde87ca, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
ground: {fileID: 1500989011945850}
area: {fileID: 1125452240183160}

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 90
sphereCastRadius: 0.5
rayLength: 12
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 90
m_SphereCastRadius: 0.5
m_RayLength: 12
m_RayLayerMask:
observationStacks: 3
m_ObservationStacks: 3
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &114451319691753174
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: OffsetRayPerceptionSensor
detectableTags:
m_SensorName: OffsetRayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 90
sphereCastRadius: 0.5
rayLength: 12
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 90
m_SphereCastRadius: 0.5
m_RayLength: 12
m_RayLayerMask:
observationStacks: 3
m_ObservationStacks: 3
useWorldPositions: 1
startVerticalOffset: 1.5
endVerticalOffset: 1.5
m_StartVerticalOffset: 1.5
m_EndVerticalOffset: 1.5
--- !u!114 &8964598783836598940
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &4081319787948195948
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1489716781518988}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1500989011945850
GameObject:
m_ObjectHideFlags: 0

18
Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs


AddReward(-1f / maxStep);
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
actionsOut[0] = 0;
return new float[] { 3 };
actionsOut[0] = 3;
if (Input.GetKey(KeyCode.W))
else if (Input.GetKey(KeyCode.W))
return new float[] { 1 };
actionsOut[0] = 1;
if (Input.GetKey(KeyCode.A))
else if (Input.GetKey(KeyCode.A))
return new float[] { 4 };
actionsOut[0] = 4;
if (Input.GetKey(KeyCode.S))
else if (Input.GetKey(KeyCode.S))
return new float[] { 2 };
actionsOut[0] = 2;
return new float[] { 0 };
}
/// <summary>

2
Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

85
Project/Assets/ML-Agents/Examples/Pyramids/Prefabs/AreaPB.prefab


- component: {fileID: 5712624269609438939}
- component: {fileID: 5767481171805996936}
- component: {fileID: 4725417187860315718}
- component: {fileID: 6474351450651730614}
m_Layer: 0
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: Pyramids
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114937736047215868
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: b8db44472779248d3be46895c4d562d5, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
area: {fileID: 1464170487903594}
areaSwitch: {fileID: 1432086782037750}

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!114 &5712624269609438939
MonoBehaviour:

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
- block
- wall
- goal

raysPerDirection: 3
maxRayDegrees: 70
sphereCastRadius: 0.5
rayLength: 35
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 70
m_SphereCastRadius: 0.5
m_RayLength: 35
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &5767481171805996936
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor1
detectableTags:
m_SensorName: RayPerceptionSensor1
m_DetectableTags:
- block
- wall
- goal

raysPerDirection: 3
maxRayDegrees: 65
sphereCastRadius: 0.5
rayLength: 35
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 65
m_SphereCastRadius: 0.5
m_RayLength: 35
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 5
m_StartVerticalOffset: 0
m_EndVerticalOffset: 5
--- !u!114 &4725417187860315718
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor2
detectableTags:
m_SensorName: RayPerceptionSensor2
m_DetectableTags:
- block
- wall
- goal

raysPerDirection: 3
maxRayDegrees: 75
sphereCastRadius: 0.5
rayLength: 35
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 75
m_SphereCastRadius: 0.5
m_RayLength: 35
m_RayLayerMask:
observationStacks: 1
m_ObservationStacks: 1
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 10
m_StartVerticalOffset: 0
m_EndVerticalOffset: 10
--- !u!114 &6474351450651730614
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1131043459059966}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1148882946833254
GameObject:
m_ObjectHideFlags: 0

18
Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs


MoveAgent(vectorAction);
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
actionsOut[0] = 0;
return new float[] { 3 };
actionsOut[0] = 3;
if (Input.GetKey(KeyCode.W))
else if (Input.GetKey(KeyCode.W))
return new float[] { 1 };
actionsOut[0] = 1;
if (Input.GetKey(KeyCode.A))
else if (Input.GetKey(KeyCode.A))
return new float[] { 4 };
actionsOut[0] = 4;
if (Input.GetKey(KeyCode.S))
else if (Input.GetKey(KeyCode.S))
return new float[] { 2 };
actionsOut[0] = 2;
return new float[] { 0 };
}
public override void OnEpisodeBegin()

2
Project/Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

22
Project/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab


- component: {fileID: 114731167133171590}
- component: {fileID: 114955921823023820}
- component: {fileID: 6226801880261327134}
- component: {fileID: 7840105453417110232}
m_Layer: 0
m_Name: Agent
m_TagString: Untagged

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: Reacher
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114955921823023820
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 220b156e3b142406c8b76d4db981d044, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 4000
pendulumA: {fileID: 1644872085946016}
pendulumB: {fileID: 1053261483945176}

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 4
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &7840105453417110232
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1395682910799436}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1644872085946016
GameObject:
m_ObjectHideFlags: 0

6
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs


{
if (m_MaxEpisodes > 0)
{
if (m_NumSteps > m_MaxEpisodes * m_Agent.maxStep)
// For Agents without maxSteps, exit as soon as we've hit the target number of episodes.
// For Agents that specify maxStep, also make sure we've gone at least that many steps.
// Since we exit as soon as *any* Agent hits its target, the maxSteps condition keeps us running
// a bit longer in case there's an early failure.
if (m_Agent.CompletedEpisodes >= m_MaxEpisodes && m_NumSteps > m_MaxEpisodes * m_Agent.maxStep)
{
Application.Quit(0);
}

4
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ProjectSettingsOverrides.cs


Physics.defaultSolverIterations = solverIterations;
Physics.defaultSolverVelocityIterations = solverVelocityIterations;
// Make sure the Academy singleton is initialized first, since it will create the SideChannels.
var academy = Academy.Instance;
SideChannelUtils.GetSideChannel<FloatPropertiesChannel>().RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
}

Physics.defaultSolverVelocityIterations = m_OriginalSolverVelocityIterations;
}
}
}
}
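
This change reflects the new SideChannelUtils entry point: the Academy singleton must be accessed first so that the default side channels exist, after which a FloatPropertiesChannel callback can be registered. A minimal sketch of that pattern, assuming a hypothetical class name and best-effort using directives (the "gravity" callback itself is copied from the diff above):

using MLAgents;
using MLAgents.SideChannels;
using UnityEngine;

public class ExamplePhysicsOverrides : MonoBehaviour
{
    void Awake()
    {
        // Make sure the Academy singleton is initialized first, since it creates the SideChannels.
        // (The local variable exists only to force initialization.)
        var academy = Academy.Instance;

        // Re-apply gravity whenever the Python side pushes a new "gravity" float property.
        SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()
            .RegisterCallback("gravity", f => { Physics.gravity = new Vector3(0, -f, 0); });
    }
}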

5
Project/Assets/ML-Agents/Examples/Soccer/Prefabs/StrikersVsGoalieField.prefab


m_EditorClassIdentifier:
DecisionPeriod: 5
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1100217258374548
GameObject:
m_ObjectHideFlags: 0

m_EditorClassIdentifier:
DecisionPeriod: 5
TakeActionsBetweenDecisions: 1
offsetStep: 0
--- !u!1 &1141134673700168
GameObject:
m_ObjectHideFlags: 0

m_RenderingLayerMask: 1
m_RendererPriority: 0
m_Materials:
- {fileID: 2100000, guid: 89ad1e117353e48db80e27f4f7727dc1, type: 2}
- {fileID: 2100000, guid: 66163cf35956a4be08e801b750c26f33, type: 2}
m_StaticBatchInfo:
firstSubMesh: 0
subMeshCount: 0

m_EditorClassIdentifier:
DecisionPeriod: 5
TakeActionsBetweenDecisions: 1
offsetStep: 0

38
Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs


float m_KickPower;
int m_PlayerIndex;
public SoccerFieldArea area;
// The coefficient for the reward for colliding with a ball. Set using curriculum.
float m_Power;
const float k_Power = 2000f;
float m_Existential;
float m_LateralSpeed;
float m_ForwardSpeed;

public override void Initialize()
{
m_Existential = 1f / maxStep;
m_BehaviorParameters = gameObject.GetComponent<BehaviorParameters>();
if (m_BehaviorParameters.TeamId == (int)Team.Blue)
{

team = Team.Purple;
m_Transform = new Vector3(transform.position.x + 4f, .5f, transform.position.z);
}
m_Power = 2000f;
if (position == Position.Goalie)
{
m_LateralSpeed = 1.0f;

if (position == Position.Goalie)
{
// Existential bonus for Goalies.
AddReward(1f / 3000f);
AddReward(m_Existential);
AddReward(-1f / 3000f);
AddReward(-m_Existential);
timePenalty += -1f / 3000f;
timePenalty -= m_Existential;
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[3];
action[0] = 1f;
actionsOut[0] = 1f;
action[0] = 2f;
actionsOut[0] = 2f;
action[2] = 1f;
actionsOut[2] = 1f;
action[2] = 2f;
actionsOut[2] = 2f;
action[1] = 1f;
actionsOut[1] = 1f;
action[1] = 2f;
actionsOut[1] = 2f;
return action;
}
/// <summary>
/// Used to provide a "kick" to the ball.

var force = m_Power * m_KickPower;
var force = k_Power * m_KickPower;
force = m_Power;
force = k_Power;
// Generic gets curriculum
if (position == Position.Generic)
{
AddReward(.2f * m_BallTouch);
}
AddReward(.2f * m_BallTouch);
var dir = c.contacts[0].point - transform.position;
dir = dir.normalized;
c.gameObject.GetComponent<Rigidbody>().AddForce(dir * force);
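
The AgentSoccer hunk replaces the hard-coded 1f / 3000f existential reward with a value derived from the agent's maxStep, and turns the kick power into a constant. A hedged sketch of that pattern, assuming a hypothetical class and an illustrative placement of the per-step reward (only the k_Power constant and the m_Existential computation mirror the diff above):

using MLAgents;

public class ExampleStrikerAgent : Agent
{
    const float k_Power = 2000f;   // kick force, now a compile-time constant
    float m_Existential;           // per-step existential reward magnitude

    public override void Initialize()
    {
        // Derive the existential reward from the configured episode length
        // instead of hard-coding 1f / 3000f.
        m_Existential = 1f / maxStep;
    }

    public override void OnActionReceived(float[] vectorAction)
    {
        // Illustration: strikers pay a small time penalty every step, while
        // goalies add +m_Existential instead (see the diff above).
        AddReward(-m_Existential);
    }
}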

1001
Project/Assets/ML-Agents/Examples/Soccer/TFModels/Goalie.nn
File diff is too large to display

1001
Project/Assets/ML-Agents/Examples/Soccer/TFModels/Striker.nn
File diff is too large to display

2
Project/Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

11
Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


m_TextComponent.text = score.ToString();
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[3];
action[0] = Input.GetAxis("Horizontal"); // Racket Movement
action[1] = Input.GetKey(KeyCode.Space) ? 1f : 0f; // Racket Jumping
action[2] = Input.GetAxis("Vertical"); // Racket Rotation
return action;
actionsOut[0] = Input.GetAxis("Horizontal"); // Racket Movement
actionsOut[1] = Input.GetKey(KeyCode.Space) ? 1f : 0f; // Racket Jumping
actionsOut[2] = Input.GetAxis("Vertical"); // Racket Rotation
}
void OnCollisionEnter(Collision c)

2
Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo.meta


fileIDToRecycleName:
11400000: Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo
externalObjects: {}
userData: ' (MLAgents.Demonstrations.Demonstration)'
userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}

22
Project/Assets/ML-Agents/Examples/Walker/Prefabs/WalkerPair.prefab


- component: {fileID: 114363722412740164}
- component: {fileID: 114614375190687060}
- component: {fileID: 7095046440131842424}
- component: {fileID: 526281586680617836}
m_Layer: 0
m_Name: WalkerAgent
m_TagString: Untagged

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: Walker
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114363722412740164
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: ccb0f85f0009540d7ad997952e2aed7b, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 5000
target: {fileID: 4085853164035250}
hips: {fileID: 4333477265252406}

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 0
TakeActionsBetweenDecisions: 0
--- !u!114 &526281586680617836
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1800913799254612}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1907539933197724
GameObject:
m_ObjectHideFlags: 0

64
Project/Assets/ML-Agents/Examples/WallJump/Prefabs/WallJumpArea.prefab


- component: {fileID: 114458838850320084}
- component: {fileID: 114227939525648256}
- component: {fileID: 4778045978646539396}
- component: {fileID: 7445449404652947848}
m_Layer: 0
m_Name: Agent
m_TagString: agent

m_InferenceDevice: 0
m_BehaviorType: 0
m_BehaviorName: SmallWallJump
m_TeamID: 0
m_useChildSensors: 1
TeamId: 0
m_UseChildSensors: 1
--- !u!114 &114925928594762506
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 676fca959b8ee45539773905ca71afa1, type: 3}
m_Name:
m_EditorClassIdentifier:
agentParameters:
maxStep: 0
hasUpgradedFromAgentParameters: 1
maxStep: 2000
noWallBrain: {fileID: 11400000, guid: fb2ce36eb40b6480e94ea0b5d7573e47, type: 3}
smallWallBrain: {fileID: 11400000, guid: fb2ce36eb40b6480e94ea0b5d7573e47, type: 3}

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: RayPerceptionSensor
detectableTags:
m_SensorName: RayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 90
sphereCastRadius: 0.5
rayLength: 20
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 90
m_SphereCastRadius: 0.5
m_RayLength: 20
m_RayLayerMask:
observationStacks: 6
m_ObservationStacks: 6
useWorldPositions: 1
startVerticalOffset: 0
endVerticalOffset: 0
m_StartVerticalOffset: 0
m_EndVerticalOffset: 0
--- !u!114 &114227939525648256
MonoBehaviour:
m_ObjectHideFlags: 0

m_Script: {fileID: 11500000, guid: 6bb6b867a41448888c1cd4f99643ad71, type: 3}
m_Name:
m_EditorClassIdentifier:
sensorName: OffsetRayPerceptionSensor
detectableTags:
m_SensorName: OffsetRayPerceptionSensor
m_DetectableTags:
raysPerDirection: 3
maxRayDegrees: 90
sphereCastRadius: 0.5
rayLength: 20
rayLayerMask:
m_RaysPerDirection: 3
m_MaxRayDegrees: 90
m_SphereCastRadius: 0.5
m_RayLength: 20
m_RayLayerMask:
observationStacks: 6
m_ObservationStacks: 6
useWorldPositions: 1
startVerticalOffset: 2.5
endVerticalOffset: 5
m_StartVerticalOffset: 2.5
m_EndVerticalOffset: 5
--- !u!114 &4778045978646539396
MonoBehaviour:
m_ObjectHideFlags: 0

m_Name:
m_EditorClassIdentifier:
DecisionPeriod: 5
RepeatAction: 1
TakeActionsBetweenDecisions: 1
--- !u!114 &7445449404652947848
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1195095783991828}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 3a6da8f78a394c6ab027688eab81e04d, type: 3}
m_Name:
m_EditorClassIdentifier:
--- !u!1 &1264699583886832
GameObject:
m_ObjectHideFlags: 0

14
Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs


}
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
var action = new float[4];
action[1] = 2f;
actionsOut[1] = 2f;
action[0] = 1f;
actionsOut[0] = 1f;
action[1] = 1f;
actionsOut[1] = 1f;
action[0] = 2f;
actionsOut[0] = 2f;
action[3] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
return action;
actionsOut[3] = Input.GetKey(KeyCode.Space) ? 1.0f : 0.0f;
}
// Detect when the agent hits the goal

3
Project/ProjectSettings/GraphicsSettings.asset


- {fileID: 10753, guid: 0000000000000000f000000000000000, type: 0}
- {fileID: 10770, guid: 0000000000000000f000000000000000, type: 0}
- {fileID: 10783, guid: 0000000000000000f000000000000000, type: 0}
- {fileID: 16000, guid: 0000000000000000f000000000000000, type: 0}
- {fileID: 16001, guid: 0000000000000000f000000000000000, type: 0}
- {fileID: 17000, guid: 0000000000000000f000000000000000, type: 0}
m_PreloadedShaders: []
m_SpritesDefaultMaterial: {fileID: 10754, guid: 0000000000000000f000000000000000,
type: 0}

2
Project/ProjectSettings/ProjectVersion.txt


m_EditorVersion: 2018.4.18f1
m_EditorVersion: 2018.4.17f1

2
Project/ProjectSettings/UnityConnectSettings.asset


UnityConnectSettings:
m_ObjectHideFlags: 0
serializedVersion: 1
m_Enabled: 0
m_Enabled: 1
m_TestMode: 0
m_EventOldUrl: https://api.uca.cloud.unity3d.com/v1/events
m_EventUrl: https://cdp.cloud.unity3d.com/v1/events

303
com.unity.ml-agents/CHANGELOG.md


# Changelog
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
and this project adheres to
[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- The `--load` and `--train` command-line flags have been deprecated. Training now happens by default, and
use `--resume` to resume training instead. (#3705)
- The Jupyter notebooks have been removed from the repository.
- Introduced the `SideChannelUtils` to register, unregister and access side channels.
- `Academy.FloatProperties` was removed, please use `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- Removed the multi-agent gym option from the gym wrapper. For multi-agent scenarios, use the [Low Level Python API](Python-API.md).
- The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. If you use `mlagents-learn` for training, this should be a transparent change.
- Added ability to start training (initialize model weights) from a previous run ID. (#3710)
- The internal event `Academy.AgentSetStatus` was renamed to `Academy.AgentPreStep` and made public.
- The offset logic was removed from DecisionRequester.
- The `--load` and `--train` command-line flags have been deprecated. Training
now happens by default, and use `--resume` to resume training instead. (#3705)
- The Jupyter notebooks have been removed from the repository.
- Introduced the `SideChannelUtils` to register, unregister and access side
channels.
- `Academy.FloatProperties` was removed, please use
`SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- Removed the multi-agent gym option from the gym wrapper. For multi-agent
scenarios, use the [Low Level Python API](../docs/Python-API.md).
- The low level Python API has changed. You can look at the document
[Low Level Python API documentation](../docs/Python-API.md) for more
information. If you use `mlagents-learn` for training, this should be a
transparent change.
- Added ability to start training (initialize model weights) from a previous run
ID. (#3710)
- The internal event `Academy.AgentSetStatus` was renamed to
`Academy.AgentPreStep` and made public.
- The offset logic was removed from DecisionRequester.
- The signature of `Agent.Heuristic()` was changed to take a `float[]` as a
parameter, instead of returning the array. This was done to prevent a common
source of error where users would return arrays of the wrong size.
- The communication API version has been bumped up to 1.0.0 and will use
[Semantic Versioning](https://semver.org/) to do compatibility checks for
communication between Unity and the Python process.
- The obsolete `Agent` methods `GiveModel`, `Done`, `InitializeAgent`,
`AgentAction` and `AgentReset` have been removed.
- The GhostTrainer has been extended to support asymmetric games and the asymmetric example environment Strikers Vs. Goalie has been added.
- Format of console output has changed slightly and now matches the name of the model/summary directory. (#3630, #3616)
- Added a feature to allow sending stats from C# environments to TensorBoard (and other python StatsWriters). To do this from your code, use `SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)` (#3660)
- Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
- Timer files now contain a dictionary of metadata, including things like the package version numbers.
- SideChannel IncomingMessages methods now take an optional default argument, which is used when trying to read more data than the message contains.
- The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
- Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
- Running `mlagents-learn` with the same `--run-id` twice will no longer overwrite the existing files. (#3705)
- `StackingSensor` was changed from `internal` visibility to `public`
- Format of console output has changed slightly and now matches the name of the
model/summary directory. (#3630, #3616)
- Added a feature to allow sending stats from C# environments to TensorBoard
(and other python StatsWriters). To do this from your code, use
`SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)`
(#3660)
- Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
- Timer files now contain a dictionary of metadata, including things like the
package version numbers.
- SideChannel IncomingMessages methods now take an optional default argument,
which is used when trying to read more data than the message contains.
- The way that UnityEnvironment decides the port was changed. If no port is
specified, the behavior will depend on the `file_name` parameter. If it is
`None`, 5004 (the editor port) will be used; otherwise 5005 (the base
environment port) will be used.
- Fixed an issue where exceptions from environments provided a returncode of 0.
(#3680)
- Running `mlagents-learn` with the same `--run-id` twice will no longer
overwrite the existing files. (#3705)
- `StackingSensor` was changed from `internal` visibility to `public`
- Updated Barracuda to 0.6.3-preview.
### Bug Fixes
- Fixed a display bug when viewing Demonstration files in the inspector. The
shapes of the observations in the file now display correctly. (#3771)
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
- Fixed the reported entropy values for continuous actions (#3684)
- Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
- Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
- Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
- Fixed an issue where logging output was not visible; logging levels are now set consistently. (#3703)
- Raise the wall in CrawlerStatic scene to prevent Agent from falling off.
(#3650)
- Fixed an issue where specifying `vis_encode_type` was required only for SAC.
(#3677)
- Fixed the reported entropy values for continuous actions (#3684)
- Fixed an issue where switching models using `SetModel()` during training would
use an excessive amount of memory. (#3664)
- Environment subprocesses now close immediately on timeout or wrong API
version. (#3679)
- Fixed an issue in the gym wrapper that would raise an exception if an Agent
called EndEpisode multiple times in the same step. (#3700)
- Fixed an issue where logging output was not visible; logging levels are now
set consistently. (#3703)
- `Agent.CollectObservations` now takes a VectorSensor argument. (#3352, #3389)
- Added `Agent.CollectDiscreteActionMasks` virtual method with a `DiscreteActionMasker` argument to specify which discrete actions are unavailable to the Agent. (#3525)
- Beta support for ONNX export was added. If the `tf2onnx` python package is installed, models will be saved to `.onnx` as well as `.nn` format.
Note that Barracuda 0.6.0 or later is required to import the `.onnx` files properly
- Multi-GPU training and the `--multi-gpu` option has been removed temporarily. (#3345)
- All Sensor related code has been moved to the namespace `MLAgents.Sensors`.
- All SideChannel related code has been moved to the namespace `MLAgents.SideChannels`.
- `BrainParameters` and `SpaceType` have been removed from the public API
- `BehaviorParameters` have been removed from the public API.
- The following methods in the `Agent` class have been deprecated and will be removed in a later release:
- `InitializeAgent()` was renamed to `Initialize()`
- `AgentAction()` was renamed to `OnActionReceived()`
- `AgentReset()` was renamed to `OnEpisodeBegin()`
- `Done()` was renamed to `EndEpisode()`
- `GiveModel()` was renamed to `SetModel()`
- Monitor.cs was moved to Examples. (#3372)
- Automatic stepping for Academy is now controlled from the AutomaticSteppingEnabled property. (#3376)
- The GetEpisodeCount, GetStepCount, and GetTotalStepCount methods of Academy were changed to the EpisodeCount, StepCount, and TotalStepCount properties, respectively. (#3376)
- Several classes were changed from public to internal visibility. (#3390)
- Academy.RegisterSideChannel and UnregisterSideChannel methods were added. (#3391)
- A tutorial on adding custom SideChannels was added (#3391)
- The stepping logic for the Agent and the Academy has been simplified (#3448)
- Updated Barracuda to 0.6.1-preview.
- The interface for `RayPerceptionSensor.PerceiveStatic()` was changed to take an input class and write to an output class, and the method was renamed to `Perceive()`.
- The checkpoint file suffix was changed from `.cptk` to `.ckpt` (#3470)
- The command-line argument used to determine the port that an environment will listen on was changed from `--port` to `--mlagents-port`.
- `DemonstrationRecorder` can now record observations outside of the editor.
- `DemonstrationRecorder` now has an optional path for the demonstrations. This will default to `Application.dataPath` if not set.
- `DemonstrationStore` was changed to accept a `Stream` for its constructor, and was renamed to `DemonstrationWriter`
- The method `GetStepCount()` on the Agent class has been replaced with the property getter `StepCount`
- `RayPerceptionSensorComponent` and related classes now display the debug gizmos whenever the Agent is selected (not just Play mode).
- Most fields on `RayPerceptionSensorComponent` can now be changed while the editor is in Play mode. The exceptions to this are fields that affect the number of observations.
- Most fields on `CameraSensorComponent` and `RenderTextureSensorComponent` were changed to private and replaced by properties with the same name.
- Unused static methods from the `Utilities` class (ShiftLeft, ReplaceRange, AddRangeNoAlloc, and GetSensorFloatObservationSize) were removed.
- The `Agent` class is no longer abstract.
- SensorBase was moved out of the package and into the Examples directory.
- `AgentInfo.actionMasks` has been renamed to `AgentInfo.discreteActionMasks`.
- `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- The `IFloatProperties` interface has been removed.
- Fixed #3579.
- Improved inference performance for models with multiple action branches. (#3598)
- Fixed an issue when using GAIL with less than `batch_size` number of demonstrations. (#3591)
- The interfaces to the `SideChannel` classes (on C# and python) have changed to use new `IncomingMessage` and `OutgoingMessage` classes. These should make reading and writing data to the channel easier; see the sketch after this list. (#3596)
- Updated the ExpertPyramid.demo example demonstration file (#3613)
- Updated project version for example environments to 2018.4.18f1. (#3618)
- Changed the Product Name in the example environments to remove spaces, so that the default build executable file doesn't contain spaces. (#3612)
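
As a rough illustration of the `IncomingMessage`/`OutgoingMessage` interface and the `Academy.RegisterSideChannel` method mentioned above, here is a hypothetical string-echo side channel. The base-class members used (`ChannelId`, `OnMessageReceived`, `QueueMessageToSend`) are assumed to match this release; treat it as a sketch, not the tutorial code:

```csharp
using System;
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;

// Hypothetical side channel that receives strings from Python and echoes them back.
public class StringEchoSideChannel : SideChannel
{
    public StringEchoSideChannel()
    {
        // The GUID is arbitrary but must match the channel id used on the Python side.
        ChannelId = new Guid("621f0a70-4f87-11ea-a6bf-784f4387d1f7");
    }

    public override void OnMessageReceived(IncomingMessage msg)
    {
        var received = msg.ReadString();
        Debug.Log("Received from Python: " + received);

        // Reply through the new OutgoingMessage API (#3596).
        using (var outMsg = new OutgoingMessage())
        {
            outMsg.WriteString("ack: " + received);
            QueueMessageToSend(outMsg);
        }
    }
}
```

Registration would then happen once at startup, e.g. `Academy.Instance.RegisterSideChannel(new StringEchoSideChannel());`, with a matching `UnregisterSideChannel` call on teardown (#3391).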
- Fixed an issue which caused self-play training sessions to consume a lot of memory. (#3451)
- Fixed an IndexError when using GAIL or behavioral cloning with demonstrations recorded with 0.14.0 or later (#3464)
- Fixed a bug with the rewards of multiple Agents in the gym interface (#3471, #3496)
- A new self-play mechanism for training agents in adversarial scenarios was added (#3194)
- Tennis and Soccer environments were refactored to enable training with self-play (#3194, #3331)
- UnitySDK folder was split into a Unity Package (com.unity.ml-agents) and our examples were moved to the Project folder (#3267)
- In order to reduce the size of the API, several classes and methods were marked as internal or private. Some public fields on the Agent were trimmed (#3342, #3353, #3269)
- Decision Period and on-demand decision checkboxes were removed from the Agent. On-demand decision is now the default. (#3243)
- Calling Done() on the Agent will reset it immediately and call the AgentReset virtual method (#3291, #3242)
- The "Reset on Done" setting in AgentParameters was removed; this is now always true. AgentOnDone virtual method on the Agent was removed (#3311, #3222)
- Trainer steps are now counted per-Agent, not per-environment as in previous versions. For instance, if you have 10 Agents in the scene, 20 environment steps now correspond to 200 steps as printed in the terminal and in Tensorboard (#3113)
- Curriculum config files are now YAML formatted and all curricula for a training run are combined into a single file (#3186)
- ML-Agents components, such as BehaviorParameters and various Sensor implementations, now appear in the Components menu (#3231)
- Exceptions are now raised in Unity (in debug mode only) if NaN observations or rewards are passed (#3221)
- RayPerception MonoBehavior, which was previously deprecated, was removed (#3304)
- Uncompressed visual (i.e. 3d float arrays) observations are now supported. CameraSensorComponent and RenderTextureSensor now have an option to write uncompressed observations (#3148)
- Agent’s handling of observations during training was improved so that an extra copy of the observations is no longer maintained (#3229)
- Error message for missing trainer config files was improved to include the absolute path (#3230)
- A bug that caused RayPerceptionSensor to behave inconsistently with transforms that have non-1 scale was fixed (#3321)
- Some small bugfixes to tensorflow_to_barracuda.py were backported from the barracuda release (#3341)
- Base port in the jupyter notebook example was updated to use the same port that the editor uses (#3283)
### This is the first release of *Unity Package ML-Agents*.
*Short description of this release*

78
com.unity.ml-agents/Editor/DemonstrationDrawer.cs


using System.Collections.Generic;
using System.Text;
using UnityEditor;
using MLAgents.Demonstrations;

namespace MLAgents.Editor
{
/// <summary>
/// Renders a custom UI for Demonstration Scriptable Object.
/// Renders a custom UI for DemonstrationSummary ScriptableObject.
[CustomEditor(typeof(Demonstration))]
[CustomEditor(typeof(DemonstrationSummary))]
SerializedProperty m_ObservationShapes;
m_ObservationShapes = serializedObject.FindProperty("observationSummaries");
}
/// <summary>

{
var nameProp = property.FindPropertyRelative("demonstrationName");
var expProp = property.FindPropertyRelative("numberExperiences");
var epiProp = property.FindPropertyRelative("numberEpisodes");
var rewProp = property.FindPropertyRelative("meanReward");
var experiencesProp = property.FindPropertyRelative("numberSteps");
var episodesProp = property.FindPropertyRelative("numberEpisodes");
var rewardsProp = property.FindPropertyRelative("meanReward");
var expLabel = expProp.displayName + ": " + expProp.intValue;
var epiLabel = epiProp.displayName + ": " + epiProp.intValue;
var rewLabel = rewProp.displayName + ": " + rewProp.floatValue;
var experiencesLabel = experiencesProp.displayName + ": " + experiencesProp.intValue;
var episodesLabel = episodesProp.displayName + ": " + episodesProp.intValue;
var rewardsLabel = rewardsProp.displayName + ": " + rewardsProp.floatValue;
EditorGUILayout.LabelField(expLabel);
EditorGUILayout.LabelField(epiLabel);
EditorGUILayout.LabelField(rewLabel);
EditorGUILayout.LabelField(experiencesLabel);
EditorGUILayout.LabelField(episodesLabel);
EditorGUILayout.LabelField(rewardsLabel);
/// Constructs label for action size array.
/// Constructs label for a serialized integer array.
static string BuildActionArrayLabel(SerializedProperty actionSizeProperty)
static string BuildIntArrayLabel(SerializedProperty actionSizeProperty)
{
var actionSize = actionSizeProperty.arraySize;
var actionLabel = new StringBuilder("[ ");

}
/// <summary>
/// Renders Inspector UI for Brain Parameters of Demonstration.
/// Renders Inspector UI for BrainParameters of a DemonstrationSummary.
/// Only the Action size and type are used from the BrainParameters.
void MakeBrainParametersProperty(SerializedProperty property)
void MakeActionsProperty(SerializedProperty property)
var vecObsSizeProp = property.FindPropertyRelative("vectorObservationSize");
var numStackedProp = property.FindPropertyRelative("numStackedVectorObservations");
var vecObsSizeLabel = vecObsSizeProp.displayName + ": " + vecObsSizeProp.intValue;
var numStackedLabel = numStackedProp.displayName + ": " + numStackedProp.intValue;
actSizeProperty.displayName + ": " + BuildActionArrayLabel(actSizeProperty);
actSizeProperty.displayName + ": " + BuildIntArrayLabel(actSizeProperty);
EditorGUILayout.LabelField(vecObsSizeLabel);
EditorGUILayout.LabelField(numStackedLabel);
/// <summary>
/// Render the observation shapes of a DemonstrationSummary.
/// </summary>
/// <param name="obsSummariesProperty"></param>
void MakeObservationsProperty(SerializedProperty obsSummariesProperty)
{
var shapesLabels = new List<string>();
var numObservations = obsSummariesProperty.arraySize;
for (var i = 0; i < numObservations; i++)
{
var summary = obsSummariesProperty.GetArrayElementAtIndex(i);
var shapeProperty = summary.FindPropertyRelative("shape");
shapesLabels.Add(BuildIntArrayLabel(shapeProperty));
}
var shapeLabel = $"Shapes: {string.Join(", ", shapesLabels)}";
EditorGUILayout.LabelField(shapeLabel);
}
EditorGUI.indentLevel++;
EditorGUILayout.LabelField("Brain Parameters", EditorStyles.boldLabel);
MakeBrainParametersProperty(m_BrainParameters);
EditorGUI.indentLevel--;
EditorGUILayout.LabelField("Observations", EditorStyles.boldLabel);
EditorGUI.indentLevel++;
MakeObservationsProperty(m_ObservationShapes);
EditorGUI.indentLevel--;
EditorGUILayout.LabelField("Actions", EditorStyles.boldLabel);
EditorGUI.indentLevel++;
MakeActionsProperty(m_BrainParameters);
EditorGUI.indentLevel--;
serializedObject.ApplyModifiedProperties();
}
}

26
com.unity.ml-agents/Editor/DemonstrationImporter.cs


using System;
using System.Collections.Generic;
using System.IO;
using MLAgents.CommunicatorObjects;
using UnityEditor;

try
{
// Read first two proto objects containing metadata and brain parameters.
// Read first three proto objects containing metadata, brain parameters, and observations.
Stream reader = File.OpenRead(ctx.assetPath);
var metaDataProto = DemonstrationMetaProto.Parser.ParseDelimitedFrom(reader);

var brainParamsProto = BrainParametersProto.Parser.ParseDelimitedFrom(reader);
var brainParameters = brainParamsProto.ToBrainParameters();
// Read the first AgentInfoActionPair so that we can get the observation sizes.
List<ObservationSummary> observationSummaries;
try
{
var agentInfoActionPairProto = AgentInfoActionPairProto.Parser.ParseDelimitedFrom(reader);
observationSummaries = agentInfoActionPairProto.GetObservationSummaries();
}
catch
{
// Just in case there weren't any AgentInfoActionPair or they couldn't be read.
observationSummaries = new List<ObservationSummary>();
}
var demonstration = ScriptableObject.CreateInstance<Demonstration>();
demonstration.Initialize(brainParameters, metaData);
userData = demonstration.ToString();
var demonstrationSummary = ScriptableObject.CreateInstance<DemonstrationSummary>();
demonstrationSummary.Initialize(brainParameters, metaData, observationSummaries);
userData = demonstrationSummary.ToString();
ctx.AddObjectToAsset(ctx.assetPath, demonstration, texture);
ctx.SetMainObject(demonstration);
ctx.AddObjectToAsset(ctx.assetPath, demonstrationSummary, texture);
ctx.SetMainObject(demonstrationSummary);
}
catch
{

2
com.unity.ml-agents/Runtime/Academy.cs


void InitializeEnvironment()
{
TimerStack.Instance.AddMetadata("communication_protocol_version", k_ApiVersion);
TimerStack.Instance.AddMetadata("package_version", k_PackageVersion);
TimerStack.Instance.AddMetadata("com.unity.ml-agents_version", k_PackageVersion);
EnableAutomaticStepping();

96
com.unity.ml-agents/Runtime/Agent.cs


/// their own experience.
int m_StepCount;
/// Number of times the Agent has completed an episode.
int m_CompletedEpisodes;
/// Episode identifier each agent receives. It is used
/// to separate between different agents in the environment.
/// This Id will be changed every time the Agent resets.

if (doneReason != DoneReason.Disabled)
{
// We don't want to udpate the reward stats when the Agent is disabled, because this will make
// We don't want to update the reward stats when the Agent is disabled, because this will make
m_CompletedEpisodes++;
UpdateRewardStats();
}

m_RequestDecision = false;
}
[Obsolete("GiveModel() has been deprecated, use SetModel() instead.")]
public void GiveModel(
string behaviorName,
NNModel model,
InferenceDevice inferenceDevice = InferenceDevice.CPU)
{
SetModel(behaviorName, model, inferenceDevice);
Array.Clear(m_Info.storedVectorActions, 0, m_Info.storedVectorActions.Length);
}
/// <summary>

}
/// <summary>
/// Returns the number of episodes that the Agent has completed (either <see cref="Agent.EndEpisode()"/>
/// was called, or maxSteps was reached).
/// </summary>
/// <returns>
/// Current episode count.
/// </returns>
public int CompletedEpisodes
{
get { return m_CompletedEpisodes; }
}
/// <summary>
/// Overrides the current step reward of the agent and updates the episode
/// reward accordingly.
/// </summary>

TimerStack.Instance.SetGauge(gaugeName, GetCumulativeReward());
}
[Obsolete("Done() has been deprecated, use EndEpisode() instead.")]
public void Done()
{
EndEpisode();
}
/// <summary>
/// Sets the done flag to true.
/// </summary>

}
}
[Obsolete("InitializeAgent() has been deprecated, use Initialize() instead.")]
public virtual void InitializeAgent()
{
}
/// <summary>
/// Initializes the agent, called once when the agent is enabled. Can be
/// left empty if there is no special, unique set-up behavior for the

/// One sample use is to store local references to other objects in the
/// scene which would facilitate computing this agents observation.
/// </remarks>
public virtual void Initialize()
{
#pragma warning disable 0618
InitializeAgent();
#pragma warning restore 0618
}
public virtual void Initialize(){}
/// <summary>
/// When the Agent uses Heuristics, it will call this method every time it

/// <returns> A float array corresponding to the next action of the Agent
/// </returns>
public virtual float[] Heuristic()
public virtual void Heuristic(float[] actionsOut)
var param = m_PolicyFactory.brainParameters;
return new float[param.numActions];
Array.Clear(actionsOut, 0, actionsOut.Length);
}
/// <summary>

return;
}
m_Info.storedVectorActions = m_Action.vectorActions;
if (m_Info.done)
{
Array.Clear(m_Info.storedVectorActions, 0, m_Info.storedVectorActions.Length);
}
else
{
Array.Copy(m_Action.vectorActions, m_Info.storedVectorActions, m_Action.vectorActions.Length);
}
m_ActionMasker.ResetMask();
UpdateSensors();
using (TimerStack.Instance.Scoped("CollectObservations"))

{
}
[Obsolete("AgentAction() has been deprecated, use OnActionReceived() instead.")]
public virtual void AgentAction(float[] vectorAction)
{
}
/// <summary>
/// Specifies the agent behavior at every step based on the provided
/// action.

/// will be of length 1.
/// </param>
public virtual void OnActionReceived(float[] vectorAction)
{
#pragma warning disable 0618
AgentAction(m_Action.vectorActions);
#pragma warning restore 0618
}
[Obsolete("AgentReset() has been deprecated, use OnEpisodeBegin() instead.")]
public virtual void AgentReset()
{
}
public virtual void OnActionReceived(float[] vectorAction){}
/// <summary>
/// Specifies the agent behavior when being reset, which can be due to

public virtual void OnEpisodeBegin()
{
#pragma warning disable 0618
AgentReset();
#pragma warning restore 0618
}
public virtual void OnEpisodeBegin(){}
/// <summary>
/// Returns the last action that was decided on by the Agent

void DecideAction()
{
m_Action.vectorActions = m_Brain?.DecideAction();
}
var action = m_Brain?.DecideAction();
if (action == null)
{
Array.Clear(m_Action.vectorActions, 0, m_Action.vectorActions.Length);
}
else
{
Array.Copy(action, m_Action.vectorActions, action.Length);
}
}
}

68
com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs


{
internal static class GrpcExtensions
{
#region AgentInfo
/// <summary>
/// Converts a AgentInfo to a protobuf generated AgentInfoActionPairProto
/// </summary>

}
/// <summary>
/// Get summaries for the observations in the AgentInfo part of the AgentInfoActionPairProto.
/// </summary>
/// <param name="infoActionPair"></param>
/// <returns></returns>
public static List<ObservationSummary> GetObservationSummaries(this AgentInfoActionPairProto infoActionPair)
{
List<ObservationSummary> summariesOut = new List<ObservationSummary>();
var agentInfo = infoActionPair.AgentInfo;
foreach (var obs in agentInfo.Observations)
{
var summary = new ObservationSummary();
summary.shape = obs.Shape.ToArray();
summariesOut.Add(summary);
}
return summariesOut;
}
#endregion
#region BrainParameters
/// <summary>
/// Converts a Brain into to a Protobuf BrainInfoProto so it can be sent
/// </summary>
/// <returns>The BrainInfoProto generated.</returns>

}
/// <summary>
/// Convert a BrainParametersProto to a BrainParameters struct.
/// </summary>
/// <param name="bpp">An instance of a brain parameters protobuf object.</param>
/// <returns>A BrainParameters struct.</returns>
public static BrainParameters ToBrainParameters(this BrainParametersProto bpp)
{
var bp = new BrainParameters
{
vectorActionSize = bpp.VectorActionSize.ToArray(),
vectorActionDescriptions = bpp.VectorActionDescriptions.ToArray(),
vectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceType
};
return bp;
}
#endregion
#region DemonstrationMetaData
/// <summary>
/// Convert metadata object to proto object.
/// </summary>
public static DemonstrationMetaProto ToProto(this DemonstrationMetaData dm)

ApiVersion = DemonstrationMetaData.ApiVersion,
MeanReward = dm.meanReward,
NumberSteps = dm.numberExperiences,
NumberSteps = dm.numberSteps,
NumberEpisodes = dm.numberEpisodes,
DemonstrationName = dm.demonstrationName
};

var dm = new DemonstrationMetaData
{
numberEpisodes = demoProto.NumberEpisodes,
numberExperiences = demoProto.NumberSteps,
numberSteps = demoProto.NumberSteps,
meanReward = demoProto.MeanReward,
demonstrationName = demoProto.DemonstrationName
};

}
return dm;
}
/// <summary>
/// Convert a BrainParametersProto to a BrainParameters struct.
/// </summary>
/// <param name="bpp">An instance of a brain parameters protobuf object.</param>
/// <returns>A BrainParameters struct.</returns>
public static BrainParameters ToBrainParameters(this BrainParametersProto bpp)
{
var bp = new BrainParameters
{
vectorActionSize = bpp.VectorActionSize.ToArray(),
vectorActionDescriptions = bpp.VectorActionDescriptions.ToArray(),
vectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceType
};
return bp;
}
#endregion
public static UnityRLInitParameters ToUnityRLInitParameters(this UnityRLInitializationInputProto inputProto)
{

};
}
#region AgentAction
public static AgentAction ToAgentAction(this AgentActionProto aap)
{
return new AgentAction

}
return agentActions;
}
#endregion
#region Observations
public static ObservationProto ToProto(this Observation obs)
{
ObservationProto obsProto = null;

observationProto.Shape.AddRange(shape);
return observationProto;
}
#endregion
}
}

51
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


#region Initialization
internal static bool CheckCommunicationVersionsAreCompatible(
string unityCommunicationVersion,
string pythonApiVersion,
string pythonLibraryVersion)
{
var unityVersion = new Version(unityCommunicationVersion);
var pythonVersion = new Version(pythonApiVersion);
if (unityVersion.Major == 0)
{
if (unityVersion.Major != pythonVersion.Major || unityVersion.Minor != pythonVersion.Minor)
{
return false;
}
}
else if (unityVersion.Major != pythonVersion.Major)
{
return false;
}
else if (unityVersion.Minor != pythonVersion.Minor)
{
// Even if we initialize, we still want to check to make sure that we inform users of minor version
// changes. This will surface any features that may not work due to minor version incompatibilities.
Debug.LogWarningFormat(
"WARNING: The communication API versions between Unity and python differ at the minor version level. " +
"Python API: {0}, Unity API: {1} Python Library Version: {2} .\n" +
"This means that some features may not work unless you upgrade the package with the lower version." +
"Please find the versions that work best together from our release page.\n" +
"https://github.com/Unity-Technologies/ml-agents/releases",
pythonApiVersion, unityCommunicationVersion, pythonLibraryVersion
);
}
return true;
}
/// <summary>
/// Sends the initialization parameters through the Communicator.
/// Is used by the academy to send initialization parameters to the communicator.

},
out input);
var pythonCommunicationVersion = initializationInput.RlInitializationInput.CommunicationVersion;
var pythonPackageVersion = initializationInput.RlInitializationInput.PackageVersion;
var unityCommunicationVersion = initParameters.unityCommunicationVersion;
var communicationIsCompatible = CheckCommunicationVersionsAreCompatible(unityCommunicationVersion,
pythonCommunicationVersion,
pythonPackageVersion);
var pythonCommunicationVersion = initializationInput.RlInitializationInput.CommunicationVersion;
var pythonPackageVersion = initializationInput.RlInitializationInput.PackageVersion;
if (pythonCommunicationVersion != initParameters.unityCommunicationVersion)
if (!communicationIsCompatible)
"Communication protocol between python ({0}) and Unity ({1}) don't match. " +
"Python library version: {2}.",
"Communication protocol between python ({0}) and Unity ({1}) have different " +
"versions which make them incompatible. Python library version: {2}.",
pythonCommunicationVersion, initParameters.unityCommunicationVersion,
pythonPackageVersion
);

2
com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs


}
// Increment meta-data counters.
m_MetaData.numberExperiences++;
m_MetaData.numberSteps++;
m_CumulativeReward += info.reward;
if (info.done)
{

8
com.unity.ml-agents/Runtime/Policies/BehaviorParameters.cs


get { return m_BehaviorName + "?team=" + TeamId; }
}
internal IPolicy GeneratePolicy(Func<float[]> heuristic)
internal IPolicy GeneratePolicy(HeuristicPolicy.ActionGenerator heuristic)
return new HeuristicPolicy(heuristic);
return new HeuristicPolicy(heuristic, m_BrainParameters.numActions);
case BehaviorType.InferenceOnly:
{
if (m_Model == null)

}
else
{
return new HeuristicPolicy(heuristic);
return new HeuristicPolicy(heuristic, m_BrainParameters.numActions);
return new HeuristicPolicy(heuristic);
return new HeuristicPolicy(heuristic, m_BrainParameters.numActions);
}
}

22
com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs


/// </summary>
internal class HeuristicPolicy : IPolicy
{
Func<float[]> m_Heuristic;
public delegate void ActionGenerator(float[] actionsOut);
ActionGenerator m_Heuristic;
int m_numActions;
bool m_Done;
bool m_DecisionRequested;
WriteAdapter m_WriteAdapter = new WriteAdapter();
NullList m_NullList = new NullList();

public HeuristicPolicy(Func<float[]> heuristic)
public HeuristicPolicy(ActionGenerator heuristic, int numActions)
m_numActions = numActions;
m_LastDecision = new float[m_numActions];
}
/// <inheritdoc />

if (!info.done)
{
m_LastDecision = m_Heuristic.Invoke();
}
m_Done = info.done;
m_DecisionRequested = true;
if (!m_Done && m_DecisionRequested)
{
m_Heuristic.Invoke(m_LastDecision);
}
m_DecisionRequested = false;
return m_LastDecision;
}

35
com.unity.ml-agents/Runtime/Timer.cs


}
/// <summary>
/// Tracks the most recent value of a metric. This is analogous to gauges in statsd.
/// Tracks the most recent value of a metric. This is analogous to gauges in statsd and Prometheus.
/// </summary>
[DataContract]
internal class GaugeNode

/// <summary>
/// The most recent value that the gauge was set to.
/// </summary>
/// <summary>
/// The smallest value that has been seen for the gauge since it was created.
/// </summary>
/// <summary>
/// The largest value that has been seen for the gauge since it was created.
/// </summary>
/// <summary>
/// The exponential moving average of the gauge value. This will take all values into account,
/// but weights older values less as more values are added.
/// </summary>
/// <summary>
/// The running average of all gauge values.
/// </summary>
[DataMember]
public float runningAverage;
/// <summary>
/// The number of times the gauge has been updated.
/// </summary>
runningAverage = value;
minValue = value;
maxValue = value;
count = 1;

{
++count;
++count;
// Update running average - see https://www.johndcook.com/blog/standard_deviation/ for formula.
runningAverage = runningAverage + (newValue - runningAverage) / count;
}
}

4
com.unity.ml-agents/Tests/Editor/BehaviorParameterTests.cs


[TestFixture]
public class BehaviorParameterTests
{
static float[] DummyHeuristic()
static void DummyHeuristic(float[] actionsOut)
return null;
// No-op
}
[Test]

6
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


agentActionCallsForEpisode = 0;
}
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
return new float[0];
}
}

var expectedAgentActionForEpisode = 0;
var expectedCollectObsCalls = 0;
var expectedCollectObsCallsForEpisode = 0;
var expectedCompletedEpisodes = 0;
var expectedSensorResetCalls = 0;
for (var i = 0; i < 15; i++)

expectedAgentActionForEpisode = 0;
expectedCollectObsCallsForEpisode = 0;
expectedAgentStepCount = 0;
expectedCompletedEpisodes++;
expectedSensorResetCalls++;
expectedCollectObsCalls += 1;
}

Assert.AreEqual(expectedAgentActionForEpisode, agent1.agentActionCallsForEpisode);
Assert.AreEqual(expectedCollectObsCalls, agent1.collectObservationsCalls);
Assert.AreEqual(expectedCollectObsCallsForEpisode, agent1.collectObservationsCallsForEpisode);
Assert.AreEqual(expectedCompletedEpisodes, agent1.CompletedEpisodes);
Assert.AreEqual(expectedSensorResetCalls, agent1.sensor1.numResetCalls);
}
}

43
com.unity.ml-agents/Tests/Editor/TimerTest.cs


myTimer.Reset();
Assert.AreEqual(myTimer.RootNode.Children, null);
}
[Test]
public void TestGauges()
{
TimerStack myTimer = TimerStack.Instance;
myTimer.Reset();
// Simple test - adding 1's should keep that for the weighted and running averages.
myTimer.SetGauge("one", 1.0f);
var oneNode = myTimer.RootNode.Gauges["one"];
Assert.AreEqual(oneNode.weightedAverage, 1.0f);
Assert.AreEqual(oneNode.runningAverage, 1.0f);
for (int i = 0; i < 10; i++)
{
myTimer.SetGauge("one", 1.0f);
}
Assert.AreEqual(oneNode.weightedAverage, 1.0f);
Assert.AreEqual(oneNode.runningAverage, 1.0f);
// Try some more interesting values
myTimer.SetGauge("increasing", 1.0f);
myTimer.SetGauge("increasing", 2.0f);
myTimer.SetGauge("increasing", 3.0f);
myTimer.SetGauge("decreasing", 3.0f);
myTimer.SetGauge("decreasing", 2.0f);
myTimer.SetGauge("decreasing", 1.0f);
var increasingNode = myTimer.RootNode.Gauges["increasing"];
var decreasingNode = myTimer.RootNode.Gauges["decreasing"];
// Expect the running average to be (roughly) the same,
// but weighted averages will be biased differently.
Assert.AreEqual(increasingNode.runningAverage, 2.0f);
Assert.AreEqual(decreasingNode.runningAverage, 2.0f);
// The older values are actually weighted more heavily, so we expect the
// increasing series to have a lower moving average.
Assert.Less(increasingNode.weightedAverage, decreasingNode.weightedAverage);
}
}
}

8
com.unity.ml-agents/Tests/Runtime/RuntimeAPITest.cs


{
public int numHeuristicCalls;
public override float[] Heuristic()
public override void Heuristic(float[] actionsOut)
return base.Heuristic();
base.Heuristic(actionsOut);
}// Simple SensorComponent that sets up a StackingSensor
}
// Simple SensorComponent that sets up a StackingSensor
public class StackingComponent : SensorComponent
{
public SensorComponent wrappedComponent;

2
com.unity.ml-agents/package.json


"unity": "2018.4",
"description": "Add interactivity to your game with Machine Learning Agents trained using Deep Reinforcement Learning.",
"dependencies": {
"com.unity.barracuda": "0.6.1-preview"
"com.unity.barracuda": "0.6.3-preview"
}
}

92
docs/FAQ.md


## Installation problems
### Tensorflow dependency
ML Agents requires TensorFlow; if you don't already have it installed, `pip` will try to install it when you install
the ml-agents package.
it means that there is no version of TensorFlow for your python environment. Some known potential causes are:
* You're using 32-bit python instead of 64-bit. See the answer [here](https://stackoverflow.com/a/1405971/224264)
for how to tell which you have installed.
* You're using python 3.8. Tensorflow plans to release packages for this as soon as possible; see
[this issue](https://github.com/tensorflow/tensorflow/issues/33374) for more details.
* You have the `tensorflow-gpu` package installed. This is equivalent to `tensorflow`, however `pip` doesn't recognize
this. The best way to resolve this is to update to `tensorflow==1.15.0` which provides GPU support in the same package
(see the [release notes](https://github.com/tensorflow/tensorflow/issues/33374) for more details.)
* You're on another architecture (e.g. ARM) which requires vendor provided packages.
In all of these cases, the issue is a pip/python environment setup issue. Please search the tensorflow github issues
for similar problems and solutions before creating a new issue.
## Scripting Runtime Environment not setup correctly
If you haven't switched your scripting runtime version from .NET 3.5 to .NET 4.6
or .NET 4.x, you will see such error message:
```console
error CS1061: Type `System.Text.StringBuilder' does not contain a definition for `Clear' and no extension method `Clear' of type `System.Text.StringBuilder' could be found. Are you missing an assembly reference?
```
This is because .NET 3.5 doesn't support method Clear() for StringBuilder, refer
to [Setting Up The ML-Agents Toolkit Within
Unity](Installation.md#setting-up-ml-agent-within-unity) for solution.
If you directly import your Unity environment without building it in the
editor, you might need to give it additional permissions to execute it.
If you receive such a permission error on macOS, run:

```
On Windows, you can find
[instructions](<https://technet.microsoft.com/en-us/library/cc754344(v=ws.11).aspx>).
## Environment Connection Timeout

There may be a number of possible causes:
- _Cause_: There may be no agent in the scene
- _Cause_: On OSX, the firewall may be preventing communication with the
- _Cause_: An error happened in the Unity Environment preventing communication.
_Solution_: Look into the
[log files](https://docs.unity3d.com/Manual/LogFiles.html) generated by the
Unity Environment to figure out what error happened.
- _Cause_: You have assigned `HTTP_PROXY` and `HTTPS_PROXY` values in your
If you receive an exception `"Couldn't launch new environment because
communication port {} is still in use. "`, you can change the worker number in
the Python script when calling
```python
UnityEnvironment(file_name=filename, worker_id=X)

If you receive a message `Mean reward : nan` when attempting to train a model
using PPO, this is due to the episodes of the Learning Environment not
terminating. In order to address this, set `Max Steps` for the
Agents within the Scene Inspector to a value greater than 0. Alternatively, it
is possible to manually set `done` conditions for episodes from within scripts
for custom episode-terminating events.
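
As a minimal sketch of the second option, manually ending an episode from a script might look like the following; the class, the condition field, and the reward value are illustrative assumptions rather than code from the toolkit:

```csharp
using MLAgents;

public class CustomTerminationAgent : Agent
{
    bool m_GoalReached;  // set elsewhere by your game logic (illustrative)

    public override void OnActionReceived(float[] vectorAction)
    {
        // ... apply the actions and any step rewards here ...

        if (m_GoalReached)
        {
            SetReward(1f);
            EndEpisode();  // marks the episode as done so a mean reward can be reported
        }
    }
}
```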
## Problems with training on AWS
Please refer to [Training on Amazon Web Service FAQ](Training-on-Amazon-Web-Service.md#faq)
# Known Issues
## Release 0.10.0
* ml-agents 0.10.0 and earlier were incompatible with TensorFlow 1.15.0; the graph could contain
an operator that `tensorflow_to_barracuda` didn't handle. This was fixed in the 0.11.0 release.

346
docs/Getting-Started.md


# Getting Started Guide
This guide walks through the end-to-end process of opening an ML-Agents
toolkit example environment in Unity, building the Unity executable, training an
Agent in it, and finally embedding the trained model into the Unity environment.
The ML-Agents toolkit includes a number of [example
environments](Learning-Environment-Examples.md) which you can examine to help
understand the different ways in which the ML-Agents toolkit can be used. These
environments can also serve as templates for new environments or as ways to test
new ML algorithms. After reading this tutorial, you should be able to explore
and train the example environments.
If you are not familiar with the [Unity Engine](https://unity3d.com/unity), we
highly recommend the [Roll-a-ball
tutorial](https://unity3d.com/learn/tutorials/s/roll-ball-tutorial) to learn all
the basic concepts first.
This guide walks through the end-to-end process of opening one of our
[example environments](Learning-Environment-Examples.md) in Unity, training an
Agent in it, and embedding the trained model into the Unity environment. After
reading this tutorial, you should be able to train any of the example
environments. If you are not familiar with the
[Unity Engine](https://unity3d.com/unity), view our
[Background: Unity](Background-Unity.md) page for helpful pointers.
Additionally, if you're not familiar with machine learning, view our
[Background: Machine Learning](Background-Machine-Learning.md) page for a brief
overview and helpful pointers.
This guide uses the **3D Balance Ball** environment to teach the basic concepts and
usage patterns of ML-Agents. 3D Balance Ball
contains a number of agent cubes and balls (which are all copies of each other).
Each agent cube tries to keep its ball from falling by rotating either
horizontally or vertically. In this environment, an agent cube is an **Agent** that
receives a reward for every step that it balances the ball. An agent is also
penalized with a negative reward for dropping the ball. The goal of the training
process is to have the agents learn to balance the ball on their head.
For this guide, we'll use the **3D Balance Ball** environment which contains a
number of agent cubes and balls (which are all copies of each other). Each agent
cube tries to keep its ball from falling by rotating either horizontally or
vertically. In this environment, an agent cube is an **Agent** that receives a
reward for every step that it balances the ball. An agent is also penalized with
a negative reward for dropping the ball. The goal of the training process is to
have the agents learn to balance the ball on their head.
In order to install and set up the ML-Agents toolkit, the Python dependencies
and Unity, see the [installation instructions](Installation.md).
Depending on your version of Unity, it may be necessary to change the **Scripting Runtime Version** of your project. This can be done as follows:
If you haven't already, follow the [installation instructions](Installation.md).
Afterwards, open the Unity Project that contains all the example environments:
2. On the Projects dialog, choose the **Add** option at the top of the window.
3. Using the file dialog that opens, locate the `Project` folder
within the ML-Agents toolkit project and click **Open**.
4. Go to **Edit** > **Project Settings** > **Player**
5. For **each** of the platforms you target (**PC, Mac and Linux Standalone**,
**iOS** or **Android**):
1. Expand the **Other Settings** section.
2. Select **Scripting Runtime Version** to **Experimental (.NET 4.6
Equivalent or .NET 4.x Equivalent)**
6. Go to **File** > **Save Project**
1. On the Projects dialog, choose the **Add** option at the top of the window.
1. Using the file dialog that opens, locate the `Project` folder within the
ML-Agents Toolkit and click **Open**.
1. In the **Project** window, go to the
`Assets/ML-Agents/Examples/3DBall/Scenes` folder and open the `3DBall` scene
file.
_environment_. In the context of Unity, an environment is a scene containing
one or more Agent objects, and, of course, the other
entities that an agent interacts with.
![Unity Editor](images/mlagents-3DBallHierarchy.png)

window. The Inspector shows every component on a GameObject.
The first thing you may notice after opening the 3D Balance Ball scene is that
it contains not one, but several agent cubes. Each agent cube in the scene is an
independent agent, but they all share the same Behavior. 3D Balance Ball does this
to speed up training since all twelve agents contribute to training in parallel.
### Agent

behavior:
* **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
determines how an Agent makes decisions. More on Behavior Parameters in
the next section.
* **Max Step** — Defines how many simulation steps can occur before the Agent's
- **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
determines how an Agent makes decisions.
- **Max Step** — Defines how many simulation steps can occur before the Agent's
When you create an Agent, you must extend the base Agent class.
The Ball3DAgent subclass defines the following methods:
* `Agent.OnEpisodeBegin()` — Called at the beginning of an Agent's episode, including at the beginning
of the simulation. The Ball3DAgent class uses this function to reset the
agent cube and ball to their starting positions. The function randomizes the reset values so that the
training generalizes to more than a specific starting position and agent cube
attitude.
* `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation step. Responsible for
collecting the Agent's observations of the environment. Since the Behavior
Parameters of the Agent are set with vector observation
space with a state size of 8, the `CollectObservations(VectorSensor sensor)` must call
`VectorSensor.AddObservation()` such that vector size adds up to 8.
* `Agent.OnActionReceived()` — Called every time the Agent receives an action to take. Receives the action chosen
by the Agent. The vector action spaces result in a
small change in the agent cube's rotation at each step. The `OnActionReceived()` method
assigns a reward to the Agent; in this example, an Agent receives a small
positive reward for each step it keeps the ball on the agent cube's head and a larger,
negative reward for dropping the ball. An Agent's episode is also ended when it
drops the ball so that it will reset with a new ball for the next simulation
step.
* `Agent.Heuristic()` - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
Parameters of the Agent, the Agent will use the `Heuristic()` method to generate
the actions of the Agent. As such, the `Heuristic()` method returns an array of
floats. In the case of the Ball 3D Agent, the `Heuristic()` method converts the
keyboard inputs into actions.
#### Behavior Parameters : Vector Observation Space
Before making a decision, an agent collects its observation about its state in

The Behavior Parameters of the 3D Balance Ball example uses a **Space Size** of 8.
This means that the feature
vector containing the Agent's observations contains eight elements: the `x` and
`z` components of the agent cube's rotation and the `x`, `y`, and `z` components
of the ball's relative position and velocity. (The observation values are
defined in the Agent's `CollectObservations(VectorSensor sensor)` method.)
The Behavior Parameters of the 3D Balance Ball example uses a `Space Size` of 8.
This means that the feature vector containing the Agent's observations contains
eight elements: the `x` and `z` components of the agent cube's rotation and the
`x`, `y`, and `z` components of the ball's relative position and velocity.
An Agent is given instructions in the form of a float array of *actions*.
ML-Agents toolkit classifies actions into two types: the **Continuous** vector
action space is a vector of numbers that can vary continuously. What each
element of the vector means is defined by the Agent logic (the training
process just learns what values are better given particular state observations
based on the rewards received when it tries different values). For example, an
element might represent a force or torque applied to a `Rigidbody` in the Agent.
The **Discrete** action vector space defines its actions as tables. An action
given to the Agent is an array of indices into tables.
The 3D Balance Ball example is programmed to use continuous action
space with `Space Size` of 2.
An Agent is given instructions in the form of a float array of _actions_.
ML-Agents Toolkit classifies actions into two types: continuous and discrete.
The 3D Balance Ball example is programmed to use continuous action space which
is a vector of numbers that can vary continuously. More specifically, it uses
a `Space Size` of 2 to control the amount of `x` and `z` rotations to apply to
itself to keep the ball balanced on its head. A simplified sketch of such an
Agent is shown below.
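
The sketch below is a simplified, illustrative version of such an agent; it is not the actual `Ball3DAgent.cs` shipped with the examples, the reward values and rotation scaling are assumptions, and the `Heuristic` override uses the `void Heuristic(float[] actionsOut)` signature shown in the Agent.cs diff above:

```csharp
using MLAgents;
using MLAgents.Sensors;
using UnityEngine;

public class BalanceBallSketchAgent : Agent
{
    public GameObject ball;  // assigned in the Inspector (illustrative)
    Rigidbody m_BallRb;

    public override void Initialize()
    {
        m_BallRb = ball.GetComponent<Rigidbody>();
    }

    // 8 observations: 2 rotation components + relative ball position (3) + ball velocity (3).
    public override void CollectObservations(VectorSensor sensor)
    {
        sensor.AddObservation(gameObject.transform.rotation.z);
        sensor.AddObservation(gameObject.transform.rotation.x);
        sensor.AddObservation(ball.transform.position - gameObject.transform.position);
        sensor.AddObservation(m_BallRb.velocity);
    }

    // Continuous action space of size 2: rotate around the z and x axes.
    public override void OnActionReceived(float[] vectorAction)
    {
        var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
        var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);
        transform.Rotate(new Vector3(0f, 0f, 1f), actionZ);
        transform.Rotate(new Vector3(1f, 0f, 0f), actionX);

        SetReward(0.1f);  // small reward for every step the ball stays up
        if (ball.transform.position.y < gameObject.transform.position.y)
        {
            SetReward(-1f);
            EndEpisode();  // dropped the ball: end the episode and reset
        }
    }

    // Keyboard control when Behavior Type is set to Heuristic Only.
    public override void Heuristic(float[] actionsOut)
    {
        actionsOut[0] = -Input.GetAxis("Horizontal");
        actionsOut[1] = Input.GetAxis("Vertical");
    }
}
```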
[Unity Inference Engine](Unity-Inference-Engine.md) to run these models
inside Unity. In this section, we will use the pre-trained model for the
3D Ball example.
1. In the **Project** window, go to the `Assets/ML-Agents/Examples/3DBall/Scenes` folder
and open the `3DBall` scene file.
2. In the **Project** window, go to the `Assets/ML-Agents/Examples/3DBall/Prefabs` folder.
Expand `3DBall` and click on the `Agent` prefab. You should see the `Agent` prefab in the **Inspector** window.
**Note**: The platforms in the `3DBall` scene were created using the `3DBall` prefab. Instead of updating all 12 platforms individually, you can update the `3DBall` prefab instead.
3. In the **Project** window, drag the **3DBall** Model located in
`Assets/ML-Agents/Examples/3DBall/TFModels` into the `Model` property under `Behavior Parameters (Script)` component in the Agent GameObject **Inspector** window.
4. You should notice that each `Agent` under each `3DBall` in the **Hierarchy** windows now contains **3DBall** as `Model` on the `Behavior Parameters`. __Note__ : You can modify multiple game objects in a scene by selecting them all at
once using the search bar in the Scene Hierarchy.
8. Select the **InferenceDevice** to use for this model (CPU or GPU) on the Agent.
_Note: CPU is faster for the majority of ML-Agents toolkit generated models_
9. Click the **Play** button and you will see the platforms balance the balls
using the pre-trained model.
1. Set the **Inference Device** to use for this model as `CPU`.
1. Click the :arrow_forward: button in the Unity Editor and you will see the
platforms balance the balls using the pre-trained model.
While we provide pre-trained `.nn` files for the agents in this environment, any environment you make yourself will require training agents from scratch to generate a new model file. We can do this using reinforcement learning.
In order to train an agent to correctly balance the ball, we provide two
deep reinforcement learning algorithms.
The default algorithm is Proximal Policy Optimization (PPO). This
is a method that has been shown to be more general purpose and stable
than many other RL algorithms. For more information on PPO, OpenAI
has a [blog post](https://blog.openai.com/openai-baselines-ppo/)
explaining it, and [our page](Training-PPO.md) for how to use it in training.
We also provide Soft-Actor Critic, an off-policy algorithm that
has been shown to be both stable and sample-efficient.
For more information on SAC, see UC Berkeley's
[blog post](https://bair.berkeley.edu/blog/2018/12/14/sac/) and
[our page](Training-SAC.md) for more guidance on when to use SAC vs. PPO. To
use SAC to train Balance Ball, replace all references to `config/trainer_config.yaml`
with `config/sac_trainer_config.yaml` below.
To train the agents within the Balance Ball environment, we will be using the
ML-Agents Python package. We have provided a convenient command called `mlagents-learn`
which accepts arguments used to configure both training and inference phases.
While we provide pre-trained `.nn` files for the agents in this environment, any
environment you make yourself will require training agents from scratch to
generate a new model file. In this section we will demonstrate how to use the
reinforcement learning algorithms that are part of the ML-Agents Python package
to accomplish this. We have provided a convenient command `mlagents-learn` which
accepts arguments used to configure both training and inference phases.
2. Navigate to the folder where you cloned the ML-Agents toolkit repository.
**Note**: If you followed the default [installation](Installation.md), then
you should be able to run `mlagents-learn` from any directory.
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier>`
where:
- `<trainer-config-path>` is the relative or absolute filepath of the
trainer configuration. The defaults used by example environments included
in `MLAgentsSDK` can be found in `config/trainer_config.yaml`.
- `<run-identifier>` is a string used to separate the results of different
training runs. Make sure to use one that hasn't been used already!
4. If you cloned the ML-Agents repo, then you can simply run
```sh
mlagents-learn config/trainer_config.yaml --run-id=firstRun
```
5. When the message _"Start training by pressing the Play button in the Unity
1. Navigate to the folder where you cloned the `ml-agents` repository. **Note**:
If you followed the default [installation](Installation.md), then you should
be able to run `mlagents-learn` from any directory.
1. Run `mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun`.
- `config/trainer_config.yaml` is the path to a default training
configuration file that we provide. It includes training configurations for
all our example environments, including 3DBall.
- `run-id` is a unique name for this training session.
1. When the message _"Start training by pressing the Play button in the Unity
**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.
The `--time-scale=100` sets the `Time.TimeScale` value in Unity.
**Note**: You can train using an executable rather than the Editor. To do so,
follow the instructions in
[Using an Executable](Learning-Environment-Executable.md).
**Note**: Re-running this command will start training from scratch again. To resume
a previous training run, append the `--load` flag and give the same `--run-id` as the
run you want to resume.
If `mlagents-learn` runs correctly and starts training, you should see something
like this:

sequence_length: 64
summary_freq: 1000
use_recurrent: False
summary_path: ./summaries/first3DBallRun
model_path: ./models/first3DBallRun/3DBallLearning
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 4000. Mean Reward: 2.151. Std of Reward: 1.432. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 5000. Mean Reward: 3.175. Std of Reward: 2.250. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 6000. Mean Reward: 4.898. Std of Reward: 4.019. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 7000. Mean Reward: 6.716. Std of Reward: 5.125. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 8000. Mean Reward: 12.124. Std of Reward: 11.929. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 9000. Mean Reward: 18.151. Std of Reward: 16.871. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
Note how the `Mean Reward` value printed to the screen increases as training
progresses. This is a positive sign that training is succeeding.
### Observing Training Progress

```sh
tensorboard --logdir=summaries
```
Then navigate to `localhost:6006` in your browser.
From TensorBoard, you will see the summary statistics:
* **Lesson** - only interesting when performing [curriculum
training](Training-Curriculum-Learning.md). This is not used in the 3D Balance
Ball environment.
* **Cumulative Reward** - The mean cumulative episode reward over all agents. Should
increase during a successful training session.
* **Entropy** - How random the decisions of the model are. Should slowly decrease
during a successful training process. If it decreases too quickly, the `beta`
hyperparameter should be increased.
* **Episode Length** - The mean length of each episode in the environment for all
agents.
* **Learning Rate** - How large a step the training algorithm takes as it searches
for the optimal policy. Should decrease over time.
* **Policy Loss** - The mean loss of the policy function update. Correlates to how
much the policy (process for deciding actions) is changing. The magnitude of
this should decrease during a successful training session.
* **Value Estimate** - The mean value estimate for all states visited by the agent.
Should increase during a successful training session.
* **Value Loss** - The mean loss of the value function update. Correlates to how
well the model is able to predict the value of each state. This should
decrease during a successful training session.
For the purposes of this section, the most important statistic is
`Environment/Cumulative Reward`, which should increase throughout training,
eventually converging close to `100`, the maximum reward the agent can
accumulate, as shown in the example run below.
![Example TensorBoard Run](images/mlagents-TensorBoard.png)

(denoted by the `Saved Model` message) you can add it to the Unity project and
use it with compatible Agents (the Agents that generated the model). **Note:**
Do not just close the Unity Window once the `Saved Model` message appears.
Either wait for the training process to close the window or press Ctrl+C at the
command-line prompt. If you close the window manually, the `.nn` file containing
the trained model is not exported into the ml-agents folder.

If you've quit the training early using Ctrl+C and want to resume training, run
the same command again, appending the `--resume` flag:

```sh
mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun --resume
```
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding
to the model. This file corresponds to your model's latest checkpoint. (**Note:**
There is a known bug on Windows that can cause saving of the model to fail when
you terminate training early; it is recommended to wait until Step has reached
the `max_steps` parameter you set in `trainer_config.yaml`.) You can now embed
this trained model into your Agents by following the steps below, which are
similar to the steps described [above](#running-a-pre-trained-model).
1. Open the Unity Editor, and select the **3DBall** scene as described above.
1. Select the **3DBall** prefab Agent object.
1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
   the **Model** placeholder in the **Ball3DAgent** inspector window.
1. Press the :arrow_forward: button at the top of the Editor.
- For more information on the ML-Agents Toolkit, in addition to helpful
check out the
[Making a New Learning Environment](Learning-Environment-Create-New.md) page.
- For an overview on the more complex example environments that are provided in
this toolkit, check out the
[Example Environments](Learning-Environment-Examples.md) page.
- For more information on the various training options available, check out the
[Training ML-Agents](Training-ML-Agents.md) page.

120
docs/Installation-Anaconda-Windows.md


# Installing ML-Agents Toolkit for Windows (Deprecated)
:warning: **Note:** We no longer use this guide ourselves and so it may not work
correctly. We've decided to keep it up just in case it is helpful to you.
The ML-Agents toolkit supports Windows 10. While it might be possible to run the
ML-Agents toolkit using other versions of Windows, it has not been tested on

[Download](https://www.anaconda.com/download/#windows) and install Anaconda for
Windows. By using Anaconda, you can manage separate environments for different
distributions of Python. Python 3.6.1 or higher is required as we no longer
support Python 2. In this guide, we are using Python version 3.6 and Anaconda
version 5.1
([64-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86_64.exe)
or [32-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86.exe)
direct links).

<img src="images/anaconda_default.PNG" alt="Anaconda Install" width="500" border="10" />
</p>
After installation, you must open **Anaconda Navigator** to finish the setup.
From the Windows search bar, type _anaconda navigator_. You can close Anaconda
Navigator after it opens.

Type `environment variables` in the search bar (this can be reached by hitting
the Windows key or the bottom left Windows button). You should see an option
called **Edit the system environment variables**.
<p align="center">
<img src="images/edit_env_var.png"

From here, click the **Environment Variables** button. Double click "Path" under
**System variable** to edit the "Path" variable, click **New** to add the
following new paths.
```console

install these Python dependencies.
If you haven't already, clone the ML-Agents Toolkit Github repository to your
local computer. You can do this using Git
([download here](https://git-scm.com/download/win)) and running the following
commands in an Anaconda Prompt _(if you open a new prompt, be sure to activate
the ml-agents Conda environment by typing `activate ml-agents`)_:
The `--branch latest_release` option will switch to the tag of the latest stable
release. Omitting that will get the `master` branch which is potentially
unstable.

The `com.unity.ml-agents` subdirectory contains the core code to add to your
projects. The `Project` subdirectory contains many
[example environments](Learning-Environment-Examples.md) to help you get
started.

The `ml-agents` subdirectory contains a Python package which provides deep
reinforcement learning trainers to use with Unity environments.

The `ml-agents-envs` subdirectory contains a Python API to interface with Unity,
which the `ml-agents` package depends on.

Keep in mind where the files were downloaded, as you will need the trainer
config files in this directory when running `mlagents-learn`. Make sure you are
connected to the Internet and then type in the Anaconda Prompt:
```console
pip install mlagents

the ML-Agents toolkit.
Sometimes on Windows, when you use pip to install certain Python packages, pip
will get stuck when trying to read the cache of the package. If you see this,
you can try:
```console
pip install mlagents --no-cache-dir

### Installing for Development
If you intend to make modifications to `ml-agents` or `ml-agents-envs`, you
should install the packages from the cloned repo rather than from PyPi. To do
this, you will need to install `ml-agents` and `ml-agents-envs` separately.
cloned or downloaded the files, from the Anaconda Prompt, change to the
ml-agents subdirectory inside the ml-agents directory:
```console
cd C:\Downloads\ml-agents

pip install -e .
```
Running pip with the `-e` flag will let you make changes to the Python files
directly and have those reflected when you run `mlagents-learn`. It is important
to install these packages in this order as the `mlagents` package depends on
`mlagents_envs`, and installing it in the other order will download
`mlagents_envs` from PyPi.
## (Optional) Step 4: GPU Training using The ML-Agents Toolkit

Additionally, you will need to check if your GPU is CUDA compatible. Please
check Nvidia's page [here](https://developer.nvidia.com/cuda-gpus).
Currently for the ML-Agents toolkit, only CUDA v9.0 and cuDNN v7.0.5 are
supported.
### Install Nvidia CUDA toolkit

this guide, we are using version
[9.0.176](https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe)).
Before installing, please make sure you **close any running instances of Unity
or Visual Studio**.
Run the installer and select the Express option. Note the directory where you
installed the CUDA toolkit. In this guide, we installed in the directory

</p>
Once you've signed up, go back to the cuDNN
[downloads page](https://developer.nvidia.com/cudnn). You may or may not be
asked to fill out a short survey. When you get to the list of cuDNN releases,
**make sure you are downloading the right version for the CUDA toolkit you
installed in Step 1.** In this guide, we are using version 7.0.5 for CUDA
toolkit version 9.0
([direct link](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.0.5/prod/9.0_20171129/cudnn-9.0-windows10-x64-v7)).
After you have downloaded the cuDNN files, you will need to extract the files

To set the environment variable, type `environment variables` in the search bar
(this can be reached by hitting the Windows key or the bottom left Windows
button). You should see an option called **Edit the system environment
variables**.
<p align="center">
<img src="images/edit_env_var.png"

From here, click the **Environment Variables** button. Click **New** to add a
new system variable _(make sure you do this under **System variables** and not
User variables)_.
<p align="center">

</p>
For **Variable Name**, enter `CUDA_HOME`. For the variable value, put the
directory location for the CUDA toolkit. In this guide, the directory location
is `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`. Press **OK** once.
<p align="center">
<img src="images/system_variable_name_value.PNG"

To set the two path variables, inside the same **Environment Variables** window
and under the second box called **System Variables**, find a variable called
`Path` and click **Edit**. You will add two directories to the list. For this
guide, the two entries would look like:
```console

Next, install `tensorflow-gpu` using `pip`. You'll need version 1.7.1. In an
Anaconda Prompt with the Conda environment ml-agents activated, type in the
following command to uninstall TensorFlow for cpu and install TensorFlow for gpu
_(make sure you are connected to the Internet)_:
```sh
pip uninstall tensorflow

Lastly, you should test to see if everything installed properly and that
TensorFlow can identify your GPU. In the same Anaconda Prompt, open Python in
the Prompt by calling:
```sh
python

8
docs/Learning-Environment-Create-New.md


The `Heuristic()` method will look like this:

```csharp
public override void Heuristic(float[] actionsOut)
{
    actionsOut[0] = Input.GetAxis("Horizontal");
    actionsOut[1] = Input.GetAxis("Vertical");
}
```

509
docs/Learning-Environment-Design-Agents.md


# Agents
An agent is an entity that can observe its environment, decide on the best
course of action using those observations, and execute those actions within its
environment. Agents can be created in Unity by extending the `Agent` class. The
most important aspects of creating agents that can successfully learn are the
observations the agent collects, and the reward you assign to estimate the value
of the agent's current state toward accomplishing its tasks.
An Agent passes its observations to its Policy. The Policy then makes a decision
and passes the chosen action back to the agent. Your agent code must execute the

discover the optimal decision-making policy.
The `Policy` class abstracts out the decision making logic from the Agent itself
so that you can use the same Policy in multiple Agents. How a Policy makes its
write your own Policy. If the Agent has a `Model` file, its Policy will use the
neural network `Model` to take decisions.
When you create an Agent, you must extend the base Agent class. This includes
implementing the following methods (a minimal sketch combining these overrides
follows the list):
- `Agent.OnEpisodeBegin()` — Called at the beginning of an Agent's episode,
including at the beginning of the simulation. The Ball3DAgent class uses this
function to reset the agent cube and ball to their starting positions. The
function randomizes the reset values so that the training generalizes to more
than a specific starting position and agent cube attitude.
- `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation
step. Responsible for collecting the Agent's observations of the environment.
Since the Behavior Parameters of the Agent are set with vector observation
space with a state size of 8, the `CollectObservations(VectorSensor sensor)`
must call `VectorSensor.AddObservation()` such that vector size adds up to 8.
- `Agent.OnActionReceived()` — Called every time the Agent receives an action to
take. Receives the action chosen by the Agent. The vector action spaces result
in a small change in the agent cube's rotation at each step. The
`OnActionReceived()` method assigns a reward to the Agent; in this example, an
Agent receives a small positive reward for each step it keeps the ball on the
agent cube's head and a larger, negative reward for dropping the ball. An
Agent's episode is also ended when it drops the ball so that it will reset
with a new ball for the next simulation step.
- `Agent.Heuristic()` - When the `Behavior Type` is set to `Heuristic Only` in
  the Behavior Parameters of the Agent, the Agent will use the `Heuristic()`
  method to generate the actions of the Agent. As such, the `Heuristic()` method
  writes the actions it generates into the float array passed to it. In the case
  of the Ball 3D Agent, the `Heuristic()` method converts the keyboard inputs
  into actions.
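Putting these together, here is a minimal hedged sketch of an Agent loosely
modeled on (but not identical to) the Ball3DAgent; the `ball` field, the reset
values, and the namespace spelling are assumptions that may differ from the
shipped example:

```csharp
using UnityEngine;
using MLAgents;          // in newer package versions this namespace is Unity.MLAgents
using MLAgents.Sensors;  // ...and Unity.MLAgents.Sensors

public class MinimalBalanceAgent : Agent
{
    public Transform ball;   // assumed reference assigned in the Inspector
    Rigidbody m_BallRb;

    void Start()
    {
        m_BallRb = ball.GetComponent<Rigidbody>();
    }

    public override void OnEpisodeBegin()
    {
        // Randomize the starting attitude so training generalizes.
        transform.rotation = Quaternion.Euler(Random.Range(-10f, 10f), 0f, Random.Range(-10f, 10f));
        ball.position = transform.position + Vector3.up * 1.5f;
        m_BallRb.velocity = Vector3.zero;
    }

    public override void CollectObservations(VectorSensor sensor)
    {
        // 1 + 1 + 3 + 3 = 8 floats, matching a vector observation Space Size of 8.
        sensor.AddObservation(transform.rotation.z);
        sensor.AddObservation(transform.rotation.x);
        sensor.AddObservation(ball.position - transform.position);
        sensor.AddObservation(m_BallRb.velocity);
    }

    public override void OnActionReceived(float[] vectorAction)
    {
        // Small rotation change per step, followed by a simple reward scheme.
        transform.Rotate(Vector3.forward, 2f * Mathf.Clamp(vectorAction[0], -1f, 1f));
        transform.Rotate(Vector3.right, 2f * Mathf.Clamp(vectorAction[1], -1f, 1f));

        if (ball.position.y < transform.position.y)
        {
            SetReward(-1f);   // larger negative reward for dropping the ball
            EndEpisode();     // reset with a new ball for the next step
        }
        else
        {
            SetReward(0.1f);  // small positive reward for keeping the ball up
        }
    }

    public override void Heuristic(float[] actionsOut)
    {
        actionsOut[0] = -Input.GetAxis("Horizontal");
        actionsOut[1] = Input.GetAxis("Vertical");
    }
}
```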
a decision. Agents will request a decision when `Agent.RequestDecision()` is
called. If you need the Agent to request decisions on its own at regular
intervals, add a `Decision Requester` component to the Agent's GameObject.
Making decisions at regular step intervals is generally most appropriate for
physics-based simulations. For example, an agent in a robotic simulator that
must provide fine-control of joint torques should make its decisions every step
of the simulation. On the other hand, an agent that only needs to make decisions
when certain game or simulation events occur, such as in a turn-based game,
should call `Agent.RequestDecision()` manually.
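For the event-driven case, a hedged sketch (the `OnTurnStarted()` hook is
hypothetical and would be called by your own game logic):

```csharp
public class TurnBasedAgent : Agent
{
    // Called by the game's turn manager when it is this agent's turn.
    public void OnTurnStarted()
    {
        RequestDecision();
    }
}
```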
To make informed decisions, an agent must first make observations of the state
of the environment. The observations are collected by Sensors attached to the
agent GameObject. By default, agents come with a `VectorSensor` which allows
them to collect floating-point observations into a single array. There are
additional sensor components which can be attached to the agent GameObject which
collect their own observations, or modify other observations. These are:
- `CameraSensorComponent` - Allows image from `Camera` to be used as
  observation.
- `RenderTextureSensorComponent` - Allows content of `RenderTexture` to be used
  as observation.
- `RayPerceptionSensorComponent` - Allows information from set of ray-casts to
  be used as observation.
Vector observations are best used for aspects of the environment which are
numerical and non-visual. The Policy class calls the
`CollectObservations(VectorSensor sensor)` method of each Agent. Your
implementation of this function must call `VectorSensor.AddObservation` to add
vector observations.
information an agent needs to accomplish its task. Without sufficient and
relevant information, an agent may learn poorly or may not learn at all. A
reasonable approach for determining what information should be included is to
consider what you would need to calculate an analytical solution to the problem,
or what you would expect a human to be able to use to solve the problem.
ML-Agents SDK. For instance, the 3DBall example uses the rotation of the
platform, the relative position of the ball, and the velocity of the ball as its
state observation. As an experiment, you can remove the velocity components from
the observation and retrain the 3DBall agent. While it will learn to balance the

an agent's observations to a fixed subset. For example, instead of observing
every enemy agent in an environment, you could only observe the closest five.
When you set up an Agent's `Behavior Parameters` in the Unity Editor, set the
following properties to use a vector observation:
- **Space Size** — The state size must match the length of your feature vector.
The `VectorSensor.AddObservation` method provides a number of overloads for
adding common types of data to your observation vector. You can add Integers and
booleans directly to the observation vector, as well as some common Unity data
types such as `Vector2`, `Vector3`, and `Quaternion`.
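For example, a minimal hedged sketch using several of these overloads (the
fields observed here, `target`, `m_Rigidbody`, and `hasKey`, are hypothetical):

```csharp
public override void CollectObservations(VectorSensor sensor)
{
    sensor.AddObservation(transform.localPosition);   // Vector3 adds 3 floats
    sensor.AddObservation(target.localPosition);      // assumed Transform field
    sensor.AddObservation(m_Rigidbody.velocity.x);    // single float
    sensor.AddObservation(hasKey);                    // bool adds 1 float (0 or 1)
}
```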
#### One-hot encoding categorical information

}
```
`VectorSensor` also provides a two-argument function `AddOneHotObservation()` as
a shortcut for _one-hot_ style observations. The following example is identical
to the previous one.
```csharp
enum CarriedItems { Sword, Shield, Bow, LastItem }
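const int k_NumItemTypes = (int)CarriedItems.LastItem;

// A sketch of the shortcut described above; `currentItem` is an assumed field
// of type CarriedItems on this Agent.
public override void CollectObservations(VectorSensor sensor)
{
    // First argument: the observed index; second: the number of possible values.
    sensor.AddOneHotObservation((int)currentItem, k_NumItemTypes);
}
```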

```csharp
normalizedValue = (currentValue - minValue)/(maxValue - minValue)
```
:warning: For vectors, you should apply the above formula to each component (x,
y, and z). Note that this is _not_ the same as using the `Vector3.normalized`
property or `Vector3.Normalize()` method in Unity (and similar for `Vector2`).
Rotations and angles should also be normalized. For angles between 0 and 360
degrees, you can use the following formulas:
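As a hedged sketch with illustrative variable names, the min-max formula above
and a simple mapping of an angle to [-1, 1] look like this in C#:

```csharp
// Min-max normalization to [0, 1].
float normalizedValue = (currentValue - minValue) / (maxValue - minValue);

// An angle in [0, 360] degrees mapped to [-1, 1].
float normalizedAngle = (angleDegrees / 180f) - 1f;
```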

#### Vector Observation Summary & Best Practices
- Vector Observations should include all variables relevant for allowing the
  agent to take the optimally informed decision, and ideally no extraneous
  information.
- In cases where Vector Observations need to be remembered or compared over
  time, either an LSTM (see [here](Feature-Memory.md)) should be used in the
  model, or the `Stacked Vectors` value in the agent GameObject's
  `Behavior Parameters` should be changed.
- Categorical variables such as type of object (Sword, Shield, Bow) should be
  encoded in one-hot fashion (i.e. `3` -> `0, 0, 1`). This can be done
  automatically using the `AddOneHotObservation()` method of the `VectorSensor`.
- In general, all inputs should be normalized to be in the range 0 to +1 (or -1
  to 1). For example, the `x` position information of an agent where the maximum
  possible value is `maxValue` should be recorded as
- Positional information of relevant GameObjects should be encoded in relative
Visual observations are generally provided to the agent via either a
`CameraSensor` or `RenderTextureSensor`. These collect image information and
transform it into a 3D Tensor which can be fed into the convolutional neural
network (CNN) of the agent policy. For more information on CNNs, see
[this guide](http://cs231n.github.io/convolutional-networks/). This allows
agents to learn from spatial regularities in the observation images. It is
possible to use visual and vector observations with the same agent.
used when it is not possible to properly define the problem using vector or
ray-cast observations.
Visual observations can be derived from Cameras or RenderTextures within your
scene. To add a visual observation to an Agent, add either a Camera Sensor
Component or RenderTextures Sensor Component to the Agent. Then drag the camera
or render texture you want to add to the `Camera` or `RenderTexture` field. You
can have more than one camera or render texture and even use a combination of
both attached to an Agent. For each visual observation, set the width and height
of the image (in pixels) and whether or not the observation is color or
grayscale.
![Agent Camera](images/visual-observation.png)

Each Agent that uses the same Policy must have the same number of visual
observations, and they must all have the same resolutions (including whether or
not they are grayscale). Additionally, each Sensor Component on an Agent must
have a unique name so that they can be sorted deterministically (the name must
be unique for that Agent, but multiple Agents can have a Sensor Component with
the same name).
adding a `Canvas`, then adding a `Raw Image` with its texture set to the
Agent's `RenderTexture`. This will render the agent observation on the game
screen.
The [GridWorld environment](Learning-Environment-Examples.md#gridworld) is an
example on how to use a RenderTexture for both debugging and observation. Note
that in this example, a Camera is rendered to a RenderTexture, which is then
used for observations and debugging. To update the RenderTexture, the Camera
must be asked to render every time a decision is requested within the game code.
When using Cameras as observations directly, this is done automatically by the
Agent.
- To collect visual observations, attach `CameraSensor` or `RenderTextureSensor`
- Visual observations should generally only be used when vector observations are
  not sufficient.
- Image size should be kept as small as possible, without the loss of needed
  details for decision making.
- Images should be made greyscale in situations where color information is not
  needed for making informed decisions.
This can be easily implemented by adding a `RayPerceptionSensorComponent3D` (or
`RayPerceptionSensorComponent2D`) to the Agent GameObject.

During observations, several rays (or spheres, depending on settings) are cast
into the physics world, and the objects that are hit determine the observation
vector that is produced.
- _Detectable Tags_ A list of strings corresponding to the types of objects that
  the Agent should be able to distinguish between. For example, in the WallJump
  example, we use "wall", "goal", and "block" as the list of objects to detect.
- _Rays Per Direction_ Determines the number of rays that are cast. One ray is
- _Max Ray Degrees_ The angle (in degrees) for the outermost rays. 90 degrees
- _Sphere Cast Radius_ The size of the sphere used for sphere casting. If set to
  0, rays will be used instead of spheres. Rays may be more efficient,
- _Ray Length_ The length of the casts.
- _Observation Stacks_ The number of previous results to "stack" with the cast
  results. Note that this can be independent of the "Stacked Vectors" setting in
  `Behavior Parameters`.
- _Start Vertical Offset_ (3D only) The vertical offset of the ray start point.
- _End Vertical Offset_ (3D only) The vertical offset of the ray end point.
Both use 3 Rays Per Direction and 90 Max Ray Degrees. One of the components had
a vertical offset, so the Agent can tell whether it's clear to jump over the
wall.
so the number of rays and tags should be kept as small as possible to reduce the
amount of data used. Note that this is separate from the State Size defined in
`Behavior Parameters`, so you don't need to worry about the formula above when

- Attach `RayPerceptionSensorComponent3D` or `RayPerceptionSensorComponent2D` to
  use.
- This observation type is best used when there is relevant spatial information
- Use as few rays and tags as necessary to solve the problem in order to improve
  learning stability and agent performance.
agent's `OnActionReceived()` function. Actions for an agent can take one of two
forms, either **Continuous** or **Discrete**.
When you specify that the vector action space is **Continuous**, the action
parameter passed to the Agent is an array of floating point numbers with length
equal to the `Vector Action Space Size` property. When you specify a
**Discrete** vector action space type, the action parameter is an array
containing integers. Each integer is an index into a list or table of commands.
The number of indices in the array is determined by the number of branches
defined in the `Branches Size` property. Each branch corresponds to an action
table; you can specify the size of each table by modifying the `Branches`
property.
Neither the Policy nor the training algorithm knows anything about what the
action values themselves mean. The training algorithm simply tries different
values for the action list and observes the effect on the accumulated rewards
over time and many training episodes. Thus, the only place actions are defined
for an Agent is in the `OnActionReceived()` function.
For example, if you designed an agent to move in two dimensions, you could use
either continuous or discrete vector actions. In the continuous case, you

with values ranging from zero to one.
Note that when you are programming actions for an agent, it is often helpful to
test your action logic using the `Heuristic()` method of the Agent, which lets
you map keyboard commands to actions.
The [3DBall](Learning-Environment-Examples.md#3dball-3d-balance-ball) and
[Area](Learning-Environment-Examples.md#push-block) example environments are set

When an Agent uses a Policy set to the **Continuous** vector action space, the
action parameter passed to the Agent's `OnActionReceived()` function is an array
with length equal to the `Vector Action Space Size` property value. The
individual values in the array have whatever meanings that you ascribe to them.
If you assign an element in the array as the speed of an Agent, for example, the
training process learns to control the speed of the Agent through this
parameter.
The [Reacher example](Learning-Environment-Examples.md#reacher) defines a
continuous action space with four control values.
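As a hedged sketch of handling a continuous action array (the `m_Rigidbody` and
`moveSpeed` fields are assumptions, and values are clamped defensively):

```csharp
public override void OnActionReceived(float[] vectorAction)
{
    // Two continuous controls: force along X and Z.
    var forceX = Mathf.Clamp(vectorAction[0], -1f, 1f);
    var forceZ = Mathf.Clamp(vectorAction[1], -1f, 1f);
    m_Rigidbody.AddForce(new Vector3(forceX, 0f, forceZ) * moveSpeed);
}
```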

### Discrete Action Space
When an Agent uses a **Discrete** vector action space, the action parameter
passed to the Agent's `OnActionReceived()` function is an array containing
indices. With the discrete vector action space, `Branches` is an array of
integers; each value corresponds to the number of possibilities for each branch.
agent be able to move **and** jump concurrently. We define the first branch to
have 5 possible actions (don't move, go left, go right, go backward, go forward)
and the second one to have 2 possible actions (don't jump, jump). The
`OnActionReceived()` method would look something like:
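A minimal hedged sketch of such a method, assuming branch 0 controls movement,
branch 1 controls jumping, and that `moveSpeed` and `Jump()` exist on the agent:

```csharp
public override void OnActionReceived(float[] act)
{
    int movement = Mathf.FloorToInt(act[0]); // branch 0: 5 options
    int jump = Mathf.FloorToInt(act[1]);     // branch 1: 2 options

    var direction = Vector3.zero;
    if (movement == 1) direction = Vector3.left;
    if (movement == 2) direction = Vector3.right;
    if (movement == 3) direction = Vector3.back;
    if (movement == 4) direction = Vector3.forward;

    transform.Translate(direction * moveSpeed * Time.deltaTime);
    if (jump == 1) Jump();
}
```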

#### Masking Discrete Actions
When using Discrete Actions, it is possible to specify that some actions are
impossible for the next decision. When the Agent is controlled by a neural
network, the Agent will be unable to perform the specified action. Note that
when the Agent is controlled by its Heuristic, the Agent will still be able to
decide to perform the masked action. In order to mask an action, override the
`Agent.CollectDiscreteActionMasks()` virtual method, and call
`DiscreteActionMasker.SetMask()` in it:
```csharp
public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker){
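    // A sketch of what the body might contain (the branch and indices here are
    // illustrative): prevent action index 2 on branch 0 for the next decision.
    actionMasker.SetMask(0, new int[] { 2 });
}
```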

Where:
- `branch` is the index (starting at 0) of the branch on which you want to mask
  the action
- `actionIndices` is a list of `int` corresponding to the indices of the actions
  that the Agent cannot perform.
For example, if you have an Agent with 2 branches and on the first branch
(branch 0) there are 4 possible actions: _"do nothing"_, _"jump"_, _"shoot"_

Notes:
- You can call `SetMask` multiple times if you want to put masks on multiple
  branches.
- You cannot mask all the actions of a branch.
- You cannot mask actions in continuous control.
### Actions Summary & Best Practices
- Actions can either use `Discrete` or `Continuous` spaces.
- When using `Discrete` it is possible to assign multiple action branches, and
  to mask certain actions.
- In general, smaller action spaces will make for easier learning.
- Be sure to set the Vector Action's Space Size to the number of used Vector
- When using continuous control, action values should be clipped to an
## Rewards

reward over time. The better your reward mechanism, the better your agent will
learn.
**Note:** Rewards are not used during inference by an Agent using a trained
model and are also not used during imitation learning.
the desired results. You can even use the Agent's Heuristic to control the Agent
while watching how it accumulates rewards.
Allocate rewards to an Agent by calling the `AddReward()` or `SetReward()`
methods on the agent. The reward assigned between each decision should be in the
range [-1,1]. Values outside this range can lead to unstable training. The
`reward` value is reset to zero when the agent receives a new decision. If there
are multiple calls to `AddReward()` for a single agent decision, the rewards
will be summed together to evaluate how good the previous decision was. The
`SetReward()` will override all previous rewards given to an agent since the
previous decision.
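As a hedged illustration of the difference between the two calls (the
`ReachedGoal()` check is an assumed helper):

```csharp
public override void OnActionReceived(float[] vectorAction)
{
    // Accumulates with any other rewards assigned this decision.
    AddReward(-0.005f);

    if (ReachedGoal())
    {
        // Overrides everything accumulated since the previous decision.
        SetReward(1.0f);
        EndEpisode();
    }
}
```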
You can examine the `OnActionReceived()` functions defined in the
[example environments](Learning-Environment-Examples.md) to see how those
projects allocate rewards.
The `GridAgent` class in the
[GridWorld example](Learning-Environment-Examples.md#gridworld) uses a very
simple reward system:
```csharp
Collider[] hitObjects = Physics.OverlapBox(trueAgent.transform.position,

example of a _sparse_ reward system. The agent must explore a lot to find the
infrequent reward.
In contrast, the `AreaAgent` in the
[Area example](Learning-Environment-Examples.md#push-block) gets a small
negative reward every step. In order to get the maximum reward, the agent must
finish its task of reaching the goal square as quickly as possible:
```csharp
AddReward( -0.005f);

The `Ball3DAgent` also assigns a negative penalty when the ball falls off the
platform.
Note that all of these environments make use of the `EndEpisode()` method, which
manually terminates an episode when a termination condition is reached. This can
be called independently of the `Max Step` property.
- Use `AddReward()` to accumulate rewards between decisions. Use `SetReward()`
- The magnitude of any given reward should typically not be greater than 1.0 in
- Positive rewards are often more helpful to shaping the desired behavior of an
  agent than negative rewards. Excessive negative rewards can result in the
  agent failing to learn any meaningful behavior.
- For locomotion tasks, a small positive reward (+0.1) for forward velocity is
- If you want the agent to finish a task quickly, it is often helpful to provide
  a small penalty every step the agent does not complete the task. In this case,
  completion of the task should also coincide with the end of the episode by
  calling `EndEpisode()` on the agent when it has accomplished its goal.
- `Behavior Parameters` - The parameters dictating what Policy the Agent will
  receive.
  - `Behavior Name` - The identifier for the behavior. Agents with the same
    behavior name will learn the same policy. If you're using
    [curriculum learning](Training-Curriculum-Learning.md), this is used as the
    top-level key in the config.
  - `Vector Observation`
    - `Space Size` - Length of vector observation for the Agent.
    - `Stacked Vectors` - The number of previous vector observations that will
  - `Vector Action`
    - `Space Type` - Corresponds to whether action vector contains a single
    - `Space Size` (Continuous) - Length of action vector.
    - `Branches` (Discrete) - An array of integers, defines multiple concurrent
  - `Model` - The neural network model used for inference (obtained after
    training)
  - `Inference Device` - Whether to use CPU or GPU to run the model during
    inference
  - `Behavior Type` - Determines whether the Agent will do training, inference,
    or use its `Heuristic()` method:
    - `Default` - the Agent will train if they connect to a python trainer,
      otherwise they will perform inference.
    - `Heuristic Only` - the Agent will always use the `Heuristic()` method.
    - `Inference Only` - the Agent will always perform inference.
  - `Team ID` - Used to define the team for [self-play](Training-Self-Play.md)
  - `Use Child Sensors` - Whether to use all Sensor components attached to child
    GameObjects of this Agent.
- `Max Step` - The per-agent maximum number of steps. Once this number is
reached, the Agent will be reset.
## Monitoring Agents

## Destroying an Agent
You can destroy an Agent GameObject during the simulation. Make sure that there
is always at least one Agent training at all times by either spawning a new
Agent every time one is destroyed or by re-spawning new Agents when the whole
environment resets.
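A hedged sketch of the respawn pattern (the `agentPrefab` field and the spawn
position are assumptions):

```csharp
public GameObject agentPrefab;

void RemoveAndRespawn(Agent agent, Vector3 spawnPosition)
{
    Destroy(agent.gameObject);
    // Keep at least one Agent training by immediately spawning a replacement.
    Instantiate(agentPrefab, spawnPosition, Quaternion.identity);
}
```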

670
docs/Learning-Environment-Examples.md


# Example Learning Environments
The Unity ML-Agents Toolkit includes an expanding set of example environments
that highlight the various features of the toolkit. These environments can also
serve as templates for new environments or as ways to test new ML algorithms.
Environments are located in `Project/Assets/ML-Agents/Examples` and summarized
below. Additionally, our
This page only overviews the example environments we provide. To learn more on
how to design and build your own environments see our [Making a New Learning
Environment](Learning-Environment-Create-New.md) page.
Note: Environment scenes marked as _optional_ do not have accompanying
pre-trained model files, and are designed to serve as challenges for
researchers.
For the environments that highlight specific features of the toolkit, we provide
the pre-trained model files and the training config file that enables you to
train the scene yourself. The environments that are designed to serve as
challenges for researchers do not have accompanying pre-trained model files or
training configs and are marked as _Optional_ below.
This page only overviews the example environments we provide. To learn more
about how to design and build your own environments, see our
[Making a New Learning Environment](Learning-Environment-Create-New.md) page. If
you would like to contribute environments, please see our
[contribution guidelines](../com.unity.ml-agents/CONTRIBUTING.md) page.
## Basic

- Set-up: A linear movement task where the agent must move left or right to
- Goal: Move to the most rewarding state.
- Agents: The environment contains one agent.
- Agent Reward Function (see the sketch after this list):
- -0.01 at each step
- +0.1 for arriving at suboptimal state.
- +1.0 for arriving at optimal state.
- Behavior Parameters:
- Vector Observation space: One variable corresponding to current state.
- Vector Action space: (Discrete) Two possible actions (Move left, move
- Visual Observations: None
- Float Properties: None
- Benchmark Mean Reward: 0.93
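The reward function above is typically assigned with `Agent.AddReward()`, with the episode closed by `EndEpisode()`. The sketch below is illustrative only; the goal checks are placeholders, not the actual BasicAgent code.

```csharp
using MLAgents;        // may be Unity.MLAgents in newer package versions

// Hypothetical agent showing how a per-step penalty plus terminal rewards
// like the ones listed above can be assigned.
public class TinyRewardAgent : Agent
{
    public override void OnActionReceived(float[] vectorAction)
    {
        AddReward(-0.01f);            // small penalty at each step

        if (AtSuboptimalState())      // placeholder check
        {
            AddReward(0.1f);
            EndEpisode();
        }
        else if (AtOptimalState())    // placeholder check
        {
            AddReward(1.0f);
            EndEpisode();
        }
    }

    bool AtSuboptimalState() { return false; }
    bool AtOptimalState() { return false; }
}
```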
- Set-up: A balance-ball task, where the agent balances the ball on its head.
- Goal: The agent must balance the ball on its head for as long as possible.
- Agents: The environment contains 12 agents of the same kind, all using the
- Agent Reward Function:
- +0.1 for every step the ball remains on its head.
- -1.0 if the ball falls off.
- Behavior Parameters:
- Vector Observation space: 8 variables corresponding to rotation of the agent
cube, and position and velocity of ball.
- Vector Observation space (Hard Version): 5 variables corresponding to
- Vector Action space: (Continuous) Size of 2, with one value corresponding to
- Visual Observations: None.
- Float Properties: Three
- scale: Specifies the scale of the ball in the 3 dimensions (equal across the
three dimensions)
- Default: 1
- Recommended Minimum: 0.2
- Recommended Maximum: 5
- gravity: Magnitude of gravity
- Default: 9.81
- Recommended Minimum: 4
- Recommended Maximum: 105
- mass: Specifies mass of the ball
- Default: 1
- Recommended Minimum: 0.1
- Recommended Maximum: 20
- Benchmark Mean Reward: 100
- Set-up: A version of the classic grid-world task. Scene contains agent, goal,
- Goal: The agent must navigate the grid to the goal while avoiding the
- Agents: The environment contains nine agents with the same Behavior
Parameters.
- Agent Reward Function:
- -0.01 for every step.
- +1.0 if the agent navigates to the goal position of the grid (episode ends).
- -1.0 if the agent navigates to an obstacle (episode ends).
- Behavior Parameters:
- Vector Observation space: None
- Vector Action space: (Discrete) Size of 4, corresponding to movement in
is turned on by default (this option can be toggled using the `Mask Actions`
checkbox within the `trueAgent` GameObject). The trained model file provided
was generated with action masking turned on (see the sketch after this list).
- Visual Observations: One corresponding to top-down view of GridWorld.
- Float Properties: Three, corresponding to grid size, number of obstacles, and
- Benchmark Mean Reward: 0.8
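A sketch of how invalid moves can be masked out before an action is chosen, assuming the `CollectDiscreteActionMasks`/`DiscreteActionMasker` API available in this package version; the action index and grid-bound check are placeholders rather than the actual GridAgent code.

```csharp
using MLAgents;        // may be Unity.MLAgents in newer package versions
using UnityEngine;

// Hypothetical grid agent; only the masking callback is shown.
public class MaskedGridAgent : Agent
{
    const int k_Up = 0;   // placeholder index for the "move up" action

    public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker)
    {
        // If the agent is already on the top row, moving up is impossible,
        // so remove that action from branch 0 before the Policy samples one.
        if (transform.position.z >= 4f)   // grid bound is made up for illustration
        {
            actionMasker.SetMask(0, new[] { k_Up });
        }
    }
}
```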
- Set-up: Two-player game where agents control rackets to hit a ball over the
- Goal: The agents must hit the ball so that the opponent cannot hit a valid
return.
- Agents: The environment contains two agents with the same Behavior Parameters.
After training you can set the `Behavior Type` to `Heuristic Only` on one of
the Agent's Behavior Parameters to play against your trained model.
- Agent Reward Function (independent):
- +1.0 To the agent that wins the point. An agent wins a point by preventing
the opponent from hitting a valid return.
- -1.0 To the agent who loses the point.
- Behavior Parameters:
- Vector Observation space: 9 variables corresponding to position, velocity
- Vector Action space: (Continuous) Size of 3, corresponding to movement
- Visual Observations: None
- Float Properties: Three
- gravity: Magnitude of gravity
- Default: 9.81
- Recommended Minimum: 6
- Recommended Maximum: 20
- scale: Specifies the scale of the ball in the 3 dimensions (equal across the
three dimensions)
- Default: .5
- Recommended Minimum: 0.2
- Recommended Maximum: 5
- Set-up: A platforming environment where the agent can push a block around.
- Goal: The agent must push the block to the goal.
- Agents: The environment contains one agent.
- Agent Reward Function:
- -0.0025 for every step.
- +1.0 if the block touches the goal.
- Behavior Parameters:
- Vector Observation space: (Continuous) 70 variables corresponding to 14
- Vector Action space: (Discrete) Size of 6, corresponding to turn clockwise
- Visual Observations (Optional): One first-person camera. Use
`VisualPushBlock` scene. **The visual observation version of this
environment does not train with the provided default training parameters.**
- Float Properties: Four
- block_scale: Scale of the block along the x and z dimensions
- Default: 2
- Recommended Minimum: 0.5
- Recommended Maximum: 4
- dynamic_friction: Coefficient of friction for the ground material acting on
moving objects
- Default: 0
- Recommended Minimum: 0
- Recommended Maximum: 1
- static_friction: Coefficient of friction for the ground material acting on
stationary objects
- Default: 0
- Recommended Minimum: 0
- Recommended Maximum: 1
- block_drag: Effect of air resistance on block
- Default: 0.5
- Recommended Minimum: 0
- Recommended Maximum: 2000
- Benchmark Mean Reward: 4.5
- Set-up: A platforming environment where the agent can jump over a wall.
- Goal: The agent must use the block to scale the wall and reach the goal.
- Agents: The environment contains one agent linked to two different Models. The
Policy the agent is linked to changes depending on the height of the wall. The
change of Policy is done in the WallJumpAgent class (see the sketch after this
list).
- Agent Reward Function:
- -0.0005 for every step.
- +1.0 if the agent touches the goal.
- -1.0 if the agent falls off the platform.
- Behavior Parameters:
- Vector Observation space: Size of 74, corresponding to 14 ray casts each
- Vector Action space: (Discrete) 4 Branches:
- Forward Motion (3 possible actions: Forward, Backwards, No Action)
- Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
- Side Motion (3 possible actions: Left, Right, No Action)
- Jump (2 possible actions: Jump, No Action)
- Visual Observations: None
- Float Properties: Four
- Benchmark Mean Reward (Big & Small Wall): 0.8
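The model swap mentioned above can be done at runtime with `Agent.SetModel()`. The sketch below illustrates the idea under that assumption; the behavior name, height threshold, and field names are placeholders rather than the actual WallJumpAgent code.

```csharp
using Barracuda;       // NNModel; the namespace may be Unity.Barracuda in newer versions
using MLAgents;        // may be Unity.MLAgents in newer package versions
using UnityEngine;

// Hypothetical component that picks which trained model an Agent should use.
public class WallModelSwitcher : MonoBehaviour
{
    public Agent agent;
    public NNModel smallWallModel;
    public NNModel bigWallModel;

    public void OnWallHeightChanged(float wallHeight)
    {
        // Swap the Policy's model depending on the obstacle the agent faces.
        var model = wallHeight > 4f ? bigWallModel : smallWallModel;  // threshold is made up
        agent.SetModel("WallJump", model);   // behavior name string is a placeholder
    }
}
```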
- Set-up: Double-jointed arm which can move to target locations.
- Goal: The agent must move its hand to the goal location and keep it there.
- Agents: The environment contains 10 agents with the same Behavior Parameters.
- Agent Reward Function (independent):
- +0.1 Each step agent's hand is in goal location.
- Behavior Parameters:
- Vector Observation space: 26 variables corresponding to position, rotation,
- Vector Action space: (Continuous) Size of 4, corresponding to torque
- Visual Observations: None.
- Float Properties: Five
- goal_size: radius of the goal zone
- Default: 5
- Recommended Minimum: 1
- Recommended Maximum: 10
- goal_speed: speed of the goal zone around the arm (in radians)
- Default: 1
- Recommended Minimum: 0.2
- Recommended Maximum: 4
- gravity
- Default: 9.81
- Recommended Minimum: 4
- Recommended Maximum: 20
- deviation: Magnitude of sinusoidal (cosine) deviation of the goal along the
vertical dimension
- Default: 0
- Recommended Minimum: 0
- Recommended Maximum: 5
- deviation_freq: Frequency of the cosine deviation of the goal along the
vertical dimension
- Default: 0
- Recommended Minimum: 0
- Recommended Maximum: 3
- Benchmark Mean Reward: 30
- Set-up: A creature with 4 arms and 4 forearms.
- Goal: The agent must move its body toward the goal direction without falling.
  - `CrawlerStaticTarget` - Goal direction is always forward.
  - `CrawlerDynamicTarget` - Goal direction is randomized.
- Agents: The environment contains 3 agents with the same Behavior Parameters.
- Agent Reward Function (independent):
- +0.03 times body velocity in the goal direction.
- +0.01 times body direction alignment with goal direction.
- Behavior Parameters:
- Vector Observation space: 117 variables corresponding to position, rotation,
- Vector Action space: (Continuous) Size of 20, corresponding to target
- Visual Observations: None
- Float Properties: None
- Benchmark Mean Reward for `CrawlerStaticTarget`: 2000
- Benchmark Mean Reward for `CrawlerDynamicTarget`: 400
- Set-up: A multi-agent environment where agents compete to collect food.
- Goal: The agents must learn to collect as many green food spheres as possible
- Agents: The environment contains 5 agents with the same Behavior Parameters.
- Agent Reward Function (independent):
- +1 for interaction with green spheres
- -1 for interaction with red spheres
- Behavior Parameters:
- Vector Observation space: 53 corresponding to velocity of agent (2), whether
- Vector Action space: (Discrete) 4 Branches:
- Forward Motion (3 possible actions: Forward, Backwards, No Action)
- Side Motion (3 possible actions: Left, Right, No Action)
- Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
- Laser (2 possible actions: Laser, No Action)
- Visual Observations (Optional): First-person camera per-agent. Use
`VisualFoodCollector` scene. **The visual observation version of this
environment does not train with the provided default training parameters.**
- Float Properties: Two
- laser_length: Length of the laser used by the agent
- Default: 1
- Recommended Minimum: 0.2
- Recommended Maximum: 7
- agent_scale: Specifies the scale of the agent in the 3 dimensions (equal
across the three dimensions)
- Default: 1
- Recommended Minimum: 0.5
- Recommended Maximum: 5
- Benchmark Mean Reward: 10
- Set-up: Environment where the agent needs to find information in a room,
- Goal: Move to the goal which corresponds to the color of the block in the
- Agents: The environment contains one agent.
- Agent Reward Function (independent):
- +1 For moving to correct goal.
- -0.1 For moving to incorrect goal.
- -0.0003 Existential penalty.
- Behavior Parameters:
- Vector Observation space: 30 corresponding to local ray-casts detecting
- Vector Action space: (Discrete) 1 Branch, 4 actions corresponding to agent
- Visual Observations (Optional): First-person view for the agent. Use
`VisualHallway` scene. **The visual observation version of this environment
does not train with the provided default training parameters.**
- Float Properties: None
- Benchmark Mean Reward: 0.7
- To speed up training, you can enable curiosity by adding the `curiosity`
reward signal in `config/trainer_config.yaml`
- Set-up: Environment where the agent needs on-demand decision making. The agent
- Goal: Catch the floating green cube. Only has a limited number of jumps.
- Agents: The environment contains one agent.
- Agent Reward Function (independent):
- +1 For catching the green cube.
- -1 For bouncing out of bounds.
- -0.05 Times the action squared. Energy expenditure penalty.
- Behavior Parameters:
- Vector Observation space: 6 corresponding to local position of agent and
- Vector Action space: (Continuous) 3 corresponding to agent force applied for
- Visual Observations: None
- Float Properties: Two
- target_scale: The scale of the green cube in the 3 dimensions
- Default: 150
- Recommended Minimum: 50
- Recommended Maximum: 250
- Benchmark Mean Reward: 10
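The on-demand decision making described in the Set-up above is usually achieved by calling `Agent.RequestDecision()` only when the agent is actually able to act, rather than requesting a decision on every step. A minimal sketch, with a hypothetical grounding check:

```csharp
using MLAgents;        // may be Unity.MLAgents in newer package versions
using UnityEngine;

// Hypothetical agent that only asks its Policy for an action when it can act.
public class OnDemandJumpAgent : Agent
{
    bool m_ReadyToAct = true;

    void FixedUpdate()
    {
        if (m_ReadyToAct)
        {
            RequestDecision();     // request exactly one new action now
            m_ReadyToAct = false;  // wait until we land before asking again
        }
    }

    public override void OnActionReceived(float[] vectorAction)
    {
        // Apply the jump force encoded in vectorAction here.
    }

    void OnCollisionEnter(Collision collision)
    {
        m_ReadyToAct = true;       // landing re-enables the next decision request
    }
}
```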
- Set-up: Environment where four agents compete in a 2 vs 2 toy soccer game.
- Goal:
- Get the ball into the opponent's goal while preventing the ball from
  entering their own goal.
- Agents: The environment contains four agents, with the same
- Agent Reward Function (dependent):
- (1 - `accumulated time penalty`) When ball enters opponent's goal.
  `accumulated time penalty` is incremented by (1 / `maxStep`) every fixed
  update and is reset to 0 at the beginning of an episode.
- -1 When ball enters team's goal.
- Behavior Parameters:
- Vector Observation space: 336 corresponding to 11 ray-casts forward distributed over 120 degrees
- Vector Action space: (Discrete) Three branched actions corresponding to forward, backward, sideways movement,
- Visual Observations: None
- Float Properties: Two
- ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
- Default: 7.5
- Recommended minimum: 4
- Recommended maximum: 10
- gravity: Magnitude of the gravity
- Default: 9.81
- Recommended minimum: 6
- Recommended maximum: 20
## Strikers Vs. Goalie
![StrikersVsGoalie](images/strikersvsgoalie.png)
- Set-up: Environment where two agents compete in a 2 vs 1 soccer variant.
- Goal:
- Striker: Get the ball into the opponent's goal.
- Goalie: Keep the ball out of the goal.
- Agents: The environment contains three agents: two Strikers and one Goalie.
  Behavior Parameters: Striker, Goalie.
- Striker Agent Reward Function (dependent):
- +1 When ball enters opponent's goal.
- -0.001 Existential penalty.
- Goalie Agent Reward Function (dependent):
- -1 When ball enters goal.
- +0.001 Existential bonus.
- Behavior Parameters:
- Striker Vector Observation space: 294 corresponding to 11 ray-casts forward distributed over 120 degrees
and 3 ray-casts backward distributed over 90 degrees each detecting 5 possible object types, along with the object's distance.
The forward ray-casts contribute 231 state dimensions and backward 63 state dimensions over three observation stacks.
- Striker Vector Action space: (Discrete) Three branched actions corresponding to forward, backward, sideways movement,
as well as rotation.
- Goalie Vector Observation space: 738 corresponding to 41 ray-casts distributed over 360 degrees
each detecting 4 possible object types, along with the object's distance and 3 observation stacks.
- Goalie Vector Action space: (Discrete) Three branched actions corresponding to forward, backward, sideways movement,
as well as rotation.
- Visual Observations: None
- Float Properties: Two
- ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
- Default: 7.5
- Recommended minimum: 4
- Recommended maximum: 10
- gravity: Magnitude of the gravity
- Default: 9.81
- Recommended minimum: 6
- Recommended maximum: 20

![Walker](images/walker.png)
- Set-up: Physics-based humanoid agents with 26 degrees of freedom. These DOFs
- Goal: The agent must move its body toward the goal direction as quickly as
- Agents: The environment contains 11 independent agents with the same Behavior
Parameters.
- Agent Reward Function (independent):
- +0.03 times body velocity in the goal direction.
- +0.01 times head y position.
- +0.01 times body direction alignment with goal direction.
- -0.01 times head velocity difference from body velocity.
- Behavior Parameters:
- Vector Observation space: 215 variables corresponding to position, rotation,
- Vector Action space: (Continuous) Size of 39, corresponding to target
- Visual Observations: None
- Float Properties: Four
- gravity: Magnitude of gravity
- Default: 9.81
- Recommended Minimum:
- Recommended Maximum:
- hip_mass: Mass of the hip component of the walker
- Default: 15
- Recommended Minimum: 7
- Recommended Maximum: 28
- chest_mass: Mass of the chest component of the walker
- Default: 8
- Recommended Minimum: 3
- Recommended Maximum: 20
- spine_mass: Mass of the spine component of the walker
- Default: 10
- Recommended Minimum: 3
- Recommended Maximum: 20
- Benchmark Mean Reward: 1000
- Set-up: Environment where the agent needs to press a button to spawn a
- Goal: Move to the golden brick on top of the spawned pyramid.
- Agents: The environment contains one agent.
- Agent Reward Function (independent):
- +2 For moving to golden brick (minus 0.001 per step).
- Behavior Parameters:
- Vector Observation space: 148 corresponding to local ray-casts detecting
- Vector Action space: (Discrete) 4 corresponding to agent rotation and
- Visual Observations (Optional): First-person camera per-agent. Use
`VisualPyramids` scene. **The visual observation version of this environment
does not train with the provided default training parameters.**
- Float Properties: None
- Benchmark Mean Reward: 1.75

84
docs/Learning-Environment-Executable.md


Editor to interact with an environment. Using an executable has some advantages
over using the Editor:
- You can exchange the executable with other people without having to share your
- You can put your executable on a remote machine for faster training.
- You can use `Headless` mode for faster training.
- You can keep using the Unity Editor for other tasks while the agents are
training.
## Building the 3DBall environment

1. Launch Unity.
1. On the Projects dialog, choose the **Open** option at the top of the window.
1. Using the file dialog that opens, locate the `Project` folder within the
1. In the **Project** window, navigate to the folder
1. Double-click the `3DBall` file to load the scene containing the Balance Ball
environment.
![3DBall Scene](images/mlagents-Open3DBall.png)

- The environment application runs in the background.
- No dialogs require interaction.
- The correct scene loads automatically.
1. Under **Resolution and Presentation**:
- Ensure that **Run in Background** is Checked.
- Ensure that **Display Resolution Dialog** is set to Disabled.
1. Open the Build Settings window (menu:**File** > **Build Settings**).
1. Choose your target platform.
- (optional) Select “Development Build” to
[log debug messages](https://docs.unity3d.com/Manual/LogFiles.html).
1. If any scenes are shown in the **Scenes in Build** list, make sure that the
1. Click **Build**:
- In the File dialog, navigate to your ML-Agents directory.
- Assign a file name and click **Save**.
- (For Windows) With Unity 2018.1, it will ask you to select a folder instead
subfolder's name as `env_name`. You cannot create builds in the Assets
folder
![Build Window](images/mlagents-BuildWindow.png)

## Training the Environment
1. Open a command or terminal window.
1. Navigate to the folder where you installed the ML-Agents Toolkit. If you
1. Run
- `<trainer-config-file>` is the file path of the trainer configuration yaml
- `<env_name>` is the name and path to the executable you exported from Unity
- `<run-identifier>` is a string used to separate the results of different
training runs
For example, if you are training with a 3DBall executable you exported to the

```
use_curiosity: False
curiosity_strength: 0.01
curiosity_enc_size: 128
model_path: ./models/first-run-0/Ball3DLearning
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.

```
You can press Ctrl+C to stop the training, and your trained model will be at
`models/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
latest checkpoint. (**Note:** There is a known bug on Windows that causes the
saving of the model to fail when you terminate the training early; it's
recommended to wait until Step has reached the max_steps parameter you set in
trainer_config.yaml.) You can now embed this trained model into your Agent by
following the steps below:
1. Open the Unity Editor, and select the **3DBall** scene as described above.
1. Select the **3DBall** prefab from the Project window and select **Agent**.
1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
the **Model** placeholder in the **Ball3DAgent** inspector window.
1. Press the :arrow_forward: button at the top of the editor.

39
docs/ML-Agents-Overview.md


complex behaviors by hand is challenging and prone to errors.
With ML-Agents, it is possible to _train_ the behaviors of such NPCs (called
**Agents**) using a variety of methods. The basic idea is quite simple. We need
to define three entities at every moment of the game (called **environment**):
- **Observations** - what the medic perceives about the environment.

- **Agents** - which is attached to a Unity GameObject (any character within a
scene) and handles generating its observations, performing the actions it
receives and assigning a reward (positive / negative) when appropriate. Each
Agent is linked to a Behavior.
every character in the scene. While each Agent must be linked to a Behavior, it is
the same Behavior. In our sample game, we have two teams each with their own medic.
but both of these medics can have the same Behavior. Note that having the same
Behavior does not mean that at each instance they will have identical
observation and action _values_. If we expanded our game to include
tank driver NPCs, then the Agent
attached to those characters cannot share its Behavior with the Agent linked to the
medics (medics and drivers have different actions).
<p align="center">

We have yet to discuss how the ML-Agents toolkit trains behaviors, and what role
the Python API and External Communicator play. Before we dive into those
details, let's summarize the earlier components. Each character is attached to
an Agent, and each Agent has a Behavior. The Behavior can be thought of as a
function that receives observations
and rewards from the Agent and returns actions. The Learning Environment through
the Academy (not represented in the diagram) ensures that all the
Note that in a single environment, there can be multiple Agents and multiple Behaviors
at the same time. These Behaviors can communicate with Python through the communicator
but can also use a pre-trained _Neural Network_ or a _Heuristic_. Note that it is also
possible to communicate data with Python without using Agents through _Side Channels_.
One example of using _Side Channels_ is to exchange data with Python about
_Environment Parameters_. The following diagram illustrates the above.
<p align="center">
<img src="images/learning_environment_full.png"
alt="More Complete Example ML-Agents Scene Block Diagram"
border="10" />
</p>
## Training Modes

5
docs/Migrating.md


* The `play_against_current_self_ratio` self-play trainer hyperparameter has been renamed to `play_against_latest_model_ratio`
* Removed the multi-agent gym option from the gym wrapper. For multi-agent scenarios, use the [Low Level Python API](Python-API.md).
* The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. If you use `mlagents-learn` for training, this should be a transparent change.
* The obsolete `Agent` methods `GiveModel`, `Done`, `InitializeAgent`, `AgentAction` and `AgentReset` have been removed.
* The signature of `Agent.Heuristic()` was changed to take a `float[]` as a parameter, instead of returning the array. This was done to prevent a common source of error where users would return arrays of the wrong size.
### Steps to Migrate
* Replace the `--load` flag with `--resume` when calling `mlagents-learn`, and don't use the `--train` flag as training

* `Academy.FloatProperties` was removed.
* `Academy.RegisterSideChannel` and `Academy.UnregisterSideChannel` were removed.
### Steps to Migrate
* If your Agent class overrides `Heuristic()`, change the signature to `public override void Heuristic(float[] actionsOut)` and assign values to `actionsOut` instead of returning an array, as shown in the sketch below.
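A minimal sketch of the signature change, using a placeholder agent class and input axis:

```csharp
using MLAgents;        // may be Unity.MLAgents depending on the package version
using UnityEngine;

public class MyAgent : Agent   // hypothetical agent being migrated
{
    // Old signature (0.14 and earlier) returned a newly allocated array:
    //
    //     public override float[] Heuristic()
    //     {
    //         var action = new float[1];
    //         action[0] = Input.GetAxis("Horizontal");
    //         return action;
    //     }

    // New signature: write into the buffer that is passed in.
    public override void Heuristic(float[] actionsOut)
    {
        actionsOut[0] = Input.GetAxis("Horizontal");
    }
}
```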
## Migrating from 0.14 to 0.15

100
docs/Readme.md


## Installation & Set-up
- [Installation](Installation.md)
- [Using Virtual Environment](Using-Virtual-Environment.md)
- [Getting Started Guide](Getting-Started.md)
- [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
- [Background: Unity](Background-Unity.md)
- [Background: Machine Learning](Background-Machine-Learning.md)
- [Background: TensorFlow](Background-TensorFlow.md)
- [Example Environments](Learning-Environment-Examples.md)
- [Making a New Learning Environment](Learning-Environment-Create-New.md)
- [Designing a Learning Environment](Learning-Environment-Design.md)
- [Designing Agents](Learning-Environment-Design-Agents.md)
- [Using the Monitor](Feature-Monitor.md)
- [Using an Executable Environment](Learning-Environment-Executable.md)
- [Training ML-Agents](Training-ML-Agents.md)
- [Reward Signals](Reward-Signals.md)
- [Profiling Trainers](Profiling-Python.md)
- [Using TensorBoard to Observe Training](Using-Tensorboard.md)
- [Training Using Concurrent Unity Instances](Training-Using-Concurrent-Unity-Instances.md)
- [Training with Proximal Policy Optimization](Training-PPO.md)
- [Training with Soft Actor-Critic](Training-SAC.md)
- [Training with Self-Play](Training-Self-Play.md)
- [Training with Curriculum Learning](Training-Curriculum-Learning.md)
- [Training with Imitation Learning](Training-Imitation-Learning.md)
- [Training with LSTM](Feature-Memory.md)
- [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
- [Unity Inference Engine](Unity-Inference-Engine.md)
## Extending ML-Agents
- [Creating Custom Side Channels](Custom-SideChannels.md)
- [Migrating from earlier versions of ML-Agents](Migrating.md)
- [Frequently Asked Questions](FAQ.md)
- [ML-Agents Glossary](Glossary.md)
- [Limitations](Limitations.md)
- [API Reference](API-Reference.md)
- [How to use the Python API](Python-API.md)
- [Wrapping Learning Environment as a Gym (+Baselines/Dopamine Integration)](../gym-unity/README.md)
To make the Unity ML-Agents toolkit accessible to the global research and Unity
developer communities, we're attempting to create and maintain translations of
our documentation. We've started with translating a subset of the documentation
to one language (Chinese), but we hope to continue translating more pages and to
other languages. Consequently, we welcome any enhancements and improvements from
the community.
- [Chinese](localized/zh-CN/)
- [Korean](localized/KR/)
We no longer use them ourselves and so they may not be up-to-date. We've decided
to keep them up just in case they are helpful to you.
- [Windows Anaconda Installation](Installation-Anaconda-Windows.md)
- [Using Docker](Using-Docker.md)
- [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md)
- [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md)
- [Using the Video Recorder](https://github.com/Unity-Technologies/video-recorder)

6
docs/Training-Imitation-Learning.md


<p align="center">
<img src="images/demo_component.png"
alt="Demonstration Recorder"
width="375" border="10" />
</p>

<p align="center">
<img src="images/demo_inspector.png"
alt="Demonstration Inspector"
width="375" border="10" />
</p>

```
gail:
demo_path: <path_to_your_demo_file>
...
```

368
docs/Training-ML-Agents.md


# Training ML-Agents
The ML-Agents toolkit conducts training using an external Python training
process. During training, this external process communicates with the Academy
to generate a block of agent experiences. These
experiences become the training set for a neural network used to optimize the
agent's policy (which is essentially a mathematical function mapping
observations to actions). In reinforcement learning, the neural network
optimizes the policy by maximizing the expected rewards. In imitation learning,
the neural network optimizes the policy to achieve the smallest difference
between the actions chosen by the agent trainee and the actions chosen by the
expert in the same situation.
The output of the training process is a model file containing the optimized
policy. This model file is a TensorFlow data graph containing the mathematical
operations and the optimized weights selected during the training process. You
can set the generated model file in the Behavior Parameters under your
Agent in your Unity project to decide the best course of action for an agent.
For a broader overview of reinforcement learning, imitation learning and the
ML-Agents training process, see [ML-Agents Toolkit
Overview](ML-Agents-Overview.md).
Once your learning environment has been created and is ready for training, the
next step is to initiate a training run. Training in the ML-Agents Toolkit is
powered by a dedicated Python package, `mlagents`. This package exposes a
command `mlagents-learn` that is the single entry point for all training
workflows (e.g. reinforcement learning, imitation learning, curriculum learning).
Its implementation can be found at
[ml-agents/mlagents/trainers/learn.py](../ml-agents/mlagents/trainers/learn.py).
Use the `mlagents-learn` command to train agents. `mlagents-learn` supports
training with
[reinforcement learning](Background-Machine-Learning.md#reinforcement-learning),
[curriculum learning](Training-Curriculum-Learning.md),
and [behavioral cloning imitation learning](Training-Imitation-Learning.md).
### Starting Training
`mlagents-learn` is the main training utility provided by the ML-Agents Toolkit.
It accepts a number of CLI options in addition to a YAML configuration file that
contains all the configurations and hyperparameters to be used during training.
The set of configurations and hyperparameters to include in this file depend on
the agents in your environment and the specific training method you wish to
utilize. Keep in mind that the hyperparameter values can have a big impact on
the training performance (i.e. your agent's ability to learn a policy that
solves the task). In this page, we will review all the hyperparameters for all
training methods and provide guidelines and advice on their values.
To view a description of all the CLI options accepted by `mlagents-learn`, use
the `--help`:

mlagents-learn --help

The basic command for training is:

mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>

where

- `<trainer-config-file>` is the file path of the trainer configuration yaml.
  This contains all the hyperparameter values. We offer a detailed guide on the
  structure of this file and the meaning of the hyperparameters (and advice on
  how to set them) in the dedicated [Training Config File](#training-config-file)
  section below.
- `<env_name>` **(Optional)** is the name (including path) of your
  [Unity executable](Learning-Environment-Executable.md) containing the agents
  to be trained. If `<env_name>` is not passed, the training will happen in the
  Editor. Press the :arrow_forward: button in Unity when the message _"Start
  training by pressing the Play button in the Unity Editor"_ is displayed on
  the screen.
- `<run-identifier>` is a unique name you can use to identify the results of
  your training runs.

For example, suppose you have a project in Unity named "CatsOnBicycles" which
contains agents ready to train. To perform the training:
1. [Build the project](Learning-Environment-Executable.md), making sure that you
only include the training scene.
2. Open a terminal or console window.
3. Navigate to the directory where you installed the ML-Agents Toolkit.
4. Run the following to launch the training process using the path to the Unity
environment you built in step 1:
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1
During a training session, the training program prints out and saves updates at
regular intervals (specified by the `summary_freq` option). The saved statistics
are grouped by the `run-id` value so you should assign a unique id to each
training run if you plan to view the statistics. You can view these statistics
using TensorBoard during or after training by running the following command:
```sh
tensorboard --logdir=summaries --port 6006
```
And then opening the URL: [localhost:6006](http://localhost:6006).
See the
[Getting Started Guide](Getting-Started.md#training-a-new-model-with-reinforcement-learning)
for a sample execution of the `mlagents-learn` command.
**Note:** The default port TensorBoard uses is 6006. If there is an existing session
running on port 6006 a new session can be launched on an open port using the --port
option.
#### Observing Training
When training is finished, you can find the saved model in the `models` folder
under the assigned run-id — in the cats example, the path to the model would be
`models/cob_1/CatsOnBicycles_cob_1.nn`.
Regardless of which training methods, configurations or hyperparameters you
provide, the training process will always generate three artifacts:
While this example used the default training hyperparameters, you can edit the
[trainer_config.yaml file](#training-config-file) with a text editor to set
different values.
1. Summaries (under the `summaries/` folder): these are training metrics that
are updated throughout the training process. They are helpful to monitor your
training performance and may help inform how to update your hyperparameter
values. See [Using TensorBoard](Using-Tensorboard.md) for more details on how
to visualize the training metrics.
1. Models (under the `models/` folder): these contain the model checkpoints that
are updated throughout training and the final model file (`.nn`). This final
model file is generated once either when training completes or is
interrupted.
1. Timers file (also under the `summaries/` folder): this contains aggregated
metrics on your training process, including time spent on specific code
blocks. See [Profiling in Python](Profiling-Python.md) for more information
on the timers generated.
These artifacts (except the `.nn` file) are updated throughout the training
process and finalized when training completes or is interrupted.
#### Stopping and Resuming Training
To interrupt training and save the current progress, hit `Ctrl+C` once and wait
for the model(s) to be saved out.
To resume a previously interrupted or completed training run, use the `--resume`
flag and make sure to specify the previously used run ID.
If you've already trained a model using the specified `<run-identifier>` and `--resume` is not
specified, you will not be able to continue with training. Use `--force` to force ML-Agents to
overwrite the existing data.
If you would like to re-run a previously interrupted or completed training run
and re-use the same run ID (in this case, overwriting the previously generated
artifacts), then use the `--force` flag.
#### Loading an Existing Model
You can also use this mode to run inference of an already-trained model in
Python by using both the `--resume` and `--inference` flags. Note that if you
want to run inference in Unity, you should use the
[Unity Inference Engine](Getting-Started.md#running-a-pre-trained-model).
Alternatively, you might want to start a new training run but _initialize_ it
using an already-trained model. You may want to do this, for instance, if your
environment changed and you want a new model, but the old behavior is still
better than random. You can do this by specifying
`--initialize-from=<run-identifier>`, where `<run-identifier>` is the old run
ID.

### Command Line Training Options

In addition to passing the path of the Unity executable containing your training
environment, you can set the following command line options when invoking
`mlagents-learn`:
* `--env=<env>`: Specify an executable environment to train.
* `--curriculum=<file>`: Specify a curriculum JSON file for defining the
lessons for curriculum training. See [Curriculum
Training](Training-Curriculum-Learning.md) for more information.
* `--sampler=<file>`: Specify a sampler YAML file for defining the
sampler for parameter randomization. See [Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md) for more information.
* `--keep-checkpoints=<n>`: Specify the maximum number of model checkpoints to
keep. Checkpoints are saved after the number of steps specified by the
`save-freq` option. Once the maximum number of checkpoints has been reached,
the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
* `--lesson=<n>`: Specify which lesson to start with when performing curriculum
training. Defaults to 0.
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
collect experiences from when training. Defaults to 1.
* `--run-id=<run-identifier>`: Specifies an identifier for each training run. This
identifier is used to name the subdirectories in which the trained model and
summary statistics are saved as well as the saved model itself. The default id
is "ppo". If you use TensorBoard to view the training statistics, always set a
unique run-id for each training run. (The statistics for all runs with the
same id are combined as if they were produced by the same session.)
* `--save-freq=<n>`: Specifies how often (in steps) to save the model during
training. Defaults to 50000.
* `--seed=<n>`: Specifies a number to use as a seed for the random number
generator used by the training code.
* `--env-args=<string>`: Specify arguments for the executable environment. Be aware that
the standalone build will also process these as
[Unity Command Line Arguments](https://docs.unity3d.com/Manual/CommandLineArguments.html).
You should choose different argument names if you want to create environment-specific arguments.
All arguments after this flag will be passed to the executable. For example, setting
`mlagents-learn config/trainer_config.yaml --env-args --num-orcs 42` would result in
` --num-orcs 42` passed to the executable.
* `--base-port`: Specifies the starting port. Each concurrent Unity environment instance
will get assigned a port sequentially, starting from the `base-port`. Each instance
will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs
given to each instance from 0 to `num_envs - 1`. Default is 5005. __Note:__ When
training using the Editor rather than an executable, the base port will be ignored.
* `--inference`: Specifies whether to only run in inference mode. Omit to train the model.
To load an existing model, specify a run-id and combine with `--resume`.
* `--resume`: If set, the training code loads an already trained model to
initialize the neural network before training. The learning code looks for the
model in `models/<run-id>/` (which is also where it saves models at the end of
training). This option only works when the models exist, and have the same behavior names
as the current agents in your scene.
* `--force`: Attempting to train a model with a run-id that has been used before will
throw an error. Use `--force` to force-overwrite this run-id's summary and model data.
* `--initialize-from=<run-identifier>`: Specify an old run-id here to initialize your model from
a previously trained model. Note that the previously saved models _must_ have the same behavior
parameters as your current environment.
* `--no-graphics`: Specify this option to run the Unity executable in
`-batchmode` without initializing the graphics driver. Use this only if your
training doesn't involve visual observations (reading from pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
* `--debug`: Specify this option to enable debug-level logging for some parts of the code.
* `--cpu`: Forces training using CPU only.
* Engine Configuration:
* `--width` : The width of the executable window of the environment(s) in pixels
(ignored for editor training) (Default 84)
* `--height` : The height of the executable window of the environment(s) in pixels
(ignored for editor training). (Default 84)
* `--quality-level` : The quality level of the environment(s). Equivalent to
calling `QualitySettings.SetQualityLevel` in Unity. (Default 5)
* `--time-scale` : The time scale of the Unity environment(s). Equivalent to setting
`Time.timeScale` in Unity. (Default 20.0, maximum 100.0)
* `--target-frame-rate` : The target frame rate of the Unity environment(s).
Equivalent to setting `Application.targetFrameRate` in Unity. (Default: -1)
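Putting several of these options together, a typical invocation looks something
like the following (a minimal sketch; the executable name and run ID are only
placeholders):

```sh
mlagents-learn config/trainer_config.yaml \
  --env=CatsOnBicycles \
  --run-id=cob_1 \
  --num-envs=2 \
  --time-scale=20
```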
### Training Config File
The Unity ML-Agents Toolkit provides a wide range of training scenarios, methods
and options. As such, specific training runs may require different training
configurations and may generate different artifacts and TensorBoard statistics.
This section offers a detailed guide on how to manage the different training
set-ups within the toolkit.
The training config files `config/trainer_config.yaml`,
`config/sac_trainer_config.yaml`, `config/gail_config.yaml` and
`config/offline_bc_config.yaml` specify the training method, the
hyperparameters, and a few additional values to use when training with Proximal
Policy Optimization (PPO), Soft Actor-Critic (SAC), GAIL (Generative Adversarial
Imitation Learning) with PPO/SAC, and Behavioral Cloning (BC)/Imitation with
PPO/SAC. These files are divided into sections. The **default** section defines
the default values for all the available settings. You can also add new
sections to override these defaults to train specific Behaviors. Name each of
these override sections after the appropriate `Behavior Name`. Sections for the
example environments are included in the provided config files.
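For example (a minimal sketch; the run IDs are only illustrative), you choose
the training method by pointing `mlagents-learn` at the corresponding config
file:

```sh
# Train with PPO using the default PPO configuration
mlagents-learn config/trainer_config.yaml --run-id=ppo_run

# Train with SAC by using the SAC configuration file instead
mlagents-learn config/sac_trainer_config.yaml --run-id=sac_run
```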
| **Setting** | **Description** | **Applies To Trainer\*** |
| :------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- |
| batch_size | The number of experiences in each iteration of gradient descent. | PPO, SAC |
| batches_per_epoch | In imitation learning, the number of batches of training examples to collect before training the model. | |
| beta | The strength of entropy regularization. | PPO |
| buffer_size | The number of experiences to collect before updating the policy model. In SAC, the max size of the experience buffer. | PPO, SAC |
| buffer_init_steps | The number of experiences to collect into the buffer before updating the policy model. | SAC |
| epsilon | Influences how rapidly the policy can evolve during training. | PPO |
| hidden_units | The number of units in the hidden layers of the neural network. | PPO, SAC |
| init_entcoef | How much the agent should explore in the beginning of training. | SAC |
| lambd | The regularization parameter. | PPO |
| learning_rate | The initial learning rate for gradient descent. | PPO, SAC |
| learning_rate_schedule | Determines how learning rate changes over time. | PPO, SAC |
| max_steps | The maximum number of simulation steps to run during a training session. | PPO, SAC |
| memory_size | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| normalize | Whether to automatically normalize observations. | PPO, SAC |
| num_epoch | The number of passes to make through the experience buffer when performing gradient descent optimization. | PPO |
| num_layers | The number of hidden layers in the neural network. | PPO, SAC |
| behavioral_cloning | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-behavioral-cloning-using-demonstrations). | PPO, SAC |
| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options. | PPO, SAC |
| save_replay_buffer | Saves the replay buffer when exiting training, and loads it on resume. | SAC |
| sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, SAC |
| tau | How aggressively to update the target network used for bootstrapping value estimation in SAC. | SAC |
| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, SAC |
| trainer | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc". | PPO, SAC |
| train_interval | How often to update the agent. | SAC |
| num_update | Number of mini-batches to update the agent with during each update. | SAC |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| init_path | Initialize trainer from a previously saved model. | PPO, SAC |
\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral
Cloning (Imitation), GAIL = Generative Adversarial Imitation Learning
- [Training with PPO](Training-PPO.md)
- [Training with SAC](Training-SAC.md)
- [Training with Self-Play](Training-Self-Play.md)
- [Using Recurrent Neural Networks](Feature-Memory.md)
- [Training with Curriculum Learning](Training-Curriculum-Learning.md)
- [Training with Imitation Learning](Training-Imitation-Learning.md)
- [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
You can also compare the [example environments](Learning-Environment-Examples.md)
to the corresponding sections of the `config/trainer_config.yaml` file for each
example to see how the hyperparameters and other configuration variables have
been changed from the defaults.
### Debugging and Profiling
If you enable the `--debug` flag in the command line, the trainer metrics are logged to a CSV file
stored in the `summaries` directory. The metrics stored are:
* brain name
* time to update policy
* time since start of training
* time for last experience collection
* number of experiences used for training
* mean return
This option is not available currently for Behavioral Cloning.
Additionally, we have included basic [Profiling in Python](Profiling-Python.md) as part of the toolkit.
This information is also saved in the `summaries` directory.
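For instance (the run ID is only a placeholder), you enable this logging by
adding the flag to the training command:

```sh
mlagents-learn config/trainer_config.yaml --run-id=cob_1 --debug
```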

14
docs/Training-Self-Play.md


### Reward Signals
We make the assumption that the final reward in a trajectory corresponds to the outcome of an episode.
A final reward greater than 0 indicates winning, less than 0 indicates losing and 0 indicates a draw.
The final reward determines the result of an episode (win, loss, or draw) in the ELO calculation.
In problems that are too challenging to be solved by sparse rewards, it may be necessary to provide intermediate rewards to encourage useful instrumental behaviors.
For example, it may be difficult for a soccer agent to learn that kicking a ball into the net receives a reward because this sequence has a low probability
of occurring randomly. However, it will have a higher probability of occurring if the agent learns generally that kicking the ball has utility. So, we may be able
to speed up training by giving the agent intermediate reward for kicking the ball. However, we must be careful that the agent doesn't learn to undermine
its original objective of scoring goals e.g. if it scores a goal, the episode ends and it can no longer receive reward for kicking the ball. The behavior
that receives the most reward may be to keep the ball out of the net and to kick it indefinitely! To address this, we suggest
using a curriculum that allows the agents to learn the necessary intermediate behavior (i.e. colliding with a ball) and then
decays this reward signal to allow training on just the rewards of winning and losing. Please see our documentation on
how to use curriculum learning [here](./Training-Curriculum-Learning.md) and our SoccerTwos example environment.
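As a sketch of how that would be wired up on the command line (the curriculum
file path and run ID here are only placeholders), you pass the curriculum
configuration to the trainer with the `--curriculum` option:

```sh
mlagents-learn config/trainer_config.yaml \
  --curriculum=config/curricula/soccer.yaml \
  --run-id=soccer_selfplay
```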
### Save Steps

147
docs/Training-on-Amazon-Web-Service.md


# Training on Amazon Web Service
:warning: **Note:** We no longer use this guide ourselves and so it may not work
correctly. We've decided to keep it up just in case it is helpful to you.
This page contains instructions for setting up an EC2 instance on Amazon Web
Service for training ML-Agents environments.

We've prepared a pre-configured AMI for you with the ID: `ami-016ff5559334f8619`
in the `us-east-1` region. It was created as a modification of
[Deep Learning AMI (Ubuntu)](https://aws.amazon.com/marketplace/pp/B077GCH38C).
The AMI has been tested with a p2.xlarge instance. Furthermore, if you want to
train without headless mode, you need to enable X Server.
After launching your EC2 instance using the AMI and connecting to it over SSH,
run the following commands to set it up:

1. Activate the python3 environment
```sh
source activate python3
```
```sh
git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
cd ml-agents/ml-agents/
pip3 install -e .
```
### Setting up X Server (optional)

#### Make sure there are no Xorg processes running:
```sh
# Kill any possible running Xorg processes
# Note that you might have to run this command multiple times depending on
# how Xorg is configured.
$ sudo killall Xorg

# Check if there is any Xorg process left
# You will have a list of processes running on the GPU, Xorg should not be in
# the list, as shown below.
$ nvidia-smi

# Thu Jun 14 20:21:11 2018
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 390.67                 Driver Version: 390.67                    |
# |-------------------------------+----------------------+----------------------+
# | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
# |===============================+======================+======================|
# |   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
# | N/A   37C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes:                                                       GPU Memory |
# |  GPU       PID   Type   Process name                             Usage      |
# |=============================================================================|
# |  No running processes found                                                 |
# +-----------------------------------------------------------------------------+
```
#### Start X Server and make Ubuntu use X Server for display:

can use one of the example environments if you have not created your own).
2. Open the Build Settings window (menu: File > Build Settings).
3. Select Linux as the Target Platform, and x86_64 as the target architecture
(the default x86 currently does not work).
Headless Mode, you have to setup the X Server to enable training.)
```sh
chmod +x <your_env>.x86_64
```
```sh
# Start the X Server, press Enter to come back to the command line
$ sudo /usr/bin/X :0 &

# Check if Xorg process is running
# You will have a list of processes running on the GPU, Xorg should be in the list.
$ nvidia-smi

# Make Ubuntu use X Server for display
$ export DISPLAY=:0
```
```python
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(<your_env>)
```
Where `<your_env>` corresponds to the path to your environment executable.
You should receive a message confirming that the environment was loaded
successfully.
10. Train your models
```console

## FAQ
### The <Executable_Name>_Data folder hasn't been copied over
If you've built your Linux executable, but forget to copy over the corresponding
<Executable_Name>_Data folder, you will see an error message like the following:
```sh
Set current directory to /home/ubuntu/ml-agents/ml-agents

### Unity Environment not responding
If you didn't set up X Server, haven't launched it properly, your environment
somehow crashes, or you haven't run `chmod +x` on your Unity environment, the
connection between Unity and Python will fail. Then you will see something like
this:
```console
Logging to /home/ubuntu/.config/unity3d/<Some_Path>/Player.log

The environment and the Python interface have compatible versions.
```
It would also be really helpful to check your
/home/ubuntu/.config/unity3d/<Some_Path>/Player.log to see what happens with
your Unity environment.
### Could not launch X Server

```sh
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.
```
This means the NVIDIA driver needs to be updated. Refer to
[this section](Training-on-Amazon-Web-Service.md#update-and-setup-nvidia-driver)
for more information.

145
docs/Training-on-Microsoft-Azure.md


# Training on Microsoft Azure (works with ML-Agents toolkit v0.3)
:warning: **Note:** We no longer use this guide ourselves and so it may not work
correctly. We've decided to keep it up just in case it is helpful to you.
This page contains instructions for setting up training on Microsoft Azure
through either

## Pre-Configured Azure Virtual Machine
A pre-configured virtual machine image is available in the Azure Marketplace and
is nearly completely ready for training. You can start by deploying the
training will, by default, run on the GPU. If you choose any other type of VM,
Setting up your own instance requires a number of package installations. Please
view the documentation for doing so [here](#custom-instances).
## Installing ML-Agents

To run your training on the VM:
1. [Move](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/copy-files-to-linux-vm-using-scp)
your built Unity application to your Virtual Machine.
2. Set the directory where the ML-Agents Toolkit was installed to your working
directory.
3. Run the following command:
```sh

## Monitoring your Training Run with TensorBoard
Once you have started training, you can
[use TensorBoard to observe the training](Using-Tensorboard.md).
1. Start by
[opening the appropriate port for web traffic to connect to your VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal).
- Note that you don't need to generate a new `Network Security Group` but
instead, go to the **Networking** tab under **Settings** for your VM.
- As an example, you could use the following settings to open the Port with
the following Inbound Rule settings:
- Source: Any
- Source Port Ranges: \*
- Destination: Any
- Destination Port Ranges: 6006
- Protocol: Any
- Action: Allow
- Priority: (Leave as default)
2. Unless you started the training as a background process, connect to your VM
from another terminal instance.
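For example (a minimal sketch, assuming the default `summaries` folder), start
TensorBoard on the VM so it listens on all interfaces and serves on the port you
just opened:

```sh
# Run on the VM, from the directory where mlagents-learn writes its output
tensorboard --logdir=summaries --host 0.0.0.0 --port 6006
# Then browse to http://<VM public IP>:6006 from your own machine
```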

[Azure Container Instances](https://azure.microsoft.com/services/container-instances/)
allow you to spin up a container, on demand, that will run your training and
then be shut down. This ensures you aren't leaving a billable VM running when
it isn't needed. Using ACI enables you to offload training of your models without needing to
install Python and TensorFlow on your own computer.
## Custom Instances
This section contains instructions for setting up a custom Virtual Machine on
Microsoft Azure so you can run ML-Agents training in the cloud.
1. Start by
[deploying an Azure VM](https://docs.microsoft.com/azure/virtual-machines/linux/quick-create-portal)
with Ubuntu Linux (tests were done with 16.04 LTS). To use GPU support, use an
N-Series VM.
2. SSH into your VM.
3. Start with the following commands to install the Nvidia driver:
```sh
wget http://us.download.nvidia.com/tesla/375.66/nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
sudo dpkg -i nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
sudo apt-get update
sudo apt-get install cuda-drivers
sudo reboot
```
4. After a minute you should be able to reconnect to your VM and install the
CUDA toolkit:
```sh
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
sudo apt-get update
sudo apt-get install cuda-8-0
```
5. You'll next need to download cuDNN from the Nvidia developer site. This
requires a registered account.
6. Navigate to [http://developer.nvidia.com](http://developer.nvidia.com) and
create an account and verify it.
7. Download (to your own computer) cuDNN from
[this url](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v6/prod/8.0_20170307/Ubuntu16_04_x64/libcudnn6_6.0.20-1+cuda8.0_amd64-deb).
8. Copy the deb package to your VM:
```sh
scp libcudnn6_6.0.21-1+cuda8.0_amd64.deb <VMUserName>@<VMIPAddress>:libcudnn6_6.0.21-1+cuda8.0_amd64.deb
```
9. SSH back to your VM and execute the following:
```console
sudo dpkg -i libcudnn6_6.0.21-1+cuda8.0_amd64.deb
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
. ~/.profile
sudo reboot
```
10. After a minute, you should be able to SSH back into your VM. After doing so,
run the following:
```sh
sudo apt install python-pip
sudo apt install python3-pip
```
11. At this point, you need to install TensorFlow. The version you install
should depend on whether you are using the GPU to train:
```sh
pip3 install tensorflow-gpu==1.4.0 keras==2.0.6
```
Or, if you are using the CPU to train:
```sh
pip3 install tensorflow==1.4.0 keras==2.0.6
```
12. You'll then need to install additional dependencies:
```sh
pip3 install pillow
pip3 install numpy
```

36
docs/Using-Docker.md


# Using Docker For ML-Agents (Deprecated)
:warning: **Note:** We no longer use this guide ourselves and so it may not work
correctly. We've decided to keep it up just in case it is helpful to you.
We currently offer a solution for Windows and Mac users who would like to do
training or inference using Docker. This option may be appealing to those who

## Requirements
- [Docker](https://www.docker.com)
- Unity _Linux Build Support_ Component. Make sure to select the _Linux Build
Support_ component when installing Unity.
<p align="center">
<img src="images/unity_linux_build_support.png"

Using Docker for ML-Agents involves three steps: building the Unity environment
with specific flags, building a Docker container and, finally, running the
container. If you are not familiar with building a Unity environment for
ML-Agents, please read through our [Getting Started with the 3D Balance Ball
Example](Getting-Started.md) guide first.
### Build the Environment (Optional)

- Set the _Target Platform_ to `Linux`
- Set the _Architecture_ to `x86_64`
- If the environment does not contain visual observations, you can select the
`headless` option here.
Then click `Build`, pick an environment name (e.g. `3DBall`) and set the output
directory to `unity-volume`. After building, ensure that the file

random name if this is not set. _Note that this must be unique for every run
of a Docker image._
- `<image-name>` references the image name used when building the container.
- `<environment-name>` **(Optional)**: If you are training with a Linux
executable, this is the name of the executable. If you are training in the
Editor, do not pass a `<environment-name>` argument and press the
:arrow_forward: button in Unity when the message _"Start training by pressing

For more detail on Docker mounts, check out
[these](https://docs.docker.com/storage/bind-mounts/) docs from Docker.
**NOTE** If you are training using docker for environments that use visual
observations, you may need to increase the default memory that Docker allocates
for the container. For example, see
[here](https://docs.docker.com/docker-for-mac/#advanced) for instructions for
Docker for Mac.
You can run Tensorboard to monitor your training instance on
http://localhost:6006:
```sh
docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0

For more details on Tensorboard, check out the documentation about
[Using Tensorboard](Using-Tensorboard.md).
### Stopping Container and Saving State

docker kill --signal=SIGINT <container-name>
```
`<container-name>` is the name of the container specified in the earlier
`docker run` command. If you didn't specify one, you can find the randomly
generated identifier by running `docker container ls`.
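Putting the two commands together (a minimal sketch; `<container-name>` is
whatever name you chose when starting the container):

```sh
# Find the container name if you didn't specify one with --name
docker container ls
# Send SIGINT so the trainer saves the model before the container stops
docker kill --signal=SIGINT <container-name>
```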

82
docs/Using-Tensorboard.md


start TensorBoard:
1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=summaries --port=6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).
**Note:** The default port TensorBoard uses is 6006. If there is an existing
session running on port 6006 a new session can be launched on an open port using
the --port option.
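For instance (a minimal sketch), a second session can be started on another
port with:

```sh
tensorboard --logdir=summaries --port=6007
```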
**Note:** If you don't assign a `run-id` identifier, `mlagents-learn` uses the
default string, "ppo". All the statistics will be saved to the same sub-folder

### Environment Statistics
- `Environment/Lesson` - Plots the progress from lesson to lesson. Only
interesting when performing
[curriculum training](Training-Curriculum-Learning.md).
- `Environment/Cumulative Reward` - The mean cumulative episode reward over all
agents. Should increase during a successful training session.
- `Environment/Episode Length` - The mean length of each episode in the
environment for all agents.
- `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should
slowly decrease during a successful training process. If it decreases too
quickly, the `beta` hyperparameter should be increased.
- `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm
takes as it searches for the optimal policy. Should decrease over time.
- `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited
by the agent. Should increase during a successful training session.
- `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean
cumulative intrinsic reward generated per-episode.
- `Losses/Policy Loss` (PPO) - The mean magnitude of policy loss function.
Correlates to how much the policy (process for deciding actions) is changing.
The magnitude of this should decrease during a successful training session.
- `Losses/Value Loss` (PPO) - The mean loss of the value function update.
Correlates to how well the model is able to predict the value of each state.
This should increase while the agent is learning, and then decrease once the
reward stabilizes.
- `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the forward
model loss function. Corresponds to how well the model is able to predict the
new observation encoding.
- `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the inverse
model loss function. Corresponds to how well the model is able to predict the
action taken between two observations.
- `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning
loss. Corresponds to how well the model imitates the demonstration data.

## Custom Metrics from Unity

To get custom metrics from a C# environment into Tensorboard, you can use the
StatsSideChannel:
```csharp
var statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
statsSideChannel.AddStat("MyMetric", 1.0);

66
docs/Using-Virtual-Environment.md


# Using Virtual Environment
## What is a Virtual Environment?
A Virtual Environment is a self contained directory tree that contains a Python
installation for a particular version of Python, plus a number of additional
packages. To learn more about Virtual Environments see
[here](https://docs.python.org/3/library/venv.html).
A Virtual Environment keeps all dependencies for the Python project separate
from dependencies of other projects. This has a few advantages:
spinning up a new environment and verifying the compatibility of the code
with the different version.
This guide has been tested with Python 3.6 and 3.7. Python 3.8 is not supported
at this time.
1. Download the `get-pip.py` file using the command
`curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py`
Note (for Ubuntu users): If the
`ModuleNotFoundError: No module named 'distutils.util'` error is encountered,
then python3-distutils needs to be installed. Install python3-distutils using
`sudo apt-get install python3-distutils`
1. Create a folder where the virtual environments will reside
`$ mkdir ~/python-envs`
1. To create a new environment named `sample-env` execute
`$ python3 -m venv ~/python-envs/sample-env`
1. To activate the environment execute
`$ source ~/python-envs/sample-env/bin/activate`
1. Upgrade to the latest pip version using `$ pip3 install --upgrade pip`
1. Upgrade to the latest setuptools version using
`$ pip3 install --upgrade setuptools`
1. To deactivate the environment execute `$ deactivate` (you can reactivate the
environment using the same `activate` command listed above)
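Putting those steps together on macOS or Linux (a minimal sketch; the folder and
environment names are just the ones used above):

```sh
mkdir ~/python-envs
python3 -m venv ~/python-envs/sample-env
source ~/python-envs/sample-env/bin/activate
pip3 install --upgrade pip
pip3 install --upgrade setuptools
# ...work inside the environment...
deactivate
```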
## Ubuntu Setup

## Windows Setup
1. Create a folder where the virtual environments will reside `md python-envs`
1. To create a new environment named `sample-env` execute
`python -m venv python-envs\sample-env`
1. Upgrade to the latest pip version using `pip install --upgrade pip`
1. To deactivate the environment execute `deactivate` (you can reactivate the
environment using the same `activate` command listed above)
- Verify that you are using Python 3.6 or Python 3.7. Launch a command prompt
using `cmd` and execute `python --version` to verify the version.
- This guide is for Windows 10 using a 64-bit architecture only.

150
docs/images/demo_component.png (image changed: 505 × 94, 15 KiB)

257
docs/images/demo_inspector.png (image changed: 245 × 236, 13 KiB)

999
docs/images/docker_build_settings.png (binary image changed; diff too large to display)

980
docs/images/gridworld.png (image changed: 1064 × 725, 84 KiB)

198
docs/images/learning_environment_basic.png (image changed: 661 × 568, 13 KiB)

545
docs/images/learning_environment_example.png (binary image changed; diff too large to display)

219
docs/images/platform_prefab.png (image changed: 285 × 121, 8.8 KiB)

604
docs/images/unity_package_json.png (image changed: 1117 × 596, 128 KiB)

999
docs/images/unity_package_manager_window.png (binary image changed; diff too large to display)

349
docs/images/visual-observation-rawimage.png (image changed: 531 × 762, 59 KiB)

95
docs/images/visual-observation-rendertexture.png (image changed: 409 × 95, 14 KiB)

107
docs/images/visual-observation.png (image changed: 506 × 130, 18 KiB)

63
ml-agents-envs/mlagents_envs/environment.py


import atexit
from distutils.version import StrictVersion
import glob
import uuid
import numpy as np

import signal
import struct
logger = get_logger(__name__)

# Command line argument used to pass the port to the executable environment.
PORT_COMMAND_LINE_ARG = "--mlagents-port"
@staticmethod
def _raise_version_exception(unity_com_ver: str) -> None:
raise UnityEnvironmentException(
f"The communication API version is not compatible between Unity and python. "
f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_com_ver}.\n "
f"Please go to https://github.com/Unity-Technologies/ml-agents/releases/tag/latest_release "
f"to download the latest version of ML-Agents."
)
@staticmethod
def check_communication_compatibility(
unity_com_ver: str, python_api_version: str, unity_package_version: str
) -> bool:
unity_communicator_version = StrictVersion(unity_com_ver)
api_version = StrictVersion(python_api_version)
if unity_communicator_version.version[0] == 0:
if (
unity_communicator_version.version[0] != api_version.version[0]
or unity_communicator_version.version[1] != api_version.version[1]
):
# Minor beta versions differ.
return False
elif unity_communicator_version.version[0] != api_version.version[0]:
# Major versions mismatch.
return False
elif unity_communicator_version.version[1] != api_version.version[1]:
# Non-beta minor versions mismatch. Log a warning but allow execution to continue.
logger.warning(
f"WARNING: The communication API versions between Unity and python differ at the minor version level. "
f"Python API: {python_api_version}, Unity API: {unity_communicator_version}.\n"
f"This means that some features may not work unless you upgrade the package with the lower version."
f"Please find the versions that work best together from our release page.\n"
"https://github.com/Unity-Technologies/ml-agents/releases"
)
else:
logger.info(
f"Connected to Unity environment with package version {unity_package_version} "
f"and communication version {unity_com_ver}"
)
return True
def __init__(
self,
file_name: Optional[str] = None,

self._close(0)
raise
unity_communicator_version = aca_params.communication_version
if unity_communicator_version != UnityEnvironment.API_VERSION:
if not UnityEnvironment.check_communication_compatibility(
aca_params.communication_version,
UnityEnvironment.API_VERSION,
aca_params.package_version,
):
raise UnityEnvironmentException(
f"The communication API version is not compatible between Unity and python. "
f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_communicator_version}.\n "
f"Please go to https://github.com/Unity-Technologies/ml-agents/releases/tag/latest_release "
f"to download the latest version of ML-Agents."
)
else:
logger.info(
f"Connected to Unity environment with package version {aca_params.package_version} "
f"and communication version {aca_params.communication_version}"
)
UnityEnvironment._raise_version_exception(aca_params.communication_version)
self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._env_actions: Dict[str, np.ndarray] = {}

31
ml-agents-envs/mlagents_envs/tests/test_envs.py


assert comm.has_been_closed
def test_check_communication_compatibility():
unity_ver = "1.0.0"
python_ver = "1.0.0"
unity_package_version = "0.15.0"
assert UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
unity_ver = "1.1.0"
assert UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
unity_ver = "2.0.0"
assert not UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
unity_ver = "0.16.0"
python_ver = "0.16.0"
assert UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
unity_ver = "0.17.0"
assert not UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
unity_ver = "1.16.0"
assert not UnityEnvironment.check_communication_compatibility(
unity_ver, python_ver, unity_package_version
)
def test_returncode_to_signal_name():
assert UnityEnvironment.returncode_to_signal_name(-2) == "SIGINT"
assert UnityEnvironment.returncode_to_signal_name(42) is None

95
ml-agents/mlagents/trainers/learn.py


)
argparser.add_argument("trainer_config_path")
argparser.add_argument(
"--env", default=None, dest="env_path", help="Name of the Unity executable "
"--env",
default=None,
dest="env_path",
help="Path to the Unity executable to train",
help="Curriculum config yaml file for environment",
help="YAML file for defining the lessons for curriculum training",
)
argparser.add_argument(
"--lesson",
default=0,
type=int,
help="The lesson to start with when performing curriculum training",
help="Reset parameter yaml file for environment",
help="YAML file for defining the sampler for environment parameter randomization",
help="How many model checkpoints to keep",
)
argparser.add_argument(
"--lesson", default=0, type=int, help="Start learning from this lesson"
help="The maximum number of model checkpoints to keep. Checkpoints are saved after the"
"number of steps specified by the save-freq option. Once the maximum number of checkpoints"
"has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
)
argparser.add_argument(
"--load",

default=False,
dest="resume",
action="store_true",
help="Resumes training from a checkpoint. Specify a --run-id to use this option.",
help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
"If set, the training code loads an already trained model to initialize the neural network "
"before resuming training. This option is only valid when the models exist, and have the same "
"behavior names as the current agents in your scene.",
)
argparser.add_argument(
"--force",

help="Force-overwrite existing models and summaries for a run ID that has been used "
"before.",
help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
"this flag, attempting to train a model with a run-id that has been used before will throw "
"an error.",
help="The run identifier for model and summary statistics.",
help="The identifier for the training run. This identifier is used to name the "
"subdirectories in which the trained model and summary statistics are saved as well "
"as the saved model itself. If you use TensorBoard to view the training statistics, "
"always set a unique run-id for each training run. (The statistics for all runs with the "
"same id are combined as if they were produced by a the same session.)",
)
argparser.add_argument(
"--initialize-from",

"This can be used, for instance, to fine-tune an existing model on a new environment. ",
"This can be used, for instance, to fine-tune an existing model on a new environment. "
"Note that the previously saved models must have the same behavior parameters as your "
"current environment.",
"--save-freq", default=50000, type=int, help="Frequency at which to save model"
"--save-freq",
default=50000,
type=int,
help="How often (in steps) to save the model during training",
"--seed", default=-1, type=int, help="Random seed used for training"
"--seed",
default=-1,
type=int,
help="A number to use as a seed for the random number generator used by the training code",
)
argparser.add_argument(
"--train",

default=False,
dest="inference",
action="store_true",
help="Run in Python inference mode (don't train). Use with --resume to load a model trained with an "
"existing run ID.",
help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
"a model trained with an existing run ID.",
help="Base port for environment communication",
help="The starting port for environment communication. Each concurrent Unity environment "
"instance will get assigned a port sequentially, starting from the base-port. Each instance "
"will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
"each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
"than an executable, the base port will be ignored.",
help="Number of parallel environments to use for training",
help="The number of concurrent Unity environment instances to collect experiences "
"from when training",
help="Whether to run the environment in no-graphics mode",
help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
"the graphics driver. Use this only if your agents don't use visual observations.",
help="Whether to run ML-Agents in debug mode with detailed logging",
help="Whether to enable debug-level logging for some parts of the code",
help="Arguments passed to the Unity executable.",
help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
"process these as Unity Command Line Arguments. You should choose different argument names if "
"you want to create environment-specific arguments. All arguments after this flag will be "
"passed to the executable.",
"--cpu", default=False, action="store_true", help="Run with CPU only"
"--cpu",
default=False,
action="store_true",
help="Forces training using CPU only",
)
argparser.add_argument("--version", action="version", version="")

"--width",
default=84,
type=int,
help="The width of the executable window of the environment(s)",
help="The width of the executable window of the environment(s) in pixels "
"(ignored for editor training).",
help="The height of the executable window of the environment(s)",
help="The height of the executable window of the environment(s) in pixels "
"(ignored for editor training)",
help="The quality level of the environment(s)",
help="The quality level of the environment(s). Equivalent to calling "
"QualitySettings.SetQualityLevel in Unity.",
help="The time scale of the Unity environment(s)",
help="The time scale of the Unity environment(s). Equivalent to setting "
"Time.timeScale in Unity.",
help="The target frame rate of the Unity environment(s)",
help="The target frame rate of the Unity environment(s). Equivalent to setting "
"Application.targetFrameRate in Unity.",
)
return argparser
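
To make the new flag descriptions concrete, the following is a small self-contained sketch (an editor's illustration, not code from this changeset). It mirrors the --run-id/--force semantics and the port-assignment rule from the --base-port help text; the defaults and the "3DBall-01" run ID below are placeholders.

import argparse

# Editor's sketch of a reduced parser; defaults here are illustrative only.
parser = argparse.ArgumentParser()
parser.add_argument("--run-id", default="ppo",
                    help="Identifier used to name the model/summary subdirectories.")
parser.add_argument("--force", default=False, action="store_true",
                    help="Overwrite existing data for a previously used run-id.")
parser.add_argument("--base-port", default=5005, type=int)
parser.add_argument("--num-envs", default=1, type=int)
args = parser.parse_args(["--run-id", "3DBall-01", "--force", "--num-envs", "4"])

# Port-assignment rule from the --base-port help text: instance worker_id
# (0 .. num_envs - 1) communicates on base_port + worker_id.
ports = [args.base_port + worker_id for worker_id in range(args.num_envs)]
print(args.run_id, args.force, ports)  # 3DBall-01 True [5005, 5006, 5007, 5008]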

22
ml-agents/tests/yamato/scripts/run_gym.py


from gym_unity.envs import UnityEnv
def main(env_name):
def test_run_environment(env_name):
"""
Run the gym test using the specified environment
:param env_name: Name of the Unity environment binary to launch

env.close()
def test_closing(env_name):
"""
Run the gym test and close the environment multiple times
:param env_name: Name of the Unity environment binary to launch
"""
try:
env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
env1.close()
env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
env2 = UnityEnv(env_name, worker_id=2, use_visual=False, no_graphics=True)
env2.reset()
finally:
env1.close()
env2.close()
main(args.env)
test_run_environment(args.env)
test_closing(args.env)
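
For context on the gym wrapper these tests exercise, here is a minimal sketch of the create/reset/step/close cycle that test_run_environment drives; the binary name "3DBall" and the episode count are placeholders, while the constructor keywords are the same ones used in the test above.

from gym_unity.envs import UnityEnv

# Editor's sketch of the gym-style smoke test; a random policy is enough here.
env = UnityEnv("3DBall", worker_id=1, use_visual=False, no_graphics=True)
try:
    for _ in range(3):  # a few short episodes
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
finally:
    env.close()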

29
ml-agents/tests/yamato/scripts/run_llapi.py


)
def main(env_name):
def test_run_environment(env_name):
"""
Run the low-level API test using the specified environment
:param env_name: Name of the Unity environment binary to launch

else:
# Should never happen
action = None
if tracked_agent == -1 and len(decision_steps) > 1:
if tracked_agent == -1 and len(decision_steps) >= 1:
tracked_agent = decision_steps.agent_id[0]
env.set_actions(group_name, action)
env.step()

env.close()
def test_closing(env_name):
"""
Run the low-level API and close the environment
:param env_name: Name of the Unity environment binary to launch
"""
try:
env1 = UnityEnvironment(
file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
)
env1.close()
env1 = UnityEnvironment(
file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
)
env2 = UnityEnvironment(
file_name=env_name, base_port=5007, no_graphics=True, args=["-logFile", "-"]
)
env2.reset()
finally:
env1.close()
env2.close()
main(args.env)
test_run_environment(args.env)
test_closing(args.env)
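
The low-level API pattern shown above (reset, read decision steps, set actions, step) can be condensed into a short sketch. Method names such as get_behavior_names, get_behavior_spec and get_steps follow the mlagents_envs API of roughly this release and may differ in other versions; the binary name, port and step count are placeholders, and a continuous action space is assumed.

import numpy as np
from mlagents_envs.environment import UnityEnvironment

# Editor's sketch of the stepping loop driven by test_run_environment above.
env = UnityEnvironment(file_name="3DBall", base_port=5006, no_graphics=True)
try:
    env.reset()
    group_name = env.get_behavior_names()[0]      # first registered behavior
    spec = env.get_behavior_spec(group_name)
    for _ in range(10):
        decision_steps, terminal_steps = env.get_steps(group_name)
        # One random action per agent that requested a decision this step
        # (assumes a continuous action space of size spec.action_size).
        action = np.random.randn(len(decision_steps), spec.action_size).astype(np.float32)
        env.set_actions(group_name, action)
        env.step()
finally:
    env.close()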

24
utils/validate_versions.py


return True
def set_version(new_version: str) -> None:
new_contents = f'{VERSION_LINE_START}"{new_version}"\n'
def set_version(python_version: str, csharp_version: str) -> None:
new_contents = f'{VERSION_LINE_START}"{python_version}"\n'
print(f"Setting {path} to version {new_version}")
print(f"Setting {path} to version {python_version}")
# Package version is a bit stricter - only set it if we're not a "dev" version.
if "dev" not in new_version:
package_version = new_version + "-preview"
if csharp_version is not None:
package_version = csharp_version + "-preview"
print(
f"Setting package version to {package_version} in {UNITY_PACKAGE_JSON_PATH}"
)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--new-version", default=None)
parser.add_argument("--python-version", default=None)
parser.add_argument("--csharp-version", default=None)
if args.new_version:
print(f"Updating to verison {args.new_version}")
set_version(args.new_version)
if args.python_version:
print(f"Updating python library to version {args.python_version}")
if args.csharp_version:
print(f"Updating C# package to version {args.csharp_version}")
set_version(args.python_version, args.csharp_version)
else:
ok = check_versions()
return_code = 0 if ok else 1
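
The shape of the new two-version flow can be summarized as follows; this is an editor's condensation of the change, not the script itself (set_versions and the version strings are hypothetical).

# Editor's sketch: Python and C# versions are now passed separately, and the
# "-preview" suffix is applied only when a C# package version is given.
def set_versions(python_version, csharp_version):
    if python_version:
        print(f"Setting Python packages to version {python_version}")
    if csharp_version is not None:
        print(f"Setting com.unity.ml-agents to version {csharp_version}-preview")

set_versions("0.16.0", "1.0.0")
# Setting Python packages to version 0.16.0
# Setting com.unity.ml-agents to version 1.0.0-preview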

22
com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs


using System;
using UnityEngine;
using MLAgents.Policies;
using UnityEngine.Serialization;
namespace MLAgents.Demonstrations
{
/// <summary>
/// Demonstration meta-data.
/// Kept in a struct for easy serialization and deserialization.
/// </summary>
[Serializable]
internal class DemonstrationMetaData
{
[FormerlySerializedAs("numberExperiences")]
public int numberSteps;
public int numberEpisodes;
public float meanReward;
public string demonstrationName;
public const int ApiVersion = 1;
}
}

Some files were not shown because too many files changed in this diff
