
Release v0.5 (#1202)

GitHub, 6 years ago
Current commit
0c417c55
316 files changed, including 5,942 insertions and 2,613 deletions
  1. 58  .gitignore
  2. 5  CODE_OF_CONDUCT.md
  3. 60  CONTRIBUTING.md
  4. 11  Dockerfile
  5. 201  LICENSE
  6. 106  README.md
  7. 29  docs/API-Reference.md
  8. 15  docs/Background-Jupyter.md
  9. 301  docs/Background-Machine-Learning.md
  10. 74  docs/Background-TensorFlow.md
  11. 12  docs/Background-Unity.md
  12. 242  docs/Basic-Guide.md
  13. 136  docs/FAQ.md
  14. 57  docs/Feature-Memory.md
  15. 50  docs/Feature-Monitor.md
  16. 393  docs/Getting-Started-with-Balance-Ball.md
  17. 68  docs/Glossary.md
  18. 251  docs/Installation-Windows.md
  19. 90  docs/Installation.md
  20. 64  docs/Learning-Environment-Best-Practices.md
  21. 363  docs/Learning-Environment-Create-New.md
  22. 55  docs/Learning-Environment-Design-Academy.md
  23. 469  docs/Learning-Environment-Design-Agents.md
  24. 106  docs/Learning-Environment-Design-Brains.md
  25. 118  docs/Learning-Environment-Design-External-Internal-Brains.md
  26. 34  docs/Learning-Environment-Design-Heuristic-Brains.md
  27. 47  docs/Learning-Environment-Design-Player-Brains.md
  28. 203  docs/Learning-Environment-Design.md
  29. 381  docs/Learning-Environment-Examples.md
  30. 219  docs/Learning-Environment-Executable.md
  31. 27  docs/Limitations.md
  32. 703  docs/ML-Agents-Overview.md
  33. 135  docs/Migrating.md
  34. 160  docs/Python-API.md
  35. 80  docs/Readme.md
  36. 143  docs/Training-Curriculum-Learning.md
  37. 78  docs/Training-Imitation-Learning.md
  38. 228  docs/Training-ML-Agents.md
  39. 218  docs/Training-PPO.md
  40. 118  docs/Training-on-Amazon-Web-Service.md
  41. 112  docs/Training-on-Microsoft-Azure-Custom-Instance.md
  42. 107  docs/Training-on-Microsoft-Azure.md
  43. 117  docs/Using-Docker.md
  44. 179  docs/Using-TensorFlow-Sharp-in-Unity.md
  45. 79  docs/Using-Tensorboard.md
  46. 8  docs/dox-ml-agents.conf
  47. 611  docs/images/banner.png
  48. 129  docs/images/player_brain.png
  49. 79  docs/images/scene-hierarchy.png
  50. 309  docs/images/unity-logo-rgb.png
  51. 5  docs/localized/zh-CN/README.md
  52. 26  docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md
  53. 4  docs/localized/zh-CN/docs/Installation.md
  54. 4  docs/localized/zh-CN/docs/Learning-Environment-Create-New.md
  55. 14  docs/localized/zh-CN/docs/Learning-Environment-Design.md
  56. 42  docs/localized/zh-CN/docs/Learning-Environment-Examples.md
  57. 2  docs/localized/zh-CN/docs/ML-Agents-Overview.md
  58. 2  config/curricula/push-block/PushBlockBrain.json
  59. 2  config/curricula/test/TestBrain.json
  60. 2  ml-agents/requirements.txt
  61. 15  ml-agents/mlagents/trainers/buffer.py
  62. 1  ml-agents/mlagents/envs/__init__.py
  63. 4  ml-agents/mlagents/envs/exception.py
  64. 7  ml-agents/mlagents/envs/communicator.py
  65. 4  ml-agents/mlagents/envs/socket_communicator.py
  66. 6  ml-agents/mlagents/envs/rpc_communicator.py
  67. 1  UnitySDK/ProjectSettings/EditorBuildSettings.asset
  68. 57  UnitySDK/ProjectSettings/ProjectSettings.asset
  69. 4  UnitySDK/ProjectSettings/UnityConnectSettings.asset
  70. 2  UnitySDK/ProjectSettings/ProjectVersion.txt
  71. 12  UnitySDK/Assets/ML-Agents/Scripts/CoreBrainInternal.cs.meta
  72. 28  UnitySDK/Assets/ML-Agents/Scripts/Academy.cs
  73. 17  UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs
  74. 24  UnitySDK/Assets/ML-Agents/Scripts/Brain.cs
  75. 2  UnitySDK/Assets/ML-Agents/Scripts/CoreBrain.cs
  76. 9  UnitySDK/Assets/ML-Agents/Scripts/CoreBrainHeuristic.cs
  77. 17  UnitySDK/Assets/ML-Agents/Scripts/CoreBrainPlayer.cs
  78. 43  UnitySDK/Assets/ML-Agents/Scripts/Monitor.cs
  79. 20  UnitySDK/Assets/ML-Agents/Scripts/RpcCommunicator.cs
  80. 22  UnitySDK/Assets/ML-Agents/Scripts/SocketCommunicator.cs
  81. 2  UnitySDK/Assets/ML-Agents/Scripts/UnityAgentsException.cs
  82. 165  UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
  83. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/AgentActionProto.cs.meta
  84. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/AgentInfoProto.cs.meta
  85. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/BrainParametersProto.cs.meta
  86. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/BrainTypeProto.cs.meta
  87. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/CommandProto.cs.meta
  88. 19  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EngineConfigurationProto.cs
  89. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EngineConfigurationProto.cs.meta
  90. 21  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EnvironmentParametersProto.cs
  91. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EnvironmentParametersProto.cs.meta
  92. 14  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/Header.cs
  93. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/Header.cs.meta
  94. 15  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/ResolutionProto.cs
  95. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/ResolutionProto.cs.meta
  96. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/SpaceTypeProto.cs.meta
  97. 27  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityInput.cs
  98. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityInput.cs.meta
  99. 32  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityMessage.cs
  100. 2  UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityMessage.cs.meta

58
.gitignore


/unity-environment/[Ll]ibrary/
/unity-environment/[Tt]emp/
/unity-environment/[Oo]bj/
/unity-environment/[Bb]uild/
/unity-environment/[Bb]uilds/
/unity-environment/[Pp]ackages/
/unity-environment/[Uu]nity[Pp]ackage[Mm]anager/
/unity-environment/Assets/AssetStoreTools*
/unity-environment/Assets/Plugins*
/unity-environment/Assets/Gizmos*
/UnitySDK/[Ll]ibrary/
/UnitySDK/[Tt]emp/
/UnitySDK/[Oo]bj/
/UnitySDK/[Bb]uild/
/UnitySDK/[Bb]uilds/
/UnitySDK/[Pp]ackages/
/UnitySDK/[Uu]nity[Pp]ackage[Mm]anager/
/UnitySDK/Assets/AssetStoreTools*
/UnitySDK/Assets/Plugins*
/UnitySDK/Assets/Gizmos*
python/models
python/summaries
# Training environments
/envs
*unity-environment.log
*UnitySDK.log
/unity-environment/.vs/
/UnitySDK/.vs/
/unity-environmentExportedObj/
/unity-environment.consulo/
/UnitySDKExportedObj/
/UnitySDK.consulo/
*.csproj
*.unityproj
*.sln

*.pidb.meta
# Unity3D Generated File On Crash Reports
/unity-environment/sysinfo.txt
/UnitySDK/sysinfo.txt
# Builds
*.apk

*.x86
# Tensorflow Sharp Files
/unity-environment/Assets/ML-Agents/Plugins/Android*
/unity-environment/Assets/ML-Agents/Plugins/iOS*
/unity-environment/Assets/ML-Agents/Plugins/Computer*
/unity-environment/Assets/ML-Agents/Plugins/System*
/UnitySDK/Assets/ML-Agents/Plugins/Android*
/UnitySDK/Assets/ML-Agents/Plugins/iOS*
/UnitySDK/Assets/ML-Agents/Plugins/Computer*
/UnitySDK/Assets/ML-Agents/Plugins/System*
# Generated doc folders
/docs/html

*.eggs*
*.gitignore.swp
# VSCode hidden files
*.vscode/
.ipynb_checkpoints
# pytest cache
*.pytest_cache/
# Ignore compiled protobuf files.
ml-agents-protobuf/cs
ml-agents-protobuf/python
ml-agents-protobuf/Grpc*
# Ignore PyPi build files.
dist/
build/

5
CODE_OF_CONDUCT.md


## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct/
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 1.4, available at
https://www.contributor-covenant.org/version/1/4/code-of-conduct/
[homepage]: https://www.contributor-covenant.org

60
CONTRIBUTING.md


# Contribution Guidelines
Thank you for your interest in contributing to the ML-Agents toolkit! We are incredibly
excited to see how members of our community will use and extend the ML-Agents toolkit.
To facilitate your contributions, we've outlined a brief set of guidelines
to ensure that your extensions can be easily integrated.
Thank you for your interest in contributing to the ML-Agents toolkit! We are
incredibly excited to see how members of our community will use and extend the
ML-Agents toolkit. To facilitate your contributions, we've outlined a brief set
of guidelines to ensure that your extensions can be easily integrated.
### Communication
## Communication
First, please read through our [code of conduct](CODE_OF_CONDUCT.md),
as we expect all our contributors to follow it.
First, please read through our [code of conduct](CODE_OF_CONDUCT.md), as we
expect all our contributors to follow it.
Second, before starting on a project that you intend to contribute
to the ML-Agents toolkit (whether environments or modifications to the codebase),
we **strongly** recommend posting on our
[Issues page](https://github.com/Unity-Technologies/ml-agents/issues) and
briefly outlining the changes you plan to make. This will enable us to provide
some context that may be helpful for you. This could range from advice and
feedback on how to optimally perform your changes or reasons for not doing it.
Second, before starting on a project that you intend to contribute to the
ML-Agents toolkit (whether environments or modifications to the codebase), we
**strongly** recommend posting on our
[Issues page](https://github.com/Unity-Technologies/ml-agents/issues)
and briefly outlining the changes you plan to make. This will enable us to
provide some context that may be helpful for you. This could range from advice
and feedback on how to optimally perform your changes or reasons for not doing
it.
### Git Branches
## Git Branches
Starting with v0.3, we adopted the
Consequently, the `master` branch corresponds to the latest release of
* Corresponding changes to documentation, unit tests and sample environments
(if applicable)
* Corresponding changes to documentation, unit tests and sample environments (if
applicable)
### Environments
## Environments
We are also actively open to adding community contributed environments as
examples, as long as they are small, simple, demonstrate a unique feature of
the platform, and provide a unique non-trivial challenge to modern
PR explaining the nature of the environment and task.
### Style Guide
## Style Guide
When performing changes to the codebase, ensure that you follow the style
guide of the file you're modifying. For Python, we follow
[PEP 8](https://www.python.org/dev/peps/pep-0008/). For C#, we will soon be
adding a formal style guide for our repository.
When performing changes to the codebase, ensure that you follow the style guide
of the file you're modifying. For Python, we follow
[PEP 8](https://www.python.org/dev/peps/pep-0008/).
For C#, we will soon be adding a formal style guide for our repository.

11
Dockerfile


# xvfb is used to do CPU based rendering of Unity
RUN apt-get install -y xvfb
ADD python/requirements.txt .
RUN pip install --trusted-host pypi.python.org -r requirements.txt
WORKDIR /execute
COPY python /execute/python
COPY ml-agents /ml-agents
WORKDIR /ml-agents
RUN pip install .
ENTRYPOINT ["python", "python/learn.py"]
ENTRYPOINT ["mlagents-learn"]
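The updated Dockerfile copies in the `ml-agents` package, installs it with pip, and makes `mlagents-learn` the container entrypoint. A rough sketch of building and running such an image follows; the image tag, mount path, config file, and flags are illustrative assumptions, and `docs/Using-Docker.md` in this release is the authoritative reference.

```sh
# Build the image from the repository root (the tag is arbitrary).
docker build -t ml-agents-docker .

# Everything after the image name is passed to the mlagents-learn entrypoint.
# The mount, config path, and flags shown here are assumptions; see docs/Using-Docker.md.
docker run --rm \
  --mount type=bind,source="$(pwd)/unity-volume",target=/unity-volume \
  ml-agents-docker \
  trainer_config.yaml --docker-target-name=unity-volume --run-id=first-run --train
```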

201
LICENSE


Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2017 Unity Technologies
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

106
README.md


<img src="docs/images/unity-wide.png" align="middle" width="3000"/>
<img src="docs/images/image-banner.png" align="middle" width="3000"/>
**The Unity Machine Learning Agents Toolkit** (ML-Agents) is an open-source Unity plugin
that enables games and simulations to serve as environments for training
intelligent agents. Agents can be trained using reinforcement learning,
imitation learning, neuroevolution, or other machine learning methods through
a simple-to-use Python API. We also provide implementations (based on
TensorFlow) of state-of-the-art algorithms to enable game developers
and hobbyists to easily train intelligent agents for 2D, 3D and VR/AR games.
These trained agents can be used for multiple purposes, including
controlling NPC behavior (in a variety of settings such as multi-agent and
adversarial), automated testing of game builds and evaluating different game
design decisions pre-release. The ML-Agents toolkit is mutually beneficial for both game
developers and AI researchers as it provides a central platform where advances
in AI can be evaluated on Unity’s rich environments and then made accessible
to the wider research and game developer communities.
**The Unity Machine Learning Agents Toolkit** (ML-Agents) is an open-source
Unity plugin that enables games and simulations to serve as environments for
training intelligent agents. Agents can be trained using reinforcement learning,
imitation learning, neuroevolution, or other machine learning methods through a
simple-to-use Python API. We also provide implementations (based on TensorFlow)
of state-of-the-art algorithms to enable game developers and hobbyists to easily
train intelligent agents for 2D, 3D and VR/AR games. These trained agents can be
used for multiple purposes, including controlling NPC behavior (in a variety of
settings such as multi-agent and adversarial), automated testing of game builds
and evaluating different game design decisions pre-release. The ML-Agents
toolkit is mutually beneficial for both game developers and AI researchers as it
provides a central platform where advances in AI can be evaluated on Unity’s
rich environments and then made accessible to the wider research and game
developer communities.
* Train memory-enhanced Agents using deep reinforcement learning
* Train memory-enhanced agents using deep reinforcement learning
* Broadcasting of Agent behavior for supervised learning
* Broadcasting of agent behavior for supervised learning
* Flexible Agent control with On Demand Decision Making
* Flexible agent control with On Demand Decision Making
* Wrap learning environments as a gym
* For more information, in addition to installation and usage
instructions, see our [documentation home](docs/Readme.md).
* If you have
used a version of the ML-Agents toolkit prior to v0.4, we strongly recommend
our [guide on migrating from earlier versions](docs/Migrating.md).
* For more information, in addition to installation and usage instructions, see
our [documentation home](docs/Readme.md).
* If you are a researcher interested in a discussion of Unity as an AI platform, see a pre-print of our [reference paper on Unity and the ML-Agents Toolkit](https://arxiv.org/abs/1809.02627). Also, see below for instructions on citing this paper.
* If you have used a version of the ML-Agents toolkit prior to v0.5, we strongly
recommend our [guide on migrating from earlier versions](docs/Migrating.md).
## References
## Additional Resources
- Overviewing reinforcement learning concepts
([multi-armed bandit](https://blogs.unity3d.com/2017/06/26/unity-ai-themed-blog-entries/)
and [Q-learning](https://blogs.unity3d.com/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/))
- [Using Machine Learning Agents in a real game: a beginner’s guide](https://blogs.unity3d.com/2017/12/11/using-machine-learning-agents-in-a-real-game-a-beginners-guide/)
- [Post](https://blogs.unity3d.com/2018/02/28/introducing-the-winners-of-the-first-ml-agents-challenge/) announcing the winners of our
[first ML-Agents Challenge](https://connect.unity.com/challenges/ml-agents-1)
- [Post](https://blogs.unity3d.com/2018/01/23/designing-safer-cities-through-simulations/)
overviewing how Unity can be leveraged as a simulator to design safer cities.
* Overviewing reinforcement learning concepts
([multi-armed bandit](https://blogs.unity3d.com/2017/06/26/unity-ai-themed-blog-entries/)
and
[Q-learning](https://blogs.unity3d.com/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/))
* [Using Machine Learning Agents in a real game: a beginner’s guide](https://blogs.unity3d.com/2017/12/11/using-machine-learning-agents-in-a-real-game-a-beginners-guide/)
* [Post](https://blogs.unity3d.com/2018/02/28/introducing-the-winners-of-the-first-ml-agents-challenge/)
announcing the winners of our
[first ML-Agents Challenge](https://connect.unity.com/challenges/ml-agents-1)
* [Post](https://blogs.unity3d.com/2018/01/23/designing-safer-cities-through-simulations/)
overviewing how Unity can be leveraged as a simulator to design safer cities.
- [Unity AI - Unity 3D Artificial Intelligence](https://www.youtube.com/watch?v=bqsfkGbBU6k)
- [A Game Developer Learns Machine Learning](https://mikecann.co.uk/machine-learning/a-game-developer-learns-machine-learning-intent/)
- [Explore Unity Technologies ML-Agents Exclusively on Intel Architecture](https://software.intel.com/en-us/articles/explore-unity-technologies-ml-agents-exclusively-on-intel-architecture)
* [Unity AI - Unity 3D Artificial Intelligence](https://www.youtube.com/watch?v=bqsfkGbBU6k)
* [A Game Developer Learns Machine Learning](https://mikecann.co.uk/machine-learning/a-game-developer-learns-machine-learning-intent/)
* [Explore Unity Technologies ML-Agents Exclusively on Intel Architecture](https://software.intel.com/en-us/articles/explore-unity-technologies-ml-agents-exclusively-on-intel-architecture)
The ML-Agents toolkit is an open-source project and we encourage and welcome contributions.
If you wish to contribute, be sure to review our
[contribution guidelines](CONTRIBUTING.md) and
The ML-Agents toolkit is an open-source project and we encourage and welcome
contributions. If you wish to contribute, be sure to review our
[contribution guidelines](CONTRIBUTING.md) and
[Unity Machine Learning Channel](https://connect.unity.com/messages/c/035fba4f88400000)
to connect with others using the ML-Agents toolkit and Unity developers enthusiastic
about machine learning. We use that channel to surface updates
regarding the ML-Agents toolkit (and, more broadly, machine learning in games).
* If you run into any problems using the ML-Agents toolkit,
[submit an issue](https://github.com/Unity-Technologies/ml-agents/issues) and
make sure to include as much detail as possible.
[Unity Machine Learning Channel](https://connect.unity.com/messages/c/035fba4f88400000)
to connect with others using the ML-Agents toolkit and Unity developers
enthusiastic about machine learning. We use that channel to surface updates
regarding the ML-Agents toolkit (and, more broadly, machine learning in
games).
* If you run into any problems using the ML-Agents toolkit,
[submit an issue](https://github.com/Unity-Technologies/ml-agents/issues) and
make sure to include as much detail as possible.
For any other questions or feedback, connect directly with the ML-Agents
team at ml-agents@unity3d.com.

translating more pages and to other languages. Consequently,
we welcome any enhancements and improvements from the community.
- [Chinese](docs/localized/zh-CN/)
* [Chinese](docs/localized/zh-CN/)
## Citation
If you use Unity or the ML-Agents Toolkit to conduct research, we ask that you cite the following paper as a reference:
Juliani, A., Berges, V., Vckay, E., Gao, Y., Henry, H., Mattar, M., Lange, D. (2018). Unity: A General Platform for Intelligent Agents. *arXiv preprint arXiv:1809.02627.* https://github.com/Unity-Technologies/ml-agents.

29
docs/API-Reference.md


# API Reference
Our developer-facing C# classes (Academy, Agent, Decision and
Monitor) have been documented to be compatabile with
[Doxygen](http://www.stack.nl/~dimitri/doxygen/) for auto-generating HTML
Our developer-facing C# classes (Academy, Agent, Decision and Monitor) have been
documented to be compatible with
[Doxygen](http://www.stack.nl/~dimitri/doxygen/) for auto-generating HTML
To generate the API reference,
[download Doxygen](http://www.stack.nl/~dimitri/doxygen/download.html) and run
the following command within the `docs/` directory:
To generate the API reference,
[download Doxygen](http://www.stack.nl/~dimitri/doxygen/download.html)
and run the following command within the `docs/` directory:
doxygen dox-ml-agents.conf
```sh
doxygen dox-ml-agents.conf
```
that includes the classes that have been properly formatted.
The generated HTML files will be placed
in the `html/` subdirectory. Open `index.html` within that subdirectory to
navigate to the API reference home. Note that `html/` is already included in
the repository's `.gitignore` file.
that includes the classes that have been properly formatted. The generated HTML
files will be placed in the `html/` subdirectory. Open `index.html` within that
subdirectory to navigate to the API reference home. Note that `html/` is already
included in the repository's `.gitignore` file.
In the near future, we aim to expand our documentation
to include all the Unity C# classes and Python API.
In the near future, we aim to expand our documentation to include all the Unity
C# classes and Python API.

15
docs/Background-Jupyter.md


# Background: Jupyter
[Jupyter](https://jupyter.org) is a fantastic tool for writing code with
embedded visualizations. We provide one such notebook, `python/Basics.ipynb`,
for testing the Python control interface to a Unity build. This notebook is
introduced in the
[Jupyter](https://jupyter.org) is a fantastic tool for writing code with
embedded visualizations. We provide one such notebook,
`notebooks/getting-started.ipynb`, for testing the Python control interface to a
Unity build. This notebook is introduced in the
in the _Jupyter/IPython Quick Start Guide_. To launch Jupyter, run in the command line:
in the _Jupyter/IPython Quick Start Guide_. To launch Jupyter, run in the
command line:
`jupyter notebook`
```sh
jupyter notebook
```
Then navigate to `localhost:8888` to access your notebooks.
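For orientation, the control interface exercised by that notebook looks roughly like the sketch below. It assumes the v0.5 package layout (`mlagents.envs`) and a build named `3DBall`; the exact attribute names and signatures should be checked against `docs/Python-API.md`.

```python
# Minimal sketch of driving a Unity build from Python (v0.5-style API; names are assumptions).
from mlagents.envs import UnityEnvironment

# file_name is the path to a Unity build; passing None attaches to a running Editor instance.
env = UnityEnvironment(file_name="3DBall")
print(env.brain_names)                        # External Brains exposed by the environment

# reset() returns a dict mapping each brain name to its BrainInfo.
info = env.reset(train_mode=True)
brain = env.brain_names[0]
print(info[brain].vector_observations.shape)  # (number of agents, observation size)

env.close()
```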

301
docs/Background-Machine-Learning.md


# Background: Machine Learning
Given that a number of users of the ML-Agents toolkit might not have a formal machine
learning background, this page provides an overview to facilitate the
understanding of the ML-Agents toolkit. However, we will not attempt to provide a thorough
treatment of machine learning as there are fantastic resources online.
Given that a number of users of the ML-Agents toolkit might not have a formal
machine learning background, this page provides an overview to facilitate the
understanding of the ML-Agents toolkit. However, we will not attempt to provide
a thorough treatment of machine learning as there are fantastic resources
online.
Machine learning, a branch of artificial intelligence, focuses on learning
include: unsupervised learning, supervised learning and reinforcement learning.
Each class of algorithm learns from a different type of data. The following
paragraphs provide an overview for each of these classes of machine learning,
as well as introductory examples.
include: unsupervised learning, supervised learning and reinforcement learning.
Each class of algorithm learns from a different type of data. The following
paragraphs provide an overview for each of these classes of machine learning, as
well as introductory examples.
The goal of
[unsupervised learning](https://en.wikipedia.org/wiki/Unsupervised_learning) is to group or cluster similar items in a
data set. For example, consider the players of a game. We may want to group
the players depending on how engaged they are with the game. This would enable
us to target different groups (e.g. for highly-engaged players we might
invite them to be beta testers for new features, while for unengaged players
we might email them helpful tutorials). Say that we wish to split our players
into two groups. We would first define basic attributes of the players, such
as the number of hours played, total money spent on in-app purchases and
number of levels completed. We can then feed this data set (three attributes
for every player) to an unsupervised learning algorithm where we specify the
number of groups to be two. The algorithm would then split the data set of
players into two groups where the players within each group would be similar
to each other. Given the attributes we used to describe each player, in this
case, the output would be a split of all the players into two groups, where
one group would semantically represent the engaged players and the second
group would semantically represent the unengaged players.
The goal of [unsupervised
learning](https://en.wikipedia.org/wiki/Unsupervised_learning) is to group or
cluster similar items in a data set. For example, consider the players of a
game. We may want to group the players depending on how engaged they are with
the game. This would enable us to target different groups (e.g. for
highly-engaged players we might invite them to be beta testers for new features,
while for unengaged players we might email them helpful tutorials). Say that we
wish to split our players into two groups. We would first define basic
attributes of the players, such as the number of hours played, total money spent
on in-app purchases and number of levels completed. We can then feed this data
set (three attributes for every player) to an unsupervised learning algorithm
where we specify the number of groups to be two. The algorithm would then split
the data set of players into two groups where the players within each group
would be similar to each other. Given the attributes we used to describe each
player, in this case, the output would be a split of all the players into two
groups, where one group would semantically represent the engaged players and the
second group would semantically represent the unengaged players.
defined the appropriate attributes and relied on the algorithm to uncover
the two groups on its own. This type of data set is typically called an
unlabeled data set as it is lacking these direct labels. Consequently,
unsupervised learning can be helpful in situations where these labels can be
expensive or hard to produce. In the next paragraph, we overview supervised
learning algorithms which accept input labels in addition to attributes.
defined the appropriate attributes and relied on the algorithm to uncover the
two groups on its own. This type of data set is typically called an unlabeled
data set as it is lacking these direct labels. Consequently, unsupervised
learning can be helpful in situations where these labels can be expensive or
hard to produce. In the next paragraph, we overview supervised learning
algorithms which accept input labels in addition to attributes.
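As a concrete sketch of the clustering workflow described above, here is a minimal example using scikit-learn purely for illustration; scikit-learn is not part of the ML-Agents toolkit, and the player attribute values are invented.

```python
# Illustrative only: cluster players into two groups from unlabeled attributes.
import numpy as np
from sklearn.cluster import KMeans

# One row per player: [hours played, money spent on in-app purchases, levels completed].
players = np.array([
    [120.0, 35.0, 48],
    [  2.5,  0.0,  3],
    [ 95.0, 12.0, 30],
    [  1.0,  0.0,  1],
])

# We only specify the number of groups; no labels are provided.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(players)
print(kmeans.labels_)  # cluster index (0 or 1) assigned to each player
```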
In [supervised learning](https://en.wikipedia.org/wiki/Supervised_learning),
we do not want to just group similar items but directly
learn a mapping from each item to the group (or class) that it belongs to.
Returning to our earlier example of
clustering players, let's say we now wish to predict which of our players are
about to churn (that is stop playing the game for the next 30 days). We
can look into our historical records and create a data set that
contains attributes of our players in addition to a label indicating whether
they have churned or not. Note that the player attributes we use for this
churn prediction task may be different from the ones we used for our earlier
clustering task. We can then feed this data set (attributes **and** label for
each player) into a supervised learning algorithm which would learn a mapping
from the player attributes to a label indicating whether that player
will churn or not. The intuition is that the supervised learning algorithm
will learn which values of these attributes typically correspond to players
who have churned and not churned (for example, it may learn that players
who spend very little and play for very short periods will most likely churn).
Now given this learned model, we can provide it the attributes of a
new player (one that recently started playing the game) and it would output
a _predicted_ label for that player. This prediction is the algorithm's
expectation of whether the player will churn or not.
We can now use these predictions to target the players
who are expected to churn and entice them to continue playing the game.
In [supervised learning](https://en.wikipedia.org/wiki/Supervised_learning), we
do not want to just group similar items but directly learn a mapping from each
item to the group (or class) that it belongs to. Returning to our earlier
example of clustering players, let's say we now wish to predict which of our
players are about to churn (that is stop playing the game for the next 30 days).
We can look into our historical records and create a data set that contains
attributes of our players in addition to a label indicating whether they have
churned or not. Note that the player attributes we use for this churn prediction
task may be different from the ones we used for our earlier clustering task. We
can then feed this data set (attributes **and** label for each player) into a
supervised learning algorithm which would learn a mapping from the player
attributes to a label indicating whether that player will churn or not. The
intuition is that the supervised learning algorithm will learn which values of
these attributes typically correspond to players who have churned and not
churned (for example, it may learn that players who spend very little and play
for very short periods will most likely churn). Now given this learned model, we
can provide it the attributes of a new player (one that recently started playing
the game) and it would output a _predicted_ label for that player. This
prediction is the algorithm's expectation of whether the player will churn or
not. We can now use these predictions to target the players who are expected to
churn and entice them to continue playing the game.
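A minimal sketch of this churn-prediction setup, again using scikit-learn only for illustration, with invented attributes and labels:

```python
# Illustrative only: learn a mapping from player attributes to a churn label.
import numpy as np
from sklearn.linear_model import LogisticRegression

# Historical players: [hours played, money spent, levels completed] plus a label (1 = churned).
X = np.array([
    [120.0, 35.0, 48],
    [  2.5,  0.0,  3],
    [ 95.0, 12.0, 30],
    [  1.0,  0.0,  1],
])
y = np.array([0, 1, 0, 1])

# Training phase: fit the model on attribute/label pairs.
model = LogisticRegression().fit(X, y)

# Inference phase: predict whether a new, previously unseen player will churn.
new_player = np.array([[5.0, 1.0, 4]])
print(model.predict(new_player))        # predicted churn label
print(model.predict_proba(new_player))  # predicted class probabilities
```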
player. Model selection, on the other hand, pertains to selecting the
algorithm (and its parameters) that perform the task well. Both of these
tasks are active areas of machine learning research and, in practice, require
several iterations to achieve good performance.
player. Model selection, on the other hand, pertains to selecting the algorithm
(and its parameters) that perform the task well. Both of these tasks are active
areas of machine learning research and, in practice, require several iterations
to achieve good performance.
We now switch to reinforcement learning, the third class of
machine learning algorithms, and arguably the one most relevant for the ML-Agents toolkit.
We now switch to reinforcement learning, the third class of machine learning
algorithms, and arguably the one most relevant for the ML-Agents toolkit.
can be viewed as a form of learning for sequential
decision making that is commonly associated with controlling robots (but is,
in fact, much more general). Consider an autonomous firefighting robot that is
tasked with navigating into an area, finding the fire and neutralizing it. At
any given moment, the robot perceives the environment through its sensors (e.g.
camera, heat, touch), processes this information and produces an action (e.g.
move to the left, rotate the water hose, turn on the water). In other words,
it is continuously making decisions about how to interact in this environment
given its view of the world (i.e. sensors input) and objective (i.e.
neutralizing the fire). Teaching a robot to be a successful firefighting
machine is precisely what reinforcement learning is designed to do.
can be viewed as a form of learning for sequential decision making that is
commonly associated with controlling robots (but is, in fact, much more
general). Consider an autonomous firefighting robot that is tasked with
navigating into an area, finding the fire and neutralizing it. At any given
moment, the robot perceives the environment through its sensors (e.g. camera,
heat, touch), processes this information and produces an action (e.g. move to
the left, rotate the water hose, turn on the water). In other words, it is
continuously making decisions about how to interact in this environment given
its view of the world (i.e. sensors input) and objective (i.e. neutralizing the
fire). Teaching a robot to be a successful firefighting machine is precisely
what reinforcement learning is designed to do.
More specifically, the goal of reinforcement learning is to learn a **policy**,
which is essentially a mapping from **observations** to **actions**. An
observation is what the robot can measure from its **environment** (in this
to the configuration of the robot (e.g. position of its base, position of
its water hose and whether the hose is on or off).
to the configuration of the robot (e.g. position of its base, position of its
water hose and whether the hose is on or off).
The last remaining piece
of the reinforcement learning task is the **reward signal**. When training a
robot to be a mean firefighting machine, we provide it with rewards (positive
and negative) indicating how well it is doing on completing the task.
Note that the robot does not _know_ how to put out fires before it is trained.
It learns the objective because it receives a large positive reward when it puts
out the fire and a small negative reward for every passing second. The fact that
rewards are sparse (i.e. may not be provided at every step, but only when a
robot arrives at a success or failure situation), is a defining characteristic of
reinforcement learning and precisely why learning good policies can be difficult
(and/or time-consuming) for complex environments.
The last remaining piece of the reinforcement learning task is the **reward
signal**. When training a robot to be a mean firefighting machine, we provide it
with rewards (positive and negative) indicating how well it is doing on
completing the task. Note that the robot does not _know_ how to put out fires
before it is trained. It learns the objective because it receives a large
positive reward when it puts out the fire and a small negative reward for every
passing second. The fact that rewards are sparse (i.e. may not be provided at
every step, but only when a robot arrives at a success or failure situation), is
a defining characteristic of reinforcement learning and precisely why learning
good policies can be difficult (and/or time-consuming) for complex environments.
<p align="center">
<img src="images/rl_cycle.png" alt="The reinforcement learning cycle."/>
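To make the policy and reward-signal vocabulary concrete, here is a toy tabular Q-learning loop on a five-cell corridor. It is purely illustrative; the ML-Agents toolkit itself trains policies with deep reinforcement learning (e.g. PPO) rather than a lookup table.

```python
# Illustrative only: sparse reward at the right end of a 5-cell corridor, small step penalty.
import numpy as np

n_states, n_actions = 5, 2            # actions: 0 = move left, 1 = move right
q = np.zeros((n_states, n_actions))   # the learned policy is read off this table
alpha, gamma, epsilon = 0.1, 0.95, 0.1
rng = np.random.default_rng(0)

for episode in range(500):
    state = 0
    while state != n_states - 1:
        # Epsilon-greedy: mostly follow the current policy, occasionally explore.
        action = rng.integers(n_actions) if rng.random() < epsilon else int(np.argmax(q[state]))
        next_state = max(0, state - 1) if action == 0 else state + 1
        reward = 1.0 if next_state == n_states - 1 else -0.01
        # Move the estimate toward the reward plus the discounted best future value.
        q[state, action] += alpha * (reward + gamma * q[next_state].max() - q[state, action])
        state = next_state

print(np.argmax(q, axis=1))  # best action per state; states 0-3 should prefer "right" (1)
```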

usually requires many trials and iterative
policy updates. More specifically, the robot is placed in several
fire situations and over time learns an optimal policy which allows it
to put out fires more effectively. Obviously, we cannot expect to train a
robot repeatedly in the real world, particularly when fires are involved. This
is precisely why the use of
usually requires many trials and iterative policy updates. More specifically,
the robot is placed in several fire situations and over time learns an optimal
policy which allows it to put out fires more effectively. Obviously, we cannot
expect to train a robot repeatedly in the real world, particularly when fires
are involved. This is precisely why the use of
serves as the perfect training grounds for learning such behaviors.
While our discussion of reinforcement learning has centered around robots,
there are strong parallels between robots and characters in a game. In fact,
in many ways, one can view a non-playable character (NPC) as a virtual
robot, with its own observations about the environment, its own set of actions
and a specific objective. Thus it is natural to explore how we can
train behaviors within Unity using reinforcement learning. This is precisely
what the ML-Agents toolkit offers. The video linked below includes a reinforcement
learning demo showcasing training character behaviors using the ML-Agents toolkit.
serves as the perfect training grounds for learning such behaviors. While our
discussion of reinforcement learning has centered around robots, there are
strong parallels between robots and characters in a game. In fact, in many ways,
one can view a non-playable character (NPC) as a virtual robot, with its own
observations about the environment, its own set of actions and a specific
objective. Thus it is natural to explore how we can train behaviors within Unity
using reinforcement learning. This is precisely what the ML-Agents toolkit
offers. The video linked below includes a reinforcement learning demo showcasing
training character behaviors using the ML-Agents toolkit.
<a href="http://www.youtube.com/watch?feature=player_embedded&v=fiQsmdwEGT8" target="_blank">
<img src="http://img.youtube.com/vi/fiQsmdwEGT8/0.jpg" alt="RL Demo" width="400" border="10" />
</a>
also involves two tasks: attribute selection and model selection.
Attribute selection is defining the set of observations for the robot
that best help it complete its objective, while model selection is defining
the form of the policy (mapping from observations to actions) and its
parameters. In practice, training behaviors is an iterative process that may
require changing the attribute and model choices.
also involves two tasks: attribute selection and model selection. Attribute
selection is defining the set of observations for the robot that best help it
complete its objective, while model selection is defining the form of the policy
(mapping from observations to actions) and its parameters. In practice, training
behaviors is an iterative process that may require changing the attribute and
model choices.
One common aspect of all three branches of machine learning is that they
all involve a **training phase** and an **inference phase**. While the
details of the training and inference phases are different for each of the
three, at a high-level, the training phase involves building a model
using the provided data, while the inference phase involves applying this
model to new, previously unseen, data. More specifically:
* For our unsupervised learning
example, the training phase learns the optimal two clusters based
on the data describing existing players, while the inference phase assigns a
new player to one of these two clusters.
* For our supervised learning example, the
training phase learns the mapping from player attributes to player label
(whether they churned or not), and the inference phase predicts whether
a new player will churn or not based on that learned mapping.
* For our reinforcement learning example, the training phase learns the
optimal policy through guided trials, and in the inference phase, the agent
observes and takes actions in the wild using its learned policy.
One common aspect of all three branches of machine learning is that they all
involve a **training phase** and an **inference phase**. While the details of
the training and inference phases are different for each of the three, at a
high-level, the training phase involves building a model using the provided
data, while the inference phase involves applying this model to new, previously
unseen, data. More specifically:
To briefly summarize: all three classes of algorithms involve training
and inference phases in addition to attribute and model selections. What
ultimately separates them is the type of data available to learn from. In
unsupervised learning our data set was a collection of attributes, in
supervised learning our data set was a collection of attribute-label pairs,
and, lastly, in reinforcement learning our data set was a collection of
* For our unsupervised learning example, the training phase learns the optimal
two clusters based on the data describing existing players, while the
inference phase assigns a new player to one of these two clusters.
* For our supervised learning example, the training phase learns the mapping
from player attributes to player label (whether they churned or not), and the
inference phase predicts whether a new player will churn or not based on that
learned mapping.
* For our reinforcement learning example, the training phase learns the optimal
policy through guided trials, and in the inference phase, the agent observes
and takes actions in the wild using its learned policy.
To briefly summarize: all three classes of algorithms involve training and
inference phases in addition to attribute and model selections. What ultimately
separates them is the type of data available to learn from. In unsupervised
learning our data set was a collection of attributes, in supervised learning our
data set was a collection of attribute-label pairs, and, lastly, in
reinforcement learning our data set was a collection of
[Deep learning](https://en.wikipedia.org/wiki/Deep_learning) is a family of
algorithms that can be used to address any of the problems introduced
above. More specifically, they can be used to solve both attribute and
model selection tasks. Deep learning has gained popularity in recent
years due to its outstanding performance on several challenging machine learning
tasks. One example is [AlphaGo](https://en.wikipedia.org/wiki/AlphaGo),
a [computer Go](https://en.wikipedia.org/wiki/Computer_Go) program that
leverages deep learning and was able to beat Lee Sedol (a Go world champion).
[Deep learning](https://en.wikipedia.org/wiki/Deep_learning) is a family of
algorithms that can be used to address any of the problems introduced above.
More specifically, they can be used to solve both attribute and model selection
tasks. Deep learning has gained popularity in recent years due to its
outstanding performance on several challenging machine learning tasks. One
example is [AlphaGo](https://en.wikipedia.org/wiki/AlphaGo), a [computer
Go](https://en.wikipedia.org/wiki/Computer_Go) program that leverages deep
learning and was able to beat Lee Sedol (a Go world champion).
complex functions from large amounts of training data. This makes them a
natural choice for reinforcement learning tasks when a large amount of data
can be generated, say through the use of a simulator or engine such as Unity.
By generating hundreds of thousands of simulations of
the environment within Unity, we can learn policies for very complex environments
(a complex environment is one where the number of observations an agent perceives
and the number of actions they can take are large).
Many of the algorithms we provide in ML-Agents use some form of deep learning,
built on top of the open-source library, [TensorFlow](Background-TensorFlow.md).
complex functions from large amounts of training data. This makes them a natural
choice for reinforcement learning tasks when a large amount of data can be
generated, say through the use of a simulator or engine such as Unity. By
generating hundreds of thousands of simulations of the environment within Unity,
we can learn policies for very complex environments (a complex environment is
one where the number of observations an agent perceives and the number of
actions they can take are large). Many of the algorithms we provide in ML-Agents
use some form of deep learning, built on top of the open-source library,
[TensorFlow](Background-TensorFlow.md).
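As a minimal illustration of the kind of function approximator deep learning methods fit, here is a small fully connected network in TensorFlow's Keras API. This is not the network architecture ML-Agents uses; layer sizes and data are arbitrary.

```python
# Illustrative only: a small fully connected network mapping observations to action values.
import numpy as np
import tensorflow as tf

obs_size, n_actions = 8, 2
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(obs_size,)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(n_actions),
])
model.compile(optimizer="adam", loss="mse")

# Train on random data purely to show the fit/predict pattern.
x = np.random.rand(256, obs_size).astype("float32")
y = np.random.rand(256, n_actions).astype("float32")
model.fit(x, y, epochs=2, verbose=0)
print(model.predict(x[:1]))
```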

74
docs/Background-TensorFlow.md


# Background: TensorFlow
As discussed in our
[machine learning background page](Background-Machine-Learning.md), many of the
algorithms we provide in the ML-Agents toolkit leverage some form of deep learning.
More specifically, our implementations are built on top of the open-source
library [TensorFlow](https://www.tensorflow.org/). This means that the models
produced by the ML-Agents toolkit are (currently) in a format only understood by
As discussed in our
[machine learning background page](Background-Machine-Learning.md),
many of the algorithms we provide in the
ML-Agents toolkit leverage some form of deep learning. More specifically, our
implementations are built on top of the open-source library
[TensorFlow](https://www.tensorflow.org/). This means that the models produced
by the ML-Agents toolkit are (currently) in a format only understood by
TensorFlow. In this page we provide a brief overview of TensorFlow, in addition
to TensorFlow-related tools that we leverage within the ML-Agents toolkit.

performing computations using data flow graphs, the underlying representation
of deep learning models. It facilitates training and inference on CPUs and
GPUs in a desktop, server, or mobile device. Within the ML-Agents toolkit, when you
train the behavior of an Agent, the output is a TensorFlow model (.bytes)
file that you can then embed within an Internal Brain. Unless you implement
a new algorithm, the use of TensorFlow is mostly abstracted away and behind
the scenes.
performing computations using data flow graphs, the underlying representation of
deep learning models. It facilitates training and inference on CPUs and GPUs in
a desktop, server, or mobile device. Within the ML-Agents toolkit, when you
train the behavior of an agent, the output is a TensorFlow model (.bytes) file
that you can then embed within an Internal Brain. Unless you implement a new
algorithm, the use of TensorFlow is mostly abstracted away and behind the
scenes.
One component of training models with TensorFlow is setting the values of
certain model attributes (called _hyperparameters_). Finding the right values of
these hyperparameters can require a few iterations. Consequently, we leverage a
visualization tool within TensorFlow called
[TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard).
It allows the visualization of certain agent attributes (e.g. reward) throughout
training which can be helpful in both building intuitions for the different
hyperparameters and setting the optimal values for your Unity environment. We
provide more details on setting the hyperparameters in later parts of the
documentation, but, in the meantime, if you are unfamiliar with TensorBoard we
recommend this
One of the drawbacks of TensorFlow is that it does not provide a native C# API.
This means that the Internal Brain is not natively supported since Unity scripts
are written in C#. Consequently, to enable the Internal Brain, we leverage a
third-party library
[TensorFlowSharp](https://github.com/migueldeicaza/TensorFlowSharp) which
provides .NET bindings to TensorFlow. Thus, when a Unity environment that
contains an Internal Brain is built, inference is performed via TensorFlowSharp.
We provide an additional in-depth overview of how to leverage
[TensorFlowSharp within Unity](Using-TensorFlow-Sharp-in-Unity.md)
which will become more
relevant once you install and start training behaviors within the ML-Agents
toolkit. Given the reliance on TensorFlowSharp, the Internal Brain is currently
marked as experimental.

12
docs/Background-Unity.md


# Background: Unity
If you are not familiar with the [Unity Engine](https://unity3d.com/unity), we
highly recommend the [Unity Manual](https://docs.unity3d.com/Manual/index.html)
and [Tutorials page](https://unity3d.com/learn/tutorials). The
with the ML-Agents toolkit:
* [Editor](https://docs.unity3d.com/Manual/UsingTheEditor.html)
* [Interface](https://docs.unity3d.com/Manual/LearningtheInterface.html)
* [Scene](https://docs.unity3d.com/Manual/CreatingScenes.html)

* [Scripting](https://docs.unity3d.com/Manual/ScriptingSection.html)
* [Physics](https://docs.unity3d.com/Manual/PhysicsSection.html)
* [Ordering of event functions](https://docs.unity3d.com/Manual/ExecutionOrder.html)
(e.g. FixedUpdate, Update)

242
docs/Basic-Guide.md


# Basic Guide
This guide will show you how to use a pre-trained model in an example Unity
environment, and show you how to train the model yourself.
If you are not familiar with the [Unity Engine](https://unity3d.com/unity), we
highly recommend the [Roll-a-ball
tutorial](https://unity3d.com/learn/tutorials/s/roll-ball-tutorial) to learn all
the basic concepts of Unity.
In order to use the ML-Agents toolkit within Unity, you need to change some
Unity settings first. You also need the [TensorFlowSharp
plugin](https://s3.amazonaws.com/unity-ml-agents/0.5/TFSharpPlugin.unitypackage),
which is based on the
[TensorFlowSharp repo](https://github.com/migueldeicaza/TensorFlowSharp),
to use pre-trained models within Unity.
3. Using the file dialog that opens, locate the `UnitySDK` folder
within the ML-Agents toolkit project and click **Open**.
5. For **each** of the platforms you target (**PC, Mac and Linux Standalone**,
**iOS** or **Android**):
2. Set **Scripting Runtime Version** to **Experimental (.NET 4.6
Equivalent or .NET 4.x Equivalent)**
3. In **Scripting Defined Symbols**, add the flag `ENABLE_TENSORFLOW`. After
typing in the flag name, press Enter.
[Download](https://s3.amazonaws.com/unity-ml-agents/0.5/TFSharpPlugin.unitypackage)
the TensorFlowSharp plugin. Then import it into Unity by double clicking the
downloaded file. You can check if it was successfully imported by checking the
TensorFlow files in the Project window under **Assets** > **ML-Agents** >
**Plugins** > **Computer**.
**Note**: If you don't see anything under **Assets**, drag the
`UnitySDK/Assets/ML-Agents` folder under **Assets** within the Project window.
1. In the **Project** window, go to `Assets/ML-Agents/Examples/3DBall` folder
and open the `3DBall` scene file.
2. In the **Hierarchy** window, select the **Ball3DBrain** child under the
**Ball3DAcademy** GameObject to view its properties in the Inspector window.
3. On the **Ball3DBrain** object's **Brain** component, change the **Brain
Type** to **Internal**.
4. In the **Project** window, locate the
`Assets/ML-Agents/Examples/3DBall/TFModels` folder.
5. Drag the `3DBall` model file from the `TFModels` folder to the **Graph
Model** field of the **Ball3DBrain** object's **Brain** component.
6. Click the **Play** button and you will see the platforms balance the balls
using the pretrained model.
The `notebooks/getting-started.ipynb` [Jupyter notebook](Background-Jupyter.md)
contains a simple walkthrough of the functionality of the Python API. It can
also serve as a simple test that your environment is configured correctly.
Within the notebook, be sure to set `env_name` to the name of the Unity
executable if you want to [use an executable](Learning-Environment-Executable.md)
or to `None` if you want to interact with the current scene in the Unity Editor.
More information and documentation is provided in the
Since we are going to build this environment to conduct training, we need to set
the Brain used by the Agents to **External**. This allows the Agents to
1. In the **Scene** window, click the triangle icon next to the Ball3DAcademy
object.
2. Select its child object **Ball3DBrain**.
3. In the Inspector window, set **Brain Type** to **External**.

1. Open a command or terminal window.
2. Navigate to the folder where you cloned the ML-Agents toolkit repository.
**Note**: If you followed the default [installation](Installation.md), then
you should be able to run `mlagents-learn` from any directory.
3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier> --train`
where:
- `<trainer-config-path>` is the relative or absolute filepath of the
trainer configuration. The defaults used by example environments included
in `MLAgentsSDK` can be found in `config/trainer_config.yaml`.
- `<run-identifier>` is a string used to separate the results of different
training runs
- `--train` tells `mlagents-learn` to run a training session (rather
than inference)
4. If you cloned the ML-Agents repo, then you can simply run
![Training command example](images/training-command-example.png)
```sh
mlagents-learn config/trainer_config.yaml --run-id=firstRun --train
```
**Note**: If you're using Anaconda, don't forget to activate the ml-agents environment first.
5. When the message _"Start training by pressing the Play button in the Unity
Editor"_ is displayed on the screen, you can press the :arrow_forward: button
in Unity to start training in the Editor.
**Note**: Alternatively, you can use an executable rather than the Editor to
perform training. Please refer to [this
page](Learning-Environment-Executable.md) for instructions on how to build and
use an executable.
If `mlagents-learn` runs correctly and starts training, you should see something
like this:

```console
ml-agents$ mlagents-learn config/trainer_config.yaml --run-id=first-run --train
▄▄▄▓▓▓▓
╓▓▓▓▓▓▓█▓▓▓▓▓
,▄▄▄m▀▀▀' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌
▄▓▓▓▀' ▄▓▓▀ ▓▓▓ ▄▄ ▄▄ ,▄▄ ▄▄▄▄ ,▄▄ ▄▓▓▌▄ ▄▄▄ ,▄▄
▄▓▓▓▀ ▄▓▓▀ ▐▓▓▌ ▓▓▌ ▐▓▓ ▐▓▓▓▀▀▀▓▓▌ ▓▓▓ ▀▓▓▌▀ ^▓▓▌ ╒▓▓▌
▄▓▓▓▓▓▄▄▄▄▄▄▄▄▓▓▓ ▓▀ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▄ ▓▓▌
▀▓▓▓▓▀▀▀▀▀▀▀▀▀▀▓▓▄ ▓▓ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▐▓▓
^█▓▓▓ ▀▓▓▄ ▐▓▓▌ ▓▓▓▓▄▓▓▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▓▄ ▓▓▓▓`
'▀▓▓▓▄ ^▓▓▓ ▓▓▓ └▀▀▀▀ ▀▀ ^▀▀ `▀▀ `▀▀ '▀▀ ▐▓▓▌
▀▀▀▀▓▄▄▄ ▓▓▓▓▓▓, ▓▓▓▓▀
`▀█▓▓▓▓▓▓▓▓▓▌
¬`▀▀▀█▓
INFO:mlagents.learn:{'--curriculum': 'None',
'--docker-target-name': 'Empty',
'--env': 'None',
'--help': False,
'--keep-checkpoints': '5',
'--lesson': '0',
'--load': False,
'--no-graphics': False,
'--num-runs': '1',
'--run-id': 'first-run',
'--save-freq': '50000',
'--seed': '-1',
'--slow': False,
'--train': True,
'--worker-id': '0',
'<trainer-config-path>': 'config/trainer_config.yaml'}
INFO:mlagents.envs:Start training by pressing the Play button in the Unity Editor.
```
![Training running](images/training-running.png)

Once you press the Play button in the Unity Editor, training begins and the
console output continues with the Academy and Brain information, the trainer
hyperparameters, and periodic training summaries:
```console
INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
Number of Brains: 1
Number of External Brains : 1
Reset Parameters :
Unity brain name: Ball3DBrain
Number of Visual Observations (per agent): 0
Vector Observation space size (per agent): 8
Number of stacked Vector Observation: 1
Vector Action space type: continuous
Vector Action space size (per agent): [2]
Vector Action descriptions: ,
INFO:mlagents.envs:Hyperparameters for the PPO Trainer of brain Ball3DBrain:
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
gamma: 0.995
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
max_steps: 5.0e4
normalize: True
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 1000
use_recurrent: False
graph_scope:
summary_path: ./summaries/first-run-0
memory_size: 256
use_curiosity: False
curiosity_strength: 0.01
curiosity_enc_size: 128
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 4000. Mean Reward: 2.151. Std of Reward: 1.432. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 5000. Mean Reward: 3.175. Std of Reward: 2.250. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 6000. Mean Reward: 4.898. Std of Reward: 4.019. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 7000. Mean Reward: 6.716. Std of Reward: 5.125. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 8000. Mean Reward: 12.124. Std of Reward: 11.929. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 9000. Mean Reward: 18.151. Std of Reward: 16.871. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
```
You can press Ctrl+C to stop the training, and your trained model will be at
`models/<run-identifier>/editor_<academy_name>_<run-identifier>.bytes` where
`<academy_name>` is the name of the Academy GameObject in the current scene.
This file corresponds to your model's latest checkpoint. You can now embed this
trained model into your Internal Brain by following the steps below, which is
similar to the steps described
[above](#play-an-example-environment-using-pretrained-model).
1. Move your model file into
`UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/`.
5. Drag the `<env_name>_<run-identifier>.bytes` file from the Project window of
the Editor to the **Graph Model** placeholder in the **Ball3DBrain**
inspector window.
- For more information on the ML-Agents toolkit, in addition to helpful
background, check out the [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
page.
- For a more detailed walk-through of our 3D Balance Ball environment, check out
the [Getting Started](Getting-Started-with-Balance-Ball.md) page.
- For a "Hello World" introduction to creating your own Learning Environment,
check out the [Making a New Learning
Environment](Learning-Environment-Create-New.md) page.
- For a series of YouTube video tutorials, check out the
[Machine Learning Agents PlayList](https://www.youtube.com/playlist?list=PLX2vGYjWbI0R08eWQkO7nQkGiicHAX7IX)
page.

136
docs/FAQ.md


# Frequently Asked Questions
## Scripting Runtime Environment not set up correctly
If you haven't switched your scripting runtime version from .NET 3.5 to .NET 4.6
or .NET 4.x, you will see an error message like this:
This is because .NET 3.5 doesn't support the Clear() method for StringBuilder;
refer to [Setting Up The ML-Agents Toolkit Within
Unity](Installation.md#setting-up-ml-agent-within-unity) for a solution.
## TensorFlowSharp flag not turned on
If you have already imported the TensorFlowSharp plugin but haven't set the
ENABLE_TENSORFLOW flag in your Scripting Define Symbols, you will see the
following error message:
```console
You need to install and enable the TensorFlowSharp plugin in order to use the Internal Brain.
```
This error message occurs because the TensorFlowSharp plugin won't be usage without the ENABLE_TENSORFLOW flag, refer to [Setting Up The ML-Agents Toolkit Within Unity](Installation.md#setting-up-ml-agent-within-unity) for solution.
This error message occurs because the TensorFlowSharp plugin won't be usage
without the ENABLE_TENSORFLOW flag, refer to [Setting Up The ML-Agents Toolkit
Within Unity](Installation.md#setting-up-ml-agent-within-unity) for solution.
## Instance of CoreBrainInternal couldn't be created
If you try to use ML-Agents in Unity versions 2017.1 - 2017.3, you might
encounter an error that looks like this:
```console
Instance of CoreBrainInternal couldn't be created. The the script
class needs to derive from ScriptableObject.
UnityEngine.ScriptableObject:CreateInstance(String)
```
You can fix the error by removing `CoreBrain` from CoreBrainInternal.cs:16,
clicking on your Brain GameObject to let the scene recompile all the changed
C# scripts, then adding the `CoreBrain` back. Make sure your Brain is in
Internal mode, your TensorFlowSharp plugin is imported and the
ENABLE_TENSORFLOW flag is set. This fix is only a local workaround and is
unstable.
## TensorFlow epsilon placeholder error
If you have a graph placeholder set in the Internal Brain inspector that is not
present in the TensorFlow graph, you will see some error like this:
```console
UnityAgentsException: One of the TensorFlow placeholder could not be found. In brain <some_brain_name>, there are no FloatingPoint placeholder named <some_placeholder_name>.
```
Solution: Go to each of your Brain objects, find `Graph placeholders`, and
change its `size` to 0 to remove the `epsilon` placeholder.
Similarly, if you have a graph scope set in the Internal Brain inspector that is
not correctly set, you will see some error like this:
Solution: Make sure your Graph Scope field matches the corresponding Brain
object name in your Hierarchy Inspector when there are multiple Brains.
## Environment Permission Error
If you directly import your Unity environment without building it in the
editor, you might need to give it additional permissions to execute it.
```sh
chmod -R 755 *.app
```
```sh
chmod -R 755 *.x86_64
```
On Windows, you can find
## Environment Connection Timeout
If you are able to launch the environment from `UnityEnvironment` but then
receive a timeout error, there may be a number of possible causes.
* _Cause_: There may be no Brains in your environment which are set to
`External`. In this case, the environment will not attempt to communicate
with Python. _Solution_: Set the Brain(s) you wish to externally control
through the Python API to `External` from the Unity Editor, and rebuild the
environment.
* _Cause_: On OSX, the firewall may be preventing communication with the
environment. _Solution_: Add the built environment binary to the list of
exceptions on the firewall by following
[instructions](https://support.apple.com/en-us/HT201642).
* _Cause_: An error happened in the Unity Environment preventing communication.
_Solution_: Look into the [log
files](https://docs.unity3d.com/Manual/LogFiles.html) generated by the Unity
Environment to figure out what error happened.
If you receive an exception `"Couldn't launch new environment because
communication port {} is still in use. "`, you can change the worker
number in the Python script when calling
## Communication port {} still in use
If you receive an exception `"Couldn't launch new environment because
communication port {} is still in use. "`, you can change the worker number in
the Python script when calling
```python
UnityEnvironment(file_name=filename, worker_id=X)
```
## Mean reward : nan
If you receive a message `Mean reward : nan` when attempting to train a model
using PPO, this is due to the episodes of the Learning Environment not
terminating. In order to address this, set `Max Steps` for either the Academy or
Agents within the Scene Inspector to a value greater than 0. Alternatively, it
is possible to manually set `done` conditions for episodes from within scripts
for custom episode-terminating events.
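
As a rough illustration of the second option, the sketch below ends an episode
from within an Agent script when a custom condition is met; the class name and
the `reachedGoal` flag are hypothetical and stand in for whatever terminating
event makes sense in your environment.

```csharp
using MLAgents;

public class CustomEpisodeAgent : Agent
{
    // Hypothetical flag set elsewhere in your game logic when the episode
    // should end (for example, when the agent reaches a goal).
    public bool reachedGoal;

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // ... apply actions and assign rewards here ...

        if (reachedGoal)
        {
            // Marks the episode as finished so the mean reward can be
            // computed over complete episodes.
            Done();
        }
    }
}
```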

57
docs/Feature-Memory.md


# Memory-enhanced agents using Recurrent Neural Networks
## What are memories used for?
Have you ever entered a room to get something and immediately forgot what you
were looking for? Don't let that happen to your agents.
It is now possible to give memories to your agents. When training, the agents
will be able to store a vector of floats to be used next time they need to make
a decision.
Deciding what the agents should remember in order to solve a task is not easy to
do by hand, but our training algorithms can learn to keep track of what is
important to remember with
[LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory).
When configuring the trainer parameters in the `config/trainer_config.yaml`
file, add the following parameters to the Brain you want to use.
```yaml
# Example values; see the parameter descriptions below.
use_recurrent: true
sequence_length: 64
memory_size: 256
```
* `use_recurrent` is a flag that notifies the trainer that you want to use a
Recurrent Neural Network.
* `sequence_length` defines how long the sequences of experiences must be while
training. In order to use a LSTM, training requires a sequence of experiences
instead of single experiences.
* `memory_size` corresponds to the size of the memory the agent must keep. Note
that if this number is too small, the agent will not be able to remember a lot
of things. If this number is too large, the neural network will take longer to
train.
* LSTM does not work well with continuous vector action space. Please use
discrete vector action space for better results.
* Since the memories must be sent back and forth between Python and Unity, using
too large `memory_size` will slow down training.
* Adding a recurrent layer increases the complexity of the neural network, it is
recommended to decrease `num_layers` when using recurrent.
* It is required that `memory_size` be divisible by 4.

50
docs/Feature-Monitor.md


![Monitor](images/monitor.png)
The monitor allows visualizing information related to the agents or training
process within a Unity scene.
You can track many different things both related and unrelated to the agents
themselves. By default, the Monitor is only active in the *inference* phase, so
not during training. To change this behavior, you can activate or deactivate it
by calling `SetActive(boolean)`. For example to also show the monitor during
training, you can call it in the `InitializeAcademy()` method of your `Academy`:
```csharp
using MLAgents;

public class YourAcademy : Academy {
    public override void InitializeAcademy()
    {
        Monitor.SetActive(true);
    }
}
```
To add values to the Monitor, call the `Log` function anywhere in your code;
its parameters are described below, followed by a short example:
* `key` is the name of the information you want to display.
* `value` is the information you want to display. *`value`* can have different
  types:
  * `string` - The Monitor will display the string next to the key. It can be
    useful for displaying error messages.
  * `float` - The Monitor will display a slider. Note that the values must be
    between -1 and 1. If the value is positive, the slider will be green; if
    the value is negative, the slider will be red.
  * `float[]` - The Monitor Log call can take an additional argument called
    `displayType` that can be either `INDEPENDENT` (default) or `PROPORTIONAL`:
    * `INDEPENDENT` is used to display multiple independent floats as a
      histogram. The histogram will be a sequence of vertical sliders.
    * `PROPORTION` is used to see the proportions between numbers. For each
      float in values, a rectangle whose width is that value divided by the
      sum of all values will be shown. It is best for visualizing values that
      sum to 1.
* `target` is the transform to which you want to attach information. If the
  transform is `null` the information will be attached to the global monitor.
* **NB:** When adding a target transform that is not the global monitor, make
  sure you have your main camera object tagged as `MainCamera` via the
  inspector. This is needed to properly display the text onto the screen.
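
As a brief sketch of how these calls can look in practice (the `MonitoredAgent`
class and the normalization constant are illustrative, and the `Log` overloads
are assumed to take the key, the value, and an optional target `Transform` as
described above):

```csharp
using MLAgents;
using UnityEngine;

public class MonitoredAgent : Agent
{
    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Slider values must lie between -1 and 1, so normalize before logging.
        float normalizedReward = Mathf.Clamp(GetCumulativeReward() / 100f, -1f, 1f);

        // A float value is rendered as a slider on the global monitor.
        Monitor.Log("Reward", normalizedReward);

        // A string value is displayed next to its key, here attached to this
        // agent's transform instead of the global monitor.
        Monitor.Log("Status", "balancing", transform);
    }
}
```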

393
docs/Getting-Started-with-Balance-Ball.md


# Getting Started with the 3D Balance Ball Environment
This tutorial walks through the end-to-end process of opening a ML-Agents
toolkit example environment in Unity, building the Unity executable, training an
Agent in it, and finally embedding the trained model into the Unity environment.
The ML-Agents toolkit includes a number of [example
environments](Learning-Environment-Examples.md) which you can examine to help
understand the different ways in which the ML-Agents toolkit can be used. These
environments can also serve as templates for new environments or as ways to test
new ML algorithms. After reading this tutorial, you should be able to explore
and build the example environments.
This walk-through uses the **3D Balance Ball** environment. 3D Balance Ball
contains a number of platforms and balls (which are all copies of each other).
Each platform tries to keep its ball from falling by rotating either
horizontally or vertically. In this environment, a platform is an **Agent** that
receives a reward for every step that it balances the ball. An agent is also
penalized with a negative reward for dropping the ball. The goal of the training
process is to have the platforms learn to never drop the ball.
In order to install and set up the ML-Agents toolkit, the Python dependencies
and Unity, see the [installation instructions](Installation.md).
An agent is an autonomous actor that observes and interacts with an
_environment_. In the context of Unity, an environment is a scene containing an
Academy and one or more Brain and Agent objects, and, of course, the other
entities that an agent interacts with.
**Note:** In Unity, the base object of everything in a scene is the
_GameObject_. The GameObject is essentially a container for everything else,
including behaviors, graphics, physics, etc. To see the components that make up
a GameObject, select the GameObject in the Scene window, and open the Inspector
window. The Inspector shows every component on a GameObject.
The first thing you may notice after opening the 3D Balance Ball scene is that
it contains not one, but several platforms. Each platform in the scene is an
independent agent, but they all share the same Brain. 3D Balance Ball does this
The Academy object for the scene is placed on the Ball3DAcademy GameObject. When
you look at an Academy component in the inspector, you can see several
properties that control how the environment works. For example, the **Training**
and **Inference Configuration** properties set the graphics and timescale
properties for the Unity application. The Academy uses the **Training
Configuration** during training and the **Inference Configuration** when not
training. (*Inference* means that the Agent is using a trained model or
heuristics or direct control — in other words, whenever **not** training.)
Typically, you set low graphics quality and a high time scale for the **Training
configuration** and a high graphics quality and the timescale to `1.0` for the
**Inference Configuration**.
**Note:** if you want to observe the environment during training, you can adjust
the **Inference Configuration** settings to use a larger window and a timescale
closer to 1:1. Be sure to set these parameters back when training in earnest;
otherwise, training can take a very long time.
Another aspect of an environment to look at is the Academy implementation. Since
the base Academy class is abstract, you must always define a subclass. There are
three functions you can implement, though they are all optional:
* Academy.AcademyStep() — Called at every simulation step before
Agent.AgentAction() (and after the Agents collect their observations).
* Academy.AcademyReset() — Called when the Academy starts or restarts the
simulation (including the first time).
The 3D Balance Ball environment does not use these functions — each Agent resets
itself when needed — but many environments do use these functions to control the
environment around the Agents.
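
For environments that do rely on these functions, a minimal Academy subclass
might look like the sketch below; the class name and body comments are
illustrative and not taken from the 3D Balance Ball example.

```csharp
using MLAgents;

public class MyAcademy : Academy
{
    // Called once when the Academy first starts up.
    public override void InitializeAcademy()
    {
    }

    // Called when the Academy starts or restarts the simulation,
    // including the first time.
    public override void AcademyReset()
    {
        // Reset global, non-agent state of the environment here.
    }

    // Called at every simulation step, before the agents act.
    public override void AcademyStep()
    {
    }
}
```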
The Ball3DBrain GameObject in the scene, which contains a Brain component, is a
child of the Academy object. (All Brain objects in a scene must be children of
the Academy.) All the Agents in the 3D Balance Ball environment use the same
Brain instance. A Brain doesn't store any information about an Agent, it just
routes the Agent's collected observations to the decision making process and
returns the chosen action to the Agent. Thus, all Agents can share the same
Brain, but act independently. The Brain settings tell you quite a bit about how
an Agent works.
The **Brain Type** determines how an Agent makes its decisions. The **External**
and **Internal** types work together — use **External** when training your
Agents; use **Internal** when using the trained model. The **Heuristic** Brain
allows you to hand-code the Agent's logic by extending the Decision class.
Finally, the **Player** Brain lets you map keyboard commands to actions, which
can be useful when testing your agents and environment. If none of these types
of Brains do what you need, you can implement your own CoreBrain to create your
own type.
In this tutorial, you will set the **Brain Type** to **External** for training;
#### Vector Observation Space
Before making a decision, an agent collects its observation about its state in
the world. The vector observation is a vector of floating point numbers which
contain relevant information for the agent to make decisions.
The Brain instance used in the 3D Balance Ball example uses the **Continuous**
vector observation space with a **State Size** of 8. This means that the feature
vector containing the Agent's observations contains eight elements: the `x` and
`z` components of the platform's rotation and the `x`, `y`, and `z` components
of the ball's relative position and velocity. (The observation values are
defined in the Agent's `CollectObservations()` function.)
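
As a rough sketch of how those eight calls can look (the `ball` field and the
class name are illustrative; the actual Ball3DAgent source may differ in
detail):

```csharp
using MLAgents;
using UnityEngine;

public class Ball3DAgentSketch : Agent
{
    public GameObject ball;  // assigned in the Unity Inspector

    public override void CollectObservations()
    {
        // Two components of the platform's rotation...
        AddVectorObs(gameObject.transform.rotation.z);
        AddVectorObs(gameObject.transform.rotation.x);

        // ...three components of the ball's position relative to the platform...
        Vector3 relativePosition = ball.transform.position - gameObject.transform.position;
        AddVectorObs(relativePosition.x);
        AddVectorObs(relativePosition.y);
        AddVectorObs(relativePosition.z);

        // ...and three components of the ball's velocity: eight values in total.
        Vector3 ballVelocity = ball.GetComponent<Rigidbody>().velocity;
        AddVectorObs(ballVelocity.x);
        AddVectorObs(ballVelocity.y);
        AddVectorObs(ballVelocity.z);
    }
}
```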
#### Vector Action Space
An Agent is given instructions from the Brain in the form of *actions*.
ML-Agents toolkit classifies actions into two types: the **Continuous** vector
action space is a vector of numbers that can vary continuously. What each
element of the vector means is defined by the Agent logic (the PPO training
process just learns what values are better given particular state observations
based on the rewards received when it tries different values). For example, an
element might represent a force or torque applied to a `Rigidbody` in the Agent.
The **Discrete** action vector space defines its actions as tables. An action
given to the Agent is an array of indices into those tables.
space. You can try training with both settings to observe whether there is a
difference. (Set the `Vector Action Space Size` to 4 when using the discrete
The Agent is the actor that observes and takes actions in the environment. In
the 3D Balance Ball environment, the Agent components are placed on the twelve
Platform GameObjects. The base Agent object has a few properties that affect its
behavior:
* **Brain** — Every Agent must have a Brain. The Brain determines how an Agent
makes decisions. All the Agents in the 3D Balance Ball scene share the same
Brain.
* **Visual Observations** — Defines any Camera objects used by the Agent to
observe its environment. 3D Balance Ball does not use camera observations.
* **Max Step** — Defines how many simulation steps can occur before the Agent
decides it is done. In 3D Balance Ball, an Agent restarts after 5000 steps.
* **Reset On Done** — Defines whether an Agent starts over when it is finished.
3D Balance Ball sets this true so that the Agent restarts after reaching the
**Max Step** count or after dropping the ball.
Perhaps the more interesting aspect of an Agent is the Agent subclass
implementation. When you create an Agent, you must extend the base Agent class;
a minimal sketch follows the list of methods below.
* Agent.AgentReset() — Called when the Agent resets, including at the beginning
of a session. The Ball3DAgent class uses the reset function to reset the
platform and ball. The function randomizes the reset values so that the
training generalizes to more than a specific starting position and platform
attitude.
* Agent.CollectObservations() — Called every simulation step. Responsible for
collecting the Agent's observations of the environment. Since the Brain
instance assigned to the Agent is set to the continuous vector observation
space with a state size of 8, the `CollectObservations()` must call
`AddVectorObs` 8 times.
* Agent.AgentAction() — Called every simulation step. Receives the action chosen
by the Brain. The Ball3DAgent example handles both the continuous and the
discrete action space types. There isn't actually much difference between the
two state types in this environment — both vector action spaces result in a
small change in platform rotation at each step. The `AgentAction()` function
assigns a reward to the Agent; in this example, an Agent receives a small
positive reward for each step it keeps the ball on the platform and a larger,
negative reward for dropping the ball. An Agent is also marked as done when it
drops the ball so that it will reset with a new ball for the next simulation
step.
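
Putting these methods together, a skeletal Agent subclass might look like the
following sketch; the reward values, the `ball` field, and the drop test are
simplified placeholders rather than the actual Ball3DAgent implementation.

```csharp
using MLAgents;
using UnityEngine;

public class BalanceAgentSketch : Agent
{
    public GameObject ball;  // assigned in the Unity Inspector

    public override void AgentReset()
    {
        // Randomize the platform rotation and the ball's position here so
        // training generalizes beyond a single starting configuration.
    }

    public override void CollectObservations()
    {
        // Add the eight observation values here (see the sketch in the
        // Vector Observation Space section above).
    }

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Interpret the continuous actions as small changes to the platform
        // rotation (omitted here), then assign a small reward for keeping
        // the ball up.
        AddReward(0.1f);

        // Penalize the agent and end the episode when the ball is dropped.
        if (ball.transform.position.y < transform.position.y - 1f)
        {
            AddReward(-1f);
            Done();
        }
    }
}
```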
Now that we have an environment, we can perform the training.
In order to train an agent to correctly balance the ball, we will use a
Reinforcement Learning algorithm called Proximal Policy Optimization (PPO). This
is a method that has been shown to be safe, efficient, and more general purpose
than many other RL algorithms; as such, we have chosen it as the example
algorithm for use with the ML-Agents toolkit. For more information on PPO,
OpenAI has a recent [blog post](https://blog.openai.com/openai-baselines-ppo/)
To train the agents within the Ball Balance environment, we will be using the
Python package. We have provided a convenient script called `mlagents-learn`
which accepts arguments used to configure both training and inference phases.
We can use `run_id` to identify the experiment and create a folder where the
model and summary statistics are stored. When using TensorBoard to observe the
training statistics, it helps to set this to a sequential value for each
training run. In other words, "BalanceBall1" for the first run, "BalanceBall2"
for the second, and so on. If you don't, the summaries for every training run
are saved to the same directory and will all be included on the same graph.
To summarize, go to your command line, enter the `ml-agents` directory and type:
```sh
mlagents-learn config/trainer_config.yaml --run-id=<run-identifier> --train
```
When the message _"Start training by pressing the Play button in the Unity Editor"_ is displayed on the screen, you can press the :arrow_forward: button in Unity to start training in the Editor.
When the message _"Start training by pressing the Play button in the Unity
Editor"_ is displayed on the screen, you can press the :arrow_forward: button in
Unity to start training in the Editor.
**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.
The `--train` flag tells the ML-Agents toolkit to run in training mode.
**Note**: You can train using an executable rather than the Editor. To do so,
follow the instructions in
[Using an Executable](Learning-Environment-Executable.md).
Once you start training using `mlagents-learn` in the way described in the
previous section, the `ml-agents` directory will contain a `summaries`
directory. In order to observe the training process in more detail, you can use
TensorBoard. From the command line run:
```sh
tensorboard --logdir=summaries
```
* Lesson - only interesting when performing [curriculum
training](Training-Curriculum-Learning.md). This is not used in the 3D Balance
Ball environment.
* Cumulative Reward - The mean cumulative episode reward over all agents. Should
increase during a successful training session.
* Entropy - How random the decisions of the model are. Should slowly decrease
during a successful training process. If it decreases too quickly, the `beta`
hyperparameter should be increased.
* Episode Length - The mean length of each episode in the environment for all
agents.
* Learning Rate - How large a step the training algorithm takes as it searches
for the optimal policy. Should decrease over time.
* Policy Loss - The mean magnitude of the policy loss function. Correlates to
  how much the policy (process for deciding actions) is changing. The magnitude
  of this should decrease during a successful training session.
* Value Estimate - The mean value estimate for all states visited by the agent.
  Should increase during a successful training session.
* Value Loss - The mean loss of the value function update. Correlates to how
  well the model is able to predict the value of each state. This should
  decrease during a successful training session.
Once the training process completes, and the training process saves the model
(denoted by the `Saved Model` message) you can add it to the Unity project and
use it with Agents having an **Internal** Brain type. **Note:** Do not just
close the Unity Window once the `Saved Model` message appears. Either wait for
the training process to close the window or press Ctrl+C at the command-line
prompt. If you simply close the window manually, the .bytes file containing the
trained model is not exported into the ml-agents folder.
Because TensorFlowSharp support is still experimental, it is disabled by
default. In order to enable it, you must follow these steps. Please note that
the `Internal` Brain mode will only be available once these steps are completed.
To set up the TensorFlowSharp Support, follow the
[Setting up ML-Agents Toolkit within Unity](Basic-Guide.md#setting-up-ml-agents-within-unity)
section of the Basic Guide page.
To embed the trained model into Unity, follow the later part of the
[Training the Brain with Reinforcement Learning](Basic-Guide.md#training-the-brain-with-reinforcement-learning)
section of the Basic Guide page.

docs/Glossary.md


# ML-Agents Toolkit Glossary
* **Academy** - Unity Component which controls timing, reset, and
training/inference settings of the environment.
* **Action** - The carrying-out of a decision on the part of an agent within the
environment.
* **Agent** - Unity Component which produces observations and takes actions in
  the environment. An Agent's actions are determined by decisions produced by a
  linked Brain.
* **Brain** - Unity Component which makes decisions for the agents linked to it.
* **Decision** - The specification produced by a Brain for an action to be
carried out given an observation.
* **Editor** - The Unity Editor, which may include any pane (e.g. Hierarchy,
Scene, Inspector).
* **Environment** - The Unity scene which contains Agents, Academy, and Brains.
* **FixedUpdate** - Unity method called each time the game engine is
  stepped. ML-Agents logic should be placed here.
* **Frame** - An instance of rendering the main camera for the display.
Corresponds to each `Update` call of the game engine.
* **Observation** - Partial information describing the state of the environment
available to a given agent. (e.g. Vector, Visual, Text)
* **Policy** - Function for producing decisions from observations.
* **Reward** - Signal provided at every step used to indicate desirability of an
agent’s action within the current state of the environment.
* **State** - The underlying properties of the environment (including all agents
within it) at a given time.
* **Step** - Corresponds to each `FixedUpdate` call of the game engine. Is the
smallest atomic change to the state possible.
* **Update** - Unity function called each time a frame is rendered. ML-Agents
logic should not be placed here.
* **External Coordinator** - ML-Agents class responsible for communication with
outside processes (in this case, the Python API).
* **Trainer** - Python class which is responsible for training a given External
Brain. Contains TensorFlow graph which makes decisions for External Brain.

docs/Installation-Windows.md


# Installing ML-Agents Toolkit for Windows
The ML-Agents toolkit supports Windows 10. While it might be possible to run the
ML-Agents toolkit using other versions of Windows, it has not been tested on
other versions. Furthermore, the ML-Agents toolkit has not been tested on a
Windows VM such as Bootcamp or Parallels.
To use the ML-Agents toolkit, you install Python and the required Python
packages as outlined below. This guide also covers how to set up GPU-based
training (for advanced users). GPU-based training is not required for the v0.4
release of the ML-Agents toolkit. However, training on a GPU might be required
by future versions and features.
[Download](https://www.anaconda.com/download/#windows) and install Anaconda for
Windows. By using Anaconda, you can manage separate environments for different
distributions of Python. Python 3.5 or 3.6 is required as we no longer support
Python 2. In this guide, we are using Python version 3.6 and Anaconda version
5.1
([64-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86_64.exe)
or [32-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86.exe)
direct links).
<img src="images/anaconda_install.PNG"
alt="Anaconda Install"
width="500" border="10" />
<img src="images/anaconda_install.PNG"
alt="Anaconda Install"
width="500" border="10" />
We recommend the default _advanced installation options_. However, select the
options appropriate for your specific situation.
<img src="images/anaconda_default.PNG"
alt="Anaconda Install"
width="500" border="10" />
<img src="images/anaconda_default.PNG" alt="Anaconda Install" width="500" border="10" />
After installation, you must open __Anaconda Navigator__ to finish the setup.
From the Windows search bar, type _anaconda navigator_. You can close Anaconda
Navigator after it opens.
If environment variables were not created, you will see the error "conda is not
recognized as an internal or external command" when you type `conda` into the
command line. To solve this you will need to set the environment variable
correctly.
Type `environment variables` in the search bar (this can be reached by hitting
the Windows key or the bottom left Windows button). You should see an option
called __Edit the system environment variables__.
<img src="images/edit_env_var.png"
alt="edit env variables"
width="250" border="10" />
<img src="images/edit_env_var.png"
alt="edit env variables"
width="250" border="10" />
From here, click the __Environment Variables__ button. Double click "Path" under
__System variable__ to edit the "Path" variable, click __New__ to add the
following new paths.
```console
You will create a new [Conda environment](https://conda.io/docs/) to be used
with the ML-Agents toolkit. This means that all the packages that you install
are localized to just this environment. It will not affect any other
installation of Python or other environments. Whenever you want to run
ML-Agents, you will need to activate this Conda environment.
To create a new Conda environment, open a new Anaconda Prompt (_Anaconda Prompt_
in the search bar) and type in the following command:
```sh
conda create -n ml-agents python=3.6
```
You may be asked to install new packages. Type `y` and press enter _(make sure
you are connected to the internet)_. You must install these required packages.
The new Conda environment is called ml-agents and uses Python version 3.6.
<img src="images/conda_new.PNG"
alt="Anaconda Install"
width="500" border="10" />
<img src="images/conda_new.PNG" alt="Anaconda Install" width="500" border="10" />
To use this environment, you must activate it. _(To use this environment in the
future, you can run the same command)_. In the same Anaconda Prompt, type in the
following command:
```sh
activate ml-agents
```
Next, install `tensorflow`. Install this package using `pip` - which is a
package management system used to install Python packages. Latest versions of
TensorFlow won't work, so you will need to make sure that you install version
1.7.1. In the same Anaconda Prompt, type in the following command _(make sure
you are connected to the internet)_:
```sh
pip install tensorflow==1.7.1
```
The ML-Agents toolkit depends on a number of Python packages. Use `pip` to
install these Python dependencies.
If you haven't already, clone the ML-Agents Toolkit Github repository to your
local computer. You can do this using Git ([download
here](https://git-scm.com/download/win)) and running the following commands in
an Anaconda Prompt _(if you open a new prompt, be sure to activate the ml-agents
Conda environment by typing `activate ml-agents`)_:
```sh
git clone https://github.com/Unity-Technologies/ml-agents.git
```
If you don't want to use Git, you can always directly download all the files
[here](https://github.com/Unity-Technologies/ml-agents/archive/master.zip).
In our example, the files are located in `C:\Downloads`. After you have either
cloned or downloaded the files, from the Anaconda Prompt, change to the
`ml-agents` subdirectory inside the ml-agents directory:
```console
cd C:\Downloads\ml-agents\ml-agents
```
Make sure you are connected to the internet and then type in the Anaconda
Prompt:
```sh
This will complete the installation of all the required Python packages to run
the ML-Agents toolkit.
A GPU is not required for the ML-Agents toolkit and will not significantly speed
up PPO training at the moment (although future versions and features may benefit
from a GPU). This is a guide for advanced users who want to train using GPUs.
Additionally, you will need to check if your GPU is CUDA compatible. Please
check Nvidia's page [here](https://developer.nvidia.com/cuda-gpus).
[Download](https://developer.nvidia.com/cuda-toolkit-archive) and install the
CUDA toolkit 9.0 from Nvidia's archive. The toolkit includes GPU-accelerated
libraries, debugging and optimization tools, a C/C++ (Visual Studio 2017)
compiler and a runtime library, and is needed to run the ML-Agents toolkit. In
this guide, we are using version
[9.0.176](https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe).
Before installing, please make sure you __close any running instances of Unity
or Visual Studio__.
Run the installer and select the Express option. Note the directory where you
installed the CUDA toolkit. In this guide, we installed in the directory
`C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`
[Download](https://developer.nvidia.com/cudnn) and install the cuDNN library
from Nvidia. cuDNN is a GPU-accelerated library of primitives for deep neural
networks. Before you can download, you will need to sign up for free to the
Nvidia Developer Program.
<img src="images/cuDNN_membership_required.png"
alt="cuDNN membership required"
width="500" border="10" />
<img src="images/cuDNN_membership_required.png"
alt="cuDNN membership required"
width="500" border="10" />
Once you've signed up, go back to the cuDNN
[downloads page](https://developer.nvidia.com/cudnn).
You may or may not be asked to fill out a short survey. When you get to the list
of cuDNN releases, __make sure you are downloading the right version for the
CUDA toolkit you installed in Step 1.__ In this guide, we are using version
7.0.5 for CUDA toolkit version 9.0
([direct link](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.0.5/prod/9.0_20171129/cudnn-9.0-windows10-x64-v7)).
After you have downloaded the cuDNN files, you will need to extract the files
into the CUDA toolkit directory. In the cuDNN zip file, there are three folders
called `bin`, `include`, and `lib`.
<img src="images/cudnn_zip_files.PNG"
alt="cuDNN zip files"
width="500" border="10" />
<img src="images/cudnn_zip_files.PNG"
alt="cuDNN zip files"
width="500" border="10" />
Copy these three folders into the CUDA toolkit directory. The CUDA toolkit
directory is located at
`C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`
<img src="images/cuda_toolkit_directory.PNG"
alt="cuda toolkit directory"
width="500" border="10" />
<img src="images/cuda_toolkit_directory.PNG"
alt="cuda toolkit directory"
width="500" border="10" />
</p>
### Set Environment Variables

To set the environment variable, type `environment variables` in the search bar
(this can be reached by hitting the Windows key or the bottom left Windows
button). You should see an option called __Edit the system environment
variables__.
<img src="images/edit_env_var.png"
alt="edit env variables"
width="250" border="10" />
<img src="images/edit_env_var.png"
alt="edit env variables"
width="250" border="10" />
From here, click the __Environment Variables__ button. Click __New__ to add a
new system variable _(make sure you do this under __System variables__ and not
User variables)_.
<img src="images/new_system_variable.PNG"
alt="new system variable"
width="500" border="10" />
<img src="images/new_system_variable.PNG"
alt="new system variable"
width="500" border="10" />
For __Variable Name__, enter `CUDA_HOME`. For the variable value, put the
directory location for the CUDA toolkit. In this guide, the directory location
is `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`. Press __OK__ once.
<img src="images/system_variable_name_value.PNG"
alt="system variable names and values"
width="500" border="10" />
<img src="images/system_variable_name_value.PNG"
alt="system variable names and values"
width="500" border="10" />
To set the two path variables, inside the same __Environment Variables__ window
and under the second box called __System Variables__, find a variable called
`Path` and click __Edit__. You will add two directories to the list. For this
guide, the two entries would look like:
```console
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\extras\CUPTI\libx64
```
Make sure to replace the relevant directory location with the one you have
installed. _Please note that case sensitivity matters_.
<img src="images/path_variables.PNG"
alt="Path variables"
<img src="images/path_variables.PNG"
alt="Path variables"
Next, install `tensorflow-gpu` using `pip`. You'll need version 1.7.1. In an
Anaconda Prompt with the Conda environment ml-agents activated, type in the
following command to uninstall the CPU-only TensorFlow and install the GPU
build of TensorFlow _(make sure you are connected to the internet)_:
```sh
pip uninstall tensorflow
pip install tensorflow-gpu==1.7.1
```
Lastly, you should test to see if everything installed properly and that
TensorFlow can identify your GPU. In the same Anaconda Prompt, type in the
following command:
```python
import tensorflow as tf

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
```

You should see something similar to:

```console
(TensorFlow device placement output listing your GPU)
```
We would like to thank
[Jason Weimann](https://unity3d.college/2017/10/25/machine-learning-in-unity3d-setting-up-the-environment-tensorflow-for-agentml-on-windows-10/)
and
[Nitish S. Mutha](http://blog.nitishmutha.com/tensorflow/2017/01/22/TensorFlow-with-gpu-for-windows.html)
for writing the original articles which were used to create this guide.

docs/Installation.md


# Installation
To install and use ML-Agents, you need to install Unity, clone this repository
and install Python with additional dependencies. Each of the subsections below
overviews each step, in addition to a Docker set-up.
## Install **Unity 2017.4** or Later
Download and install Unity. If you would like to use our Docker set-up
(introduced later), make sure to select the _Linux Build Support_ component when
installing Unity.
<img src="images/unity_linux_build_support.png"
alt="Linux Build Support"
width="500" border="10" />
<img src="images/unity_linux_build_support.png"
alt="Linux Build Support"
width="500" border="10" />
## Clone the ML-Agents Toolkit Repository
Once installed, you will want to clone the ML-Agents Toolkit GitHub repository.

```sh
git clone https://github.com/Unity-Technologies/ml-agents.git
```

The `UnitySDK` subdirectory contains the Unity Assets to add to your projects.
It also contains many [example environments](Learning-Environment-Examples.md)
that can be used to help get you familiar with Unity.

The `ml-agents` subdirectory contains Python packages which provide
trainers and a Python API to interface with Unity.

The `gym-unity` subdirectory contains a package to interface with OpenAI Gym.
## Install Python and mlagents Package
In order to use ML-Agents toolkit, you need Python 3.6 along with the
dependencies listed in the [requirements file](../ml-agents/requirements.txt).
- [TensorFlow](Background-TensorFlow.md)
- [Jupyter](Background-Jupyter.md)
### NOTES
- We do not currently support Python 3.7 or Python 3.5.
- If you are using Anaconda and are having trouble with TensorFlow, please see
the following
[note](https://www.tensorflow.org/install/install_mac#installing_with_anaconda)
on how to install TensorFlow in an Anaconda environment.
If you are a Windows user who is new to Python and TensorFlow, follow [this
guide](Installation-Windows.md) to set up your Python environment.
[Download](https://www.python.org/downloads/) and install Python 3.6 if you do not
already have it.
If your Python environment doesn't include `pip3`, see these
To install the dependencies and `mlagents` Python package, enter the
`ml-agents/` subdirectory and run from the command line:

```sh
pip3 install .
```
If you installed this correctly, you should be able to run
`mlagents-learn --help`
If you'd like to use Docker for ML-Agents, please follow
[this guide](Using-Docker.md).
The [Basic Guide](Basic-Guide.md) page contains several short tutorials on
setting up the ML-Agents toolkit within Unity, running a pre-trained model, in
addition to building and training environments.
If you run into any problems regarding ML-Agents, refer to our [FAQ](FAQ.md) and
our [Limitations](Limitations.md) pages. If you can't find anything please
[submit an issue](https://github.com/Unity-Technologies/ml-agents/issues) and
make sure to cite relevant information on OS, Python version, and exact error
message (whenever possible).

docs/Learning-Environment-Best-Practices.md


# Environment Design Best Practices
## General
* It is often helpful to start with the simplest version of the problem, to
ensure the agent can learn it. From there increase complexity over time. This
can either be done manually, or via Curriculum Learning, where a set of
lessons which progressively increase in difficulty are presented to the agent
([learn more here](Training-Curriculum-Learning.md)).
* When possible, it is often helpful to ensure that you can complete the task by
using a Player Brain to control the agent.
* It is often helpful to make many copies of the agent, and attach the Brain to
be trained to all of these agents. In this way the Brain can get more feedback
information from all of these agents, which helps it train faster.
* The magnitude of any given reward should typically not be greater than 1.0 in
  order to ensure a more stable learning process.
* Positive rewards are often more helpful to shaping the desired behavior of an
  agent than negative rewards.
* For locomotion tasks, a small positive reward (+0.1) for forward velocity is
  typically used.
* If you want the agent to finish a task quickly, it is often helpful to provide
  a small penalty every step (-0.05) that the agent does not complete the task.
  In this case completion of the task should also coincide with the end of the
  episode. A sketch of this reward structure is shown after this list.
* Overly-large negative rewards can cause undesirable behavior where an agent
  learns to avoid any behavior which might produce the negative reward, even if
  it is also behavior which can eventually lead to a positive reward.
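As a concrete illustration of these guidelines, here is a minimal, hypothetical
sketch of an Agent that combines a small per-step penalty with a bounded success
reward. The class name and the `reachedGoal` check are placeholders, not part of
any example environment:

```csharp
using MLAgents;

// Hypothetical Agent showing the reward guidelines above. "reachedGoal" stands
// in for whatever success condition your environment defines.
public class ShapedRewardAgent : Agent
{
    bool reachedGoal;

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Small per-step penalty (-0.05) encourages finishing the task quickly.
        AddReward(-0.05f);

        if (reachedGoal)
        {
            // Success reward kept at a magnitude of 1.0 or less.
            AddReward(1.0f);
            // Completing the task coincides with the end of the episode.
            Done();
        }
    }
}
```

Keeping the per-step penalty small relative to the terminal reward helps ensure
that completing the task remains clearly preferable to any intermediate
behavior.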
* Vector Observations should include all variables relevant to allowing the
agent to take the optimally informed decision.
* In cases where Vector Observations need to be remembered or compared over
time, increase the `Stacked Vectors` value to allow the agent to keep track of
multiple observations into the past.
* Categorical variables such as type of object (Sword, Shield, Bow) should be
encoded in one-hot fashion (i.e. `3` > `0, 0, 1`).
* Besides encoding non-numeric values, all inputs should be normalized to be in
the range 0 to +1 (or -1 to 1). For example, the `x` position information of
an agent where the maximum possible value is `maxValue` should be recorded as
`AddVectorObs(transform.position.x / maxValue);` rather than
`AddVectorObs(transform.position.x);`. See the equation below for one approach
of normalization.
* Positional information of relevant GameObjects should be encoded in relative
coordinates wherever possible. This is often relative to the agent position.
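The normalization referenced above can be written as
`normalizedValue = (currentValue - minValue) / (maxValue - minValue)`. Below is
a minimal, hypothetical sketch of how an Agent might apply it, together with a
one-hot encoding of a three-valued categorical variable; the class, field, and
helper names are illustrative only:

```csharp
using MLAgents;
using UnityEngine;

// Hypothetical Agent fragment showing normalized and one-hot observations.
public class NormalizedObservationAgent : Agent
{
    public float maxX = 5f;   // assumed half-width of the play area
    public int itemType = 2;  // 0-based category index, e.g. Sword=0, Shield=1, Bow=2

    static float Normalize(float value, float min, float max)
    {
        return (value - min) / (max - min);
    }

    public override void CollectObservations()
    {
        // Scale the x position into the range [0, 1] before observing it.
        AddVectorObs(Normalize(transform.position.x, -maxX, maxX));

        // One-hot encoding: the third category becomes 0, 0, 1.
        for (int i = 0; i < 3; i++)
        {
            AddVectorObs(i == itemType ? 1.0f : 0.0f);
        }
    }
}
```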
* When using continuous control, action values should be clipped to an
  appropriate range. The provided PPO model automatically clips these values
  between -1 and 1, but third party training systems may not do so. A sketch of
  clipping on the Unity side is shown after this list.
* Be sure to set the Vector Action's Space Size to the number of used Vector
  Actions, and not greater, as doing the latter can interfere with the
  efficiency of the training process.
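As a minimal, hypothetical sketch of the clipping advice above (the class name
and `forceMultiplier` field are assumptions, not part of the example
environments):

```csharp
using MLAgents;
using UnityEngine;

// Hypothetical Agent that clips incoming continuous actions to [-1, 1]
// before applying them, in case the training system did not clip them.
public class ClippedActionAgent : Agent
{
    public float forceMultiplier = 10f;   // illustrative scaling factor
    Rigidbody rBody;

    void Start()
    {
        rBody = GetComponent<Rigidbody>();
    }

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        Vector3 controlSignal = Vector3.zero;
        controlSignal.x = Mathf.Clamp(vectorAction[0], -1f, 1f);
        controlSignal.z = Mathf.Clamp(vectorAction[1], -1f, 1f);
        rBody.AddForce(controlSignal * forceMultiplier);
    }
}
```

Clipping on the Unity side is a cheap safeguard; it changes nothing for trainers
that already clip their outputs.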

docs/Learning-Environment-Create-New.md


# Making a New Learning Environment
This tutorial walks through the process of creating a Unity Environment. A Unity
Environment is an application built using the Unity Engine which can be used to
train Reinforcement Learning Agents.
In this example, we will train a ball to roll to a randomly placed cube. The
ball also learns to avoid falling off the platform.
Using the ML-Agents toolkit in a Unity project involves the following basic
steps:
1. Create an environment for your agents to live in. An environment can range
   from a simple physical simulation containing a few objects to an entire game
   or ecosystem.
2. Implement an Academy subclass and add it to a GameObject in the Unity scene
   containing the environment. This GameObject will serve as the parent for any
   Brain objects in the scene. Your Academy class can implement a few optional
   methods to update the scene independently of any agents. For example, you
   can add, move, or delete agents and other entities in the environment.
3. Add one or more Brain objects to the scene as children of the Academy.
4. Implement your Agent subclasses. An Agent subclass defines the code an Agent
   uses to observe its environment, to carry out assigned actions, and to
   calculate the rewards used for reinforcement training. You can also
   implement optional methods to reset the Agent when it has finished or failed
   its task (a minimal skeleton of steps 2 and 4 is sketched after this list).
5. Add your Agent subclasses to appropriate GameObjects, typically, the object
   in the scene that represents the Agent in the simulation. Each Agent object
   must be assigned a Brain object.
6. If training, set the Brain type to External and
   [run the training process](Training-ML-Agents.md).
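As a rough, hypothetical skeleton of steps 2 and 4 (the class names are
placeholders, and the method bodies are filled in by the RollerBall example
later on this page):

```csharp
using MLAgents;

// Step 2: the Academy subclass can stay empty when no global reset or
// configuration logic is needed. (In Unity, each class normally lives in its
// own script file.)
public class MyAcademy : Academy { }

// Step 4: the Agent subclass observes, acts, and assigns rewards.
public class MyAgent : Agent
{
    public override void CollectObservations()
    {
        // AddVectorObs(...) calls describing the environment go here.
    }

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Apply the chosen action, then call AddReward(...) / Done() as needed.
    }

    public override void AgentReset()
    {
        // Return the Agent (and anything it controls) to a valid starting state.
    }
}
```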
**Note:** If you are unfamiliar with Unity, refer to
[Learning the interface](https://docs.unity3d.com/Manual/LearningtheInterface.html)
in the Unity Manual if an Editor task isn't explained sufficiently in this
tutorial.
The first task to accomplish is simply creating a new Unity project and
importing the ML-Agents assets into it:
2. In a file system window, navigate to the folder containing your cloned
ML-Agents repository.
3. Drag the `ML-Agents` folder from `UnitySDK/Assets` to the Unity Editor
Project window.
## Create the Environment
Next, we will create a very simple scene to act as our ML-Agents environment.
The "physical" components of the environment include a Plane to act as the floor
for the Agent to move around on, a Cube to act as the goal or target for the
agent to seek, and a Sphere to represent the Agent itself.
### Create the floor plane
5. On the Plane's Mesh Renderer, expand the Materials property and change the
default-material to *floor*.
(To set a new material, click the small circle icon next to the current material
name. This opens the **Object Picker** dialog so that you can choose a different
material from the list of all materials currently in the project.)
### Add the Target Cube
5. On the Cube's Mesh Renderer, expand the Materials property and change the
default-material to *Block*.
### Add the Agent Sphere
5. On the Sphere's Mesh Renderer, expand the Materials property and change the
default-material to *checker 1*.
7. Add the Physics/Rigidbody component to the Sphere. (Adding a Rigidbody)
Note that we will create an Agent subclass to add to this GameObject as a
component later in the tutorial.
### Add Empty GameObjects to Hold the Academy and Brain
1. Right click in Hierarchy window, select Create Empty.
2. Name the GameObject "Academy"

![The scene hierarchy](images/mlagents-NewTutHierarchy.png)
You can adjust the camera angles to give a better view of the scene at runtime.
The next steps will be to create and add the ML-Agent components.
The Academy object coordinates the ML-Agents in the scene and drives the
decision-making portion of the simulation loop. Every ML-Agent scene needs one
Academy instance. Since the base Academy class is abstract, you must make your
own subclass even if you don't need to use any of the methods for a particular
environment.
First, add a New Script component to the Academy GameObject created earlier:
1. Select the Academy GameObject to view it in the Inspector window.
2. Click **Add Component**.

Next, edit the new `RollerAcademy` script:
1. In the Unity Project window, double-click the `RollerAcademy` script to open
it in your code editor. (By default new scripts are placed directly in the
**Assets** folder.)
In such a basic scene, we don't need the Academy to initialize, reset, or
otherwise control any objects in the environment so we have the simplest
possible Academy implementation:
```csharp
using MLAgents;

public class RollerAcademy : Academy { }
```
The default settings for the Academy properties are also fine for this
environment, so we don't need to change anything for the RollerAcademy component
in the Inspector window.
The Brain object encapsulates the decision making process. An Agent sends its
observations to its Brain and expects a decision in return. The Brain Type
setting determines how the Brain makes decisions. Unlike the Academy and Agent
classes, you don't make your own Brain subclasses.
1. Select the Brain GameObject created earlier to show its properties in the
Inspector window.
We will come back to the Brain properties later, but leave the Brain Type as
**Player** for now.
![The Brain default properties](images/mlagents-NewTutBrain.png)

Then, edit the new `RollerAgent` script:
1. In the Unity Project window, double-click the `RollerAgent` script to open it
in your code editor.
3. Delete the `Update()` method, but we will use the `Start()` function, so
leave it alone for now.
So far, these are the basic steps that you would use to add ML-Agents to any
Unity project. Next, we will add the logic that will let our Agent learn to roll
to the cube using reinforcement learning.
In this simple scenario, we don't use the Academy object to control the
environment. If we wanted to change the environment, for example change the size
of the floor or add or remove agents or other objects before or during the
simulation, we could implement the appropriate methods in the Academy. Instead,
we will have the Agent do all the work of resetting itself and the target when
it succeeds or falls trying.
### Initialization and Resetting the Agent
When the Agent reaches its target, it marks itself done and its Agent reset
function moves the target to a random location. In addition, if the Agent rolls
off the platform, the reset function puts it back onto the floor.
To move the target GameObject, we need a reference to its Transform (which
stores a GameObject's position, orientation and scale in the 3D world). To get
this reference, add a public field of type `Transform` to the RollerAgent class.
Public fields of a component in Unity get displayed in the Inspector window,
allowing you to choose which GameObject to use as the target in the Unity
Editor. To reset the Agent's velocity (and later to apply force to move the
agent) we need a reference to the Rigidbody component. A
[Rigidbody](https://docs.unity3d.com/ScriptReference/Rigidbody.html) is Unity's
primary element for physics simulation. (See
[Physics](https://docs.unity3d.com/Manual/PhysicsSection.html) for full
documentation of Unity physics.) Since the Rigidbody component is on the same
GameObject as our Agent script, the best way to get this reference is using
`GameObject.GetComponent<T>()`, which we can call in our script's `Start()`
method.
So far, our RollerAgent script looks like:
```csharp
using System.Collections.Generic;
using UnityEngine;
using MLAgents;

public class RollerAgent : Agent
{
    Rigidbody rBody;
    void Start () {
        rBody = GetComponent<Rigidbody>();
    }

    public Transform Target;
    public override void AgentReset()
    {
        if (this.transform.position.y < -1.0)
        {
            // The Agent fell
            this.transform.position = Vector3.zero;
            this.rBody.angularVelocity = Vector3.zero;
            this.rBody.velocity = Vector3.zero;
        }
        else
        {
            // Move the target to a new spot
            Target.position = new Vector3(Random.value * 8 - 4,
                                          0.5f,
                                          Random.value * 8 - 4);
        }
    }
}
```
Next, let's implement the Agent.CollectObservations() function.
### Observing the Environment
The Agent sends the information we collect to the Brain, which uses it to make a
decision. When you train the Agent (or use a trained model), the data is fed
into a neural network as a feature vector. For an Agent to successfully learn a
task, we need to provide the correct information. A good rule of thumb for
deciding what information to collect is to consider what you would need to
calculate an analytical solution to the problem.
In our case, the information our Agent collects includes:
* Position of the target. In general, it is better to use the relative position
of other objects rather than the absolute position for more generalizable
training. Note that the Agent only collects the x and z coordinates since the
floor is aligned with the x-z plane and the y component of the target's
position never changes.
```csharp
// Calculate relative position
Vector3 relativePosition = Target.position - this.transform.position;

// Relative position
AddVectorObs(relativePosition.x / 5);
AddVectorObs(relativePosition.z / 5);
```
* Position of the Agent itself within the confines of the floor. This data is
collected as the Agent's distance from each edge of the floor.
```csharp
// Distance to edges of platform
AddVectorObs((this.transform.position.x + 5) / 5);
AddVectorObs((this.transform.position.x - 5) / 5);
AddVectorObs((this.transform.position.z + 5) / 5);
AddVectorObs((this.transform.position.z - 5) / 5);
```
* The velocity of the Agent. This helps the Agent learn to control its speed so
it doesn't overshoot the target and roll off the platform.
```csharp
// Agent velocity
AddVectorObs(rBody.velocity.x / 5);
AddVectorObs(rBody.velocity.z / 5);
```
All the values are divided by 5 to normalize the inputs to the neural network to
the range [-1,1]. (The number five is used because the platform is 10 units
across.)
In total, the state observation contains 8 values and we need to use the
continuous state space when we get around to setting the Brain properties:

```csharp
public override void CollectObservations()
{
    // Calculate relative position
    Vector3 relativePosition = Target.position - this.transform.position;
    AddVectorObs(relativePosition.x / 5);
    AddVectorObs(relativePosition.z / 5);
    // Distance to edges of platform
    AddVectorObs((this.transform.position.x + 5) / 5);
    AddVectorObs((this.transform.position.x - 5) / 5);
    AddVectorObs((this.transform.position.z + 5) / 5);
    AddVectorObs((this.transform.position.z - 5) / 5);
    // Agent velocity
    AddVectorObs(rBody.velocity.x / 5);
    AddVectorObs(rBody.velocity.z / 5);
}
```
The final part of the Agent code is the Agent.AgentAction() function, which
receives the decision from the Brain.
### Actions
The decision of the Brain comes in the form of an action array passed to the
`AgentAction()` function. The number of elements in this array is determined by
the `Vector Action Space Type` and `Vector Action Space Size` settings of the
agent's Brain. The RollerAgent uses the continuous vector action space and needs
two continuous control signals from the Brain. Thus, we will set the Brain
`Vector Action Size` to 2. The first element, `action[0]`, determines the force
applied along the x axis; `action[1]` determines the force applied along the z
axis. (If we allowed the Agent to move in three dimensions, then we would need
to set `Vector Action Size` to 3.) Each of these values returned by the network
is between `-1` and `1`. Note the Brain really has no idea what the values in
the action array mean. The training process just adjusts the action values in
response to the observation input and then sees what kind of rewards it gets as
a result.
The RollerAgent applies the values from the `vectorAction` array to its
Rigidbody component, `rBody`, using the `Rigidbody.AddForce` function:
```csharp
Vector3 controlSignal = Vector3.zero;
controlSignal.x = vectorAction[0];
controlSignal.z = vectorAction[1];
rBody.AddForce(controlSignal * speed);
```
### Rewards
Reinforcement learning requires rewards. Assign rewards in the `AgentAction()`
function. The learning algorithm uses the rewards assigned to the Agent at each
step in the simulation and learning process to determine whether it is giving
the Agent the optimal actions. You want to reward an Agent for completing the
assigned task (reaching the Target cube, in this case) and punish the Agent if
it irrevocably fails (falls off the platform). You can sometimes speed up
training with sub-rewards that encourage behavior that helps the Agent complete
the task. For example, the RollerAgent reward system provides a small reward if
the Agent moves closer to the target in a step and a small negative reward at
each step which encourages the Agent to complete its task quickly.
The RollerAgent calculates the distance to detect when it reaches the target.
When it does, the code increments the Agent.reward variable by 1.0 and marks the
Agent as finished by setting it to done.
```csharp
float distanceToTarget = Vector3.Distance(this.transform.position,
                                          Target.position);
// Reached target
if (distanceToTarget < 1.42f)
{
    AddReward(1.0f);
    Done();
}
```
**Note:** When you mark an Agent as done, it stops its activity until it is
reset. You can have the Agent reset immediately, by setting the
Agent.ResetOnDone property to true in the inspector or you can wait for the
Academy to reset the environment. This RollerBall environment relies on the
`ResetOnDone` mechanism and doesn't set a `Max Steps` limit for the Academy (so
it never resets the environment).
To encourage the agent along, we also reward it for getting closer to the target (saving the previous distance measurement between steps):
```csharp
// Getting closer
if (distanceToTarget < previousDistance)
{
AddReward(0.1f);
}
```
Assigning a small negative reward at each step can also encourage an Agent to
finish its task more quickly:
```csharp
// Time penalty
AddReward(-0.05f);
```
Finally, to punish the Agent for falling off the platform, assign a large
negative reward and, of course, set the Agent to done so that it resets itself
in the next step:
```csharp
// Fell off platform
if (this.transform.position.y < 0)  // the Agent has dropped below the floor
{
    AddReward(-1.0f);
    Done();
}
```
### AgentAction()
With the action and reward logic outlined above, the final version of the
`AgentAction()` function looks like:
```csharp
public float speed = 10;
private float previousDistance = float.MaxValue;

public override void AgentAction(float[] vectorAction, string textAction)
{
    // Rewards
    float distanceToTarget = Vector3.Distance(this.transform.position,
                                              Target.position);

    // Reached target
    if (distanceToTarget < 1.42f)
    {
        AddReward(1.0f);
        Done();
    }

    // Getting closer
    if (distanceToTarget < previousDistance)
    {
        AddReward(0.1f);
    }

    // Time penalty
    AddReward(-0.05f);

    // Fell off platform
    if (this.transform.position.y < 0)
    {
        AddReward(-1.0f);
        Done();
    }

    previousDistance = distanceToTarget;

    // Actions, size = 2
    Vector3 controlSignal = Vector3.zero;
    controlSignal.x = vectorAction[0];
    controlSignal.z = vectorAction[1];
    rBody.AddForce(controlSignal * speed);
}
```
Note the `speed` and `previousDistance` class variables defined before the
function. Since `speed` is public, you can set the value from the Inspector
window.
Now that all the GameObjects and ML-Agent components are in place, it is time to
connect everything together in the Unity Editor. This involves assigning the
Brain object to the Agent, changing some of the Agent component's properties,
and setting the Brain properties so that they are compatible with our Agent
code.
1. Expand the Academy GameObject in the Hierarchy window, so that the Brain
object is visible.
2. Select the RollerAgent GameObject to show its properties in the Inspector
window.
3. Drag the Brain object from the Hierarchy window to the RollerAgent Brain
field.
4. Change `Decision Frequency` from `1` to `5`.
Also, drag the Target GameObject from the Hierarchy window to the RollerAgent
Target field.
Finally, select the Brain GameObject so that you can see its properties in the
Inspector window. Set the following properties:
* `Vector Observation Space Type` = **Continuous**
* `Vector Observation Space Size` = 8

## Testing the Environment
It is always a good idea to test your environment manually before embarking on
an extended training run. The reason we have left the Brain set to the
**Player** type is so that we can control the Agent using direct keyboard
control. But first, you need to define the keyboard to action mapping. Although
the RollerAgent only has an `Action Size` of two, we will use one key to specify
positive values and one to specify negative values for each action, for a total
of four keys.
3. Expand the **Continuous Player Actions** dictionary (only visible when using
a **Player** Brain).
4. Set **Size** to 4.
5. Set the following mappings:

| **Element** | **Key** | **Index** | **Value** |
| :---------- | :-----: | :-------: | :-------: |
| Element 2   | W       | 1         | 1         |
| Element 3   | S       | 1         | -1        |
The **Index** value corresponds to the index of the action array passed to the
`AgentAction()` function. **Value** is assigned to action[Index] when **Key** is
pressed.
Press **Play** to run the scene and use the WASD keys to move the Agent around
the platform. Make sure that there are no errors displayed in the Unity editor
Console window and that the Agent resets when it reaches its target or falls
from the platform. Note that for more involved debugging, the ML-Agents SDK
includes a convenient Monitor class that you can use to easily display Agent
status information in the Game window.
One additional test you can perform is to first ensure that your environment and
the Python API work as expected using the `notebooks/getting-started.ipynb`
[Jupyter notebook](Background-Jupyter.md). Within the notebook, be sure to set
`env_name` to the name of the environment file you specify when building this
environment.
Now you can train the Agent. To get ready for training, you must first change
the **Brain Type** from **Player** to **External**. From there, the process is
the same as described in [Training ML-Agents](Training-ML-Agents.md).
This section briefly reviews how to organize your scene when using Agents in
your Unity environment.
There are three kinds of game objects you need to include in your scene in order
to use Unity ML-Agents:
* Academy
* Brain
* Agents
* There can only be one Academy game object in a scene.
* You can have multiple Brain game objects, but they must be children of the
Academy game object.
Here is an example of what your scene hierarchy should look like:

55
docs/Learning-Environment-Design-Academy.md


# Creating an Academy
An Academy orchestrates all the Agent and Brain objects in a Unity scene. Every
scene containing Agents must contain a single Academy. To use an Academy, you
must create your own subclass. However, all the methods you can override are
optional.
Use the Academy methods to:

See [Reinforcement Learning in Unity](Learning-Environment-Design.md) for a
description of the timing of these method calls during a simulation.
Initialization is performed once in an Academy object's lifecycle. Use the
`InitializeAcademy()` method for any logic you would normally perform in the
standard Unity `Start()` or `Awake()` methods.
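For instance, a minimal Academy subclass that overrides `InitializeAcademy()`
might look like the sketch below (the class name and the log message are
illustrative, and the `MLAgents` namespace is assumed):

```csharp
using MLAgents;
using UnityEngine;

public class RollerAcademy : Academy
{
    public override void InitializeAcademy()
    {
        // One-time setup that would otherwise go in Start() or Awake().
        Debug.Log("Academy initialized");
    }
}
```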
**Note:** Because the base Academy implements an `Awake()` function, you must
not implement your own. Because of the way the Unity MonoBehaviour class is
defined, implementing your own `Awake()` function hides the base class version
and Unity will call yours instead. Likewise, do not implement a `FixedUpdate()`
function in your Academy subclass.
Implement an `AcademyReset()` function to alter the environment at the start of
each episode. For example, you might want to reset an Agent to its starting
position or move a goal to a random position. An environment resets when the
Academy `Max Steps` count is reached.
When you reset an environment, consider the factors that should change so that
training is generalizable to different conditions. For example, if you were
training a maze-solving agent, you would probably want to change the maze itself
for each training episode. Otherwise, the agent would probably only learn to
solve one particular maze, not mazes in general.
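As a rough sketch, an `AcademyReset()` override that randomizes a goal object at
the start of every episode might look like this (the `goal` field and the
position ranges are illustrative):

```csharp
public GameObject goal;

public override void AcademyReset()
{
    // Move the goal to a random spot so the trained policy generalizes
    // instead of memorizing a single layout.
    goal.transform.position = new Vector3(Random.Range(-4f, 4f),
                                          0.5f,
                                          Random.Range(-4f, 4f));
}
```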
The `AcademyStep()` function is called at every step in the simulation before
any Agents are updated. Use this function to update objects in the environment
at every step or during the episode between environment resets. For example, if
you want to add elements to the environment at random intervals, you can put the
logic for creating them in the `AcademyStep()` function.
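For example, a sketch of an `AcademyStep()` override that adds objects to the
environment at random intervals (the `collectiblePrefab` field, spawn
probability, and bounds are illustrative):

```csharp
public GameObject collectiblePrefab;

public override void AcademyStep()
{
    // Occasionally spawn a new object between environment resets.
    if (Random.value < 0.01f)
    {
        Instantiate(collectiblePrefab,
                    new Vector3(Random.Range(-4f, 4f), 0.5f, Random.Range(-4f, 4f)),
                    Quaternion.identity);
    }
}
```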
* `Max Steps` - Total number of steps per-episode. `0` corresponds to episodes
without a maximum number of steps. Once the step counter reaches maximum, the
environment will reset.
* `Configuration` - The engine-level settings which correspond to rendering
quality and engine speed.
* `Width` - Width of the environment window in pixels.
* `Height` - Height of the environment window in pixels.
* `Quality Level` - Rendering quality of environment. (Higher is better)
* `Time Scale` - Speed at which environment is run. (Higher is faster)
* `Target Frame Rate` - FPS engine attempts to maintain.
* `Reset Parameters` - List of custom parameters that can be changed in the
environment on reset.

469
docs/Learning-Environment-Design-Agents.md


# Agents
An agent is an actor that can observe its environment and decide on the best
course of action using those observations. Create Agents in Unity by extending
the Agent class. The most important aspects of creating agents that can
successfully learn are the observations the agent collects for
reinforcement learning and the reward you assign to estimate the value of the
agent's current state toward accomplishing its tasks.
An Agent passes its observations to its Brain. The Brain, then, makes a decision
and passes the chosen action back to the agent. Your agent code must execute the
action, for example, move the agent in one direction or another. In order to
[train an agent using reinforcement learning](Learning-Environment-Design.md),
your agent must calculate a reward value at each action. The reward is used to
discover the optimal decision-making policy. (A reward is not used by already
trained agents or for imitation learning.)
The Brain class abstracts out the decision making logic from the Agent itself so
that you can use the same Brain in multiple Agents. How a Brain makes its
decisions depends on the type of Brain it is. An **External** Brain simply
passes the observations from its Agents to an external process and then passes
the decisions made externally back to the Agents. An **Internal** Brain uses the
trained policy parameters to make decisions (and no longer adjusts the
parameters in search of a better decision). The other types of Brains do not
directly involve training, but you might find them useful as part of a training
project. See [Brains](Learning-Environment-Design-Brains.md).
The observation-decision-action-reward cycle repeats after a configurable number
of simulation steps (the frequency defaults to once-per-step). You can also set
up an Agent to request decisions on demand. Making decisions at regular step
intervals is generally most appropriate for physics-based simulations. Making
decisions on demand is generally appropriate for situations where Agents only
respond to specific events or take actions of variable duration. For example, an
agent in a robotic simulator that must provide fine-control of joint torques
should make its decisions every step of the simulation. On the other hand, an
agent that only needs to make decisions when certain game or simulation events
occur, should use on-demand decision making.
To control the frequency of step-based decision making, set the **Decision
Frequency** value for the Agent object in the Unity Inspector window. Agents
using the same Brain instance can use a different frequency. During simulation
steps in which no decision is requested, the Agent receives the same action
chosen by the previous decision.
On demand decision making allows Agents to request decisions from their Brains
only when needed instead of receiving decisions at a fixed frequency. This is
useful when the agents commit to an action for a variable number of steps or
when the agents cannot make decisions at the same time. This is typically the
case for turn-based games, games where agents must react to events, or games
where agents can take actions of variable duration.
When you turn on **On Demand Decisions** for an Agent, your agent code must call
the `Agent.RequestDecision()` function. This function call starts one iteration
of the observation-decision-action-reward cycle. The Brain invokes the Agent's
`CollectObservations()` method, makes a decision and returns it by calling the
`AgentAction()` method. The Brain waits for the Agent to request the next
decision before starting another iteration.
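As a sketch, a turn-based Agent with **On Demand Decisions** enabled might only
ask its Brain for a decision when its turn begins (the `OnTurnStarted()` hook is
illustrative; `RequestDecision()` is the Agent method described above):

```csharp
using MLAgents;

public class TurnBasedAgent : Agent
{
    // Called by your game logic when it becomes this Agent's turn.
    public void OnTurnStarted()
    {
        // Starts one observation-decision-action-reward cycle:
        // CollectObservations() now, AgentAction() at the next step.
        RequestDecision();
    }
}
```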
To make decisions, an agent must observe its environment in order to infer the
state of the world. A state observation can take the following forms:
* **Vector Observation** — a feature vector consisting of an array of floating
point numbers.
* **Visual Observations** — one or more camera images.
When you use vector observations for an Agent, implement the
`Agent.CollectObservations()` method to create the feature vector. When you use
**Visual Observations**, you only need to identify which Unity Camera objects
will provide images and the base Agent class handles the rest. You do not need
to implement the `CollectObservations()` method when your Agent uses visual
observations (unless it also uses vector observations).
### Vector Observation Space: Feature Vectors
For agents using a continuous state space, you create a feature vector to
represent the agent's observation at each step of the simulation. The Brain
class calls the `CollectObservations()` method of each of its Agents. Your
implementation of this function must call `AddVectorObs` to add vector
observations.
The observation must include all the information an agent needs to accomplish
its task. Without sufficient and relevant information, an agent may learn poorly
or may not learn at all. A reasonable approach for determining what information
should be included is to consider what you would need to calculate an analytical
solution to the problem.
For examples of various state observation functions, you can look at the
[example environments](Learning-Environment-Examples.md) included in the
ML-Agents SDK. For instance, the 3DBall example uses the rotation of the
platform, the relative position of the ball, and the velocity of the ball as its
state observation. As an experiment, you can remove the velocity components from
the observation and retrain the 3DBall agent. While it will learn to balance the
ball reasonably well, the performance of the agent without using velocity is
noticeably worse.
```csharp
public GameObject ball;

public override void CollectObservations()
{
    AddVectorObs(gameObject.transform.rotation.z);
    AddVectorObs(gameObject.transform.rotation.x);
    AddVectorObs(ball.transform.position - gameObject.transform.position);
    AddVectorObs(ball.transform.GetComponent<Rigidbody>().velocity);
}
```
The feature vector must always contain the same number of elements and
observations must always be in the same position within the list. If the number
of observed entities in an environment can vary you can pad the feature vector
with zeros for any missing entities in a specific observation or you can limit
an agent's observations to a fixed subset. For example, instead of observing
every enemy agent in an environment, you could only observe the closest five.
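A sketch of the closest-five pattern, padding with zeros when fewer enemies are
present (the `enemies` list, the divisor used for normalization, and the
`System.Linq` usage are assumptions):

```csharp
const int maxObservedEnemies = 5;
var closest = enemies
    .OrderBy(e => Vector3.Distance(e.transform.position, transform.position))
    .Take(maxObservedEnemies)
    .ToList();

for (int i = 0; i < maxObservedEnemies; i++)
{
    if (i < closest.Count)
    {
        Vector3 relative = closest[i].transform.position - transform.position;
        AddVectorObs(relative.x / 20f);
        AddVectorObs(relative.z / 20f);
    }
    else
    {
        // Pad missing entities with zeros so the vector length stays fixed.
        AddVectorObs(0f);
        AddVectorObs(0f);
    }
}
```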
When you set up an Agent's Brain in the Unity Editor, set the following
properties to use a continuous vector observation:
* **Space Size** — The state size must match the length of your feature vector.
* **Brain Type** — Set to **External** during training; set to **Internal** to
use the trained model.
The observation feature vector is a list of floating point numbers, which means
you must convert any other data types to a float or a list of floats.
Integers can be added directly to the observation vector. You must explicitly
convert Boolean values to a number:
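For example (assuming a `hasKey` boolean field on the Agent):

```csharp
AddVectorObs(hasKey ? 1.0f : 0.0f);
```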
For entities like positions and rotations, you can add their components to the
feature list individually. For example:
```csharp
Vector3 speed = ball.transform.GetComponent<Rigidbody>().velocity;
AddVectorObs(speed.x);
AddVectorObs(speed.y);
AddVectorObs(speed.z);
```
Type enumerations should be encoded in the _one-hot_ style. That is, add an
element to the feature vector for each element of the enumeration, setting the
element representing the observed member to one and the rest to zero. For
example, if your enumeration contains \[Sword, Shield, Bow\] and the agent
observes that the current item is a Bow, you would add the elements 0, 0, 1 to
the feature vector. The following code example illustrates how to add a one-hot
observation:
```csharp
enum CarriedItems { Sword, Shield, Bow, LastItem }

public override void CollectObservations()
{
    for (int ci = 0; ci < (int)CarriedItems.LastItem; ci++)
    {
        AddVectorObs((int)currentItem == ci ? 1.0f : 0.0f);
    }
}
```

For the best results when training, you should normalize the components of your
feature vector to the range [-1, +1] or [0, 1]. When you normalize the values,
the PPO neural network can often converge to a solution faster. Note that it
isn't always necessary to normalize to these recommended ranges, but it is
considered a best practice when using neural networks. The greater the variation
in ranges between the components of your observation, the more likely that
training will be affected.
To normalize a value to [0, 1], you can use the following formula:
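A standard min-max form (the variable names here are illustrative):

```csharp
normalizedValue = (currentValue - minValue) / (maxValue - minValue);
```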

Rotations and angles should also be normalized. For angles between 0 and 360
degrees, you can use the following formulas:
```csharp
Quaternion rotation = transform.rotation;
Vector3 normalizedSigned = rotation.eulerAngles / 180.0f - Vector3.one;  // [-1, 1]
Vector3 normalizedUnit = rotation.eulerAngles / 360.0f;                  // [0, 1]
```
For angles that can be outside the range [0,360], you can either reduce the
angle, or, if the number of turns is significant, increase the maximum value
used in your normalization formula.
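For instance, a sketch that wraps an arbitrary angle back into [0, 360) before
normalizing it:

```csharp
// Wrap the angle into [0, 360) and then normalize to [-1, 1].
float reducedAngle = Mathf.Repeat(angle, 360f);
AddVectorObs(reducedAngle / 180f - 1f);
```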
Camera observations use rendered textures from one or more cameras in a scene.
The Brain vectorizes the textures into a 3D Tensor which can be fed into a
convolutional neural network (CNN). For more information on CNNs, see [this
guide](http://cs231n.github.io/convolutional-networks/). You can use camera
observations alongside vector observations.
Agents using camera images can capture state of arbitrary complexity and are
useful when the state is difficult to describe numerically. However, they are
also typically less efficient and slower to train, and sometimes don't succeed
at all.
To add a visual observation to an Agent, click on the `Add Camera` button in the
Agent inspector. Then drag the camera you want to add to the `Camera` field. You
can have more than one camera attached to an Agent.
![Agent Camera](images/visual-observation.png)
In addition, make sure that the Agent's Brain expects a visual observation. In
the Brain inspector, under **Brain Parameters** > **Visual Observations**,
specify the number of Cameras the Agent is using for its visual observations.
For each visual observation, set the width and height of the image (in pixels)
and whether or not the observation is color or grayscale (when `Black And White`
is checked).
An action is an instruction from the Brain that the Agent carries out. The
action is passed to the Agent as a parameter when the Academy invokes the
Agent's `AgentAction()` function. When you specify that the vector action space
is **Continuous**, the action parameter passed to the Agent is an array of
control signals with length equal to the `Vector Action Space Size` property.
When you specify a **Discrete** vector action space type, the action parameter
is an array of indices, where each index points into a list or table of
commands. The number of indices in the array is determined by the number of
branches defined in the `Branches Size` property. Each branch corresponds to an
action table; you can specify the size of each table by modifying the
`Branches` property. Set the `Vector Action Space Size` and
`Vector Action Space Type` properties on the Brain object assigned to the Agent
(using the Unity Editor Inspector window).
Neither the Brain nor the training algorithm knows anything about what the
action values themselves mean. The training algorithm simply tries different
values for the action list and observes the effect on the accumulated rewards
over time and many training episodes. Thus, the only place actions are defined
for an Agent is in the `AgentAction()` function. You simply specify the type of
vector action space, and, for the continuous vector action space, the number of
values, and then apply the received values appropriately (and consistently) in
`AgentAction()`.
For example, if you designed an agent to move in two dimensions, you could use
either the continuous or the discrete vector action space. In the continuous
case, you would set the vector action size to two (one for each dimension), and
the Agent's Brain would create an action with two floating point values. In the
discrete case, you would use one branch with a size of four (one for each
direction), and the Brain would create an action array containing a single
element with a value ranging from zero to three. Alternatively, you could create
two branches of size two (one for horizontal movement and one for vertical
movement), and the Brain would create an action array containing two elements
with values ranging from zero to one.
Note that when you are programming actions for an agent, it is often helpful to
test your action logic using a **Player** Brain, which lets you map keyboard
commands to actions. See [Brains](Learning-Environment-Design-Brains.md).
The [3DBall](Learning-Environment-Examples.md#3dball-3d-balance-ball) and
[Area](Learning-Environment-Examples.md#push-block) example environments are set
up to use either the continuous or the discrete vector action spaces.
When an Agent uses a Brain set to the **Continuous** vector action space, the
action parameter passed to the Agent's `AgentAction()` function is an array with
length equal to the Brain object's `Vector Action Space Size` property value.
The individual values in the array have whatever meanings that you ascribe to
them. If you assign an element in the array as the speed of an Agent, for
example, the training process learns to control the speed of the Agent through
this parameter.
The [Reacher example](Learning-Environment-Examples.md#reacher) defines a
continuous action space with four control values.
![reacher](images/reacher.png)
These control values are applied as torques to the bodies making up the arm:
```csharp
public override void AgentAction(float[] vectorAction, string textAction)
{
    // Clamp each control value and apply it as a torque to an arm body
    // (rbA is illustrative; vectorAction[2] and [3] drive the other body).
    float torqueX = Mathf.Clamp(vectorAction[0], -1f, 1f) * 100f;
    float torqueZ = Mathf.Clamp(vectorAction[1], -1f, 1f) * 100f;
    rbA.AddTorque(new Vector3(torqueX, 0f, torqueZ));
}
```
By default the output from our provided PPO algorithm pre-clamps the values of
`vectorAction` into the [-1, 1] range. It is a best practice to manually clip
these as well, if you plan to use a 3rd party algorithm with your environment.
As shown above, you can scale the control values as needed after clamping them.
When an Agent uses a Brain set to the **Discrete** vector action space, the
action parameter passed to the Agent's `AgentAction()` function is an array
containing indices. With the discrete vector action space, `Branches` is an
array of integers, and each value corresponds to the number of possibilities
for each branch.
For example, if we wanted an Agent that can move in a plane and jump, we could
define two branches (one for motion and one for jumping) because we want our
Agent to be able to move __and__ jump concurrently. We define the first branch
to have 5 possible actions (don't move, go left, go right, go backward, go
forward) and the second one to have 2 possible actions (don't jump, jump). The
`AgentAction()` method would look something like:
```csharp
// Get the action index for movement
int movement = Mathf.FloorToInt(act[0]);
// Get the action index for jumping
int jump = Mathf.FloorToInt(act[1]);

// Look up the index in the movement action list
// (the exact index-to-direction mapping is up to you):
if (movement == 1) { directionX = -1; }
if (movement == 2) { directionX = 1; }
if (movement == 3) { directionZ = -1; }
if (movement == 4) { directionZ = 1; }

// Look up the index in the jump action list:
if (jump == 1 && IsGrounded()) { directionY = 1; }

// Apply the action results to move the Agent (force scales are illustrative)
gameObject.GetComponent<Rigidbody>().AddForce(
    new Vector3(directionX * 40f, directionY * 300f, directionZ * 40f));
```
Note that the above code example is a simplified extract from the AreaAgent
class, which provides alternate implementations for both the discrete and the
continuous action spaces.
#### Masking Discrete Actions
When using Discrete Actions, it is possible to specify that some actions are
impossible for the next decision. When the Agent is controlled by an External or
Internal Brain, the Agent will be unable to perform the specified action. Note
that when the Agent is controlled by a Player or Heuristic Brain, the Agent will
still be able to decide to perform the masked action. In order to mask an
action, call the method `SetActionMask` within the `CollectObservations()`
method:
```csharp
SetActionMask(branch, actionIndices)
```
Where:
* `branch` is the index (starting at 0) of the branch on which you want to mask
the action
* `actionIndices` is a list of `int` or a single `int` corresponding to the
index of the action that the Agent cannot perform.
For example, suppose you have an Agent with 2 branches, and on the first branch
(branch 0) there are 4 possible actions: _"do nothing"_, _"jump"_, _"shoot"_ and
_"change weapon"_. With the code below, the Agent will either _"do nothing"_ or
_"change weapon"_ for its next decision (since action indices 1 and 2 are
masked):
```csharp
SetActionMask(0, new int[2]{1,2})
```
Notes:
* You can call `SetActionMask` multiple times if you want to put masks on
multiple branches (see the sketch after these notes).
* You cannot mask all the actions of a branch.
* You cannot mask actions in continuous control.
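Combining the notes above, a sketch that masks actions on two different branches
(the branch and action indices are illustrative):

```csharp
// Mask action 1 on branch 0, and actions 2 and 3 on branch 1.
SetActionMask(0, 1);
SetActionMask(1, new int[2] { 2, 3 });
```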
In reinforcement learning, the reward is a signal that the agent has done
something right. The PPO reinforcement learning algorithm works by optimizing
the choices an agent makes such that the agent earns the highest cumulative
reward over time. The better your reward mechanism, the better your agent will
learn.
**Note:** Rewards are not used during inference by a Brain using an already
trained policy, nor are they used during imitation learning.
Perhaps the best advice is to start simple and only add complexity as needed. In
general, you should reward results rather than actions you think will lead to
the desired results. To help develop your rewards, you can use the Monitor class
to display the cumulative reward received by an Agent. You can even use a Player
Brain to control the Agent while watching how it accumulates rewards.
Allocate rewards to an Agent by calling the `AddReward()` method in the
`AgentAction()` function. The reward assigned in any step should be in the range
[-1,1]. Values outside this range can lead to unstable training. The `reward`
value is reset to zero at every step.
### Examples
You can examine the `AgentAction()` functions defined in the [example
environments](Learning-Environment-Examples.md) to see how those projects
allocate rewards.
The `GridAgent` class in the [GridWorld
example](Learning-Environment-Examples.md#gridworld) uses a very simple reward
system:
```csharp
Collider[] hitObjects = Physics.OverlapBox(trueAgent.transform.position,
                                           new Vector3(0.3f, 0.3f, 0.3f));
if (hitObjects.Where(col => col.gameObject.tag == "goal").ToArray().Length == 1)
{
    AddReward(1.0f);
    Done();
}
if (hitObjects.Where(col => col.gameObject.tag == "pit").ToArray().Length == 1)
{
    AddReward(-1.0f);
    Done();
}
```
The agent receives a positive reward when it reaches the goal and a negative
reward when it falls into the pit. Otherwise, it gets no rewards. This is an
example of a _sparse_ reward system. The agent must explore a lot to find the
infrequent reward.
In contrast, the `AreaAgent` in the [Area
example](Learning-Environment-Examples.md#push-block) gets a small negative
reward every step. In order to get the maximum reward, the agent must finish its
task of reaching the goal square as quickly as possible:
```csharp
// Small per-step penalty to encourage finishing quickly (value illustrative)
AddReward(-0.005f);

if (gameObject.transform.position.y < 0.0f ||
    Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
    Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)
{
    Done();
    AddReward(-1f);
}
```
The agent also gets a larger negative penalty if it falls off the playing
surface.
The `Ball3DAgent` in the
[3DBall](Learning-Environment-Examples.md#3dball-3d-balance-ball) takes a
similar approach, but allocates a small positive reward as long as the agent
balances the ball. The agent can maximize its rewards by keeping the ball on the
platform:
```csharp
if (IsDone() == false)
{
    // Small positive reward for every step the ball stays on the platform
    SetReward(0.1f);
}

// When ball falls mark Agent as done and give a negative penalty
if ((ball.transform.position.y - gameObject.transform.position.y) < -2f ||
    Mathf.Abs(ball.transform.position.x - gameObject.transform.position.x) > 3f ||
    Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
{
    Done();
    SetReward(-1f);
}
```
The `Ball3DAgent` also assigns a negative penalty when the ball falls off the
platform.
* `Brain` - The Brain to register this Agent to. Can be dragged into the
inspector using the Editor.
* `Visual Observations` - A list of `Cameras` which will be used to generate
observations.
* `Max Step` - The per-agent maximum number of steps. Once this number is
reached, the Agent will be reset if `Reset On Done` is checked.
* `Reset On Done` - Whether the Agent's `AgentReset()` function should be called
when the Agent reaches its `Max Step` count or is marked as done in code.
* `On Demand Decision` - Whether the Agent requests decisions at a fixed step
interval or explicitly requests decisions by calling `RequestDecision()`.
* If not checked, the Agent will request a new decision every `Decision
Frequency` steps and perform an action every step. In the example above,
`CollectObservations()` will be called every 5 steps and `AgentAction()`
will be called at every step. This means that the Agent will reuse the
decision the Brain has given it.
  * If checked, the Agent controls when to receive decisions and take actions.
    To do so, the Agent may use one or both of the following methods (see the
    sketch after this list):
    * `RequestDecision()` Signals that the Agent is requesting a decision. This
      causes the Agent to collect its observations and ask the Brain for a
      decision at the next step of the simulation. Note that when an Agent
      requests a decision, it also requests an action. This is to ensure that
      all decisions lead to an action during training.
* `RequestAction()` Signals that the Agent is requesting an action. The
action provided to the Agent in this case is the same action that was
provided the last time it requested a decision.
* `Decision Frequency` - The number of steps between decision requests. Not used
  if `On Demand Decision` is true.
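
As referenced in the list above, the following is a minimal sketch of an Agent
that uses on-demand decisions. The class name, the `OnTurnStarted()` trigger,
and the observation it collects are hypothetical; only `CollectObservations()`,
`AgentAction()`, and `RequestDecision()` come from the Agent API described here
(assuming the `MLAgents` namespace used by the v0.5 UnitySDK):

```csharp
using UnityEngine;
using MLAgents;

// Hypothetical turn-based Agent: instead of deciding every few steps, it only
// asks its Brain for a decision when its turn starts.
public class TurnBasedAgent : Agent
{
    public override void CollectObservations()
    {
        // Collect whatever state the Brain needs (illustrative observation).
        AddVectorObs(transform.position);
    }

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Apply the chosen action and assign rewards here.
    }

    // Called by your own game logic when this Agent's turn begins.
    public void OnTurnStarted()
    {
        // Collect observations and ask the Brain for a new decision.
        // An action is requested automatically along with the decision.
        RequestDecision();
    }
}
```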
We created a helpful `Monitor` class that enables visualizing variables within
a Unity environment. While this was built for monitoring an Agent's value
function throughout the training process, we imagine it can be more broadly
useful. You can learn more [here](Feature-Monitor.md).
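
As a rough illustration, a small script could display an Agent's cumulative
reward next to it in the Game view. This is only a sketch: the `RewardDisplay`
class is hypothetical, and the exact `Monitor.Log` overloads are described in
[Feature-Monitor](Feature-Monitor.md); the float-value overload with a
`Transform` target used below is an assumption based on that page.

```csharp
using UnityEngine;
using MLAgents;

// Hypothetical helper: draws the Agent's cumulative reward above it in the
// Game view using the Monitor class.
public class RewardDisplay : MonoBehaviour
{
    public Agent agent;

    void Update()
    {
        // Assumed overload: Monitor.Log(key, float value, Transform target).
        Monitor.Log("Reward", agent.GetCumulativeReward(), transform);
    }
}
```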
To add an Agent to an environment at runtime, use the Unity
`GameObject.Instantiate()` function. It is typically easiest to instantiate an
agent from a [Prefab](https://docs.unity3d.com/Manual/Prefabs.html) (otherwise,
you have to instantiate every GameObject and Component that make up your Agent
individually). In addition, you must assign a Brain instance to the new Agent
and initialize it by calling its `AgentReset()` method. For example, the
following function creates a new Agent given a Prefab, Brain instance, location,
and orientation:
```csharp
private void CreateAgent(GameObject agentPrefab, Brain brain, Vector3 position, Quaternion orientation)
{
    GameObject agentObj = Instantiate(agentPrefab, position, orientation);
    Agent agent = agentObj.GetComponent<Agent>();
    agent.GiveBrain(brain);
    agent.AgentReset();
}
```
Before destroying an Agent GameObject, you must mark it as done (and wait for
the next step in the simulation) so that the Brain knows that this Agent is no
longer active. Thus, the best place to destroy an Agent is in the
`Agent.AgentOnDone()` function:
```csharp
public override void AgentOnDone()
{
    Destroy(gameObject);
}
```
Note that in order for `AgentOnDone()` to be called, the Agent's `ResetOnDone`
property must be false. You can set `ResetOnDone` on the Agent's Inspector or in
code.

106
docs/Learning-Environment-Design-Brains.md


# Brains
The Brain encapsulates the decision making process. Brain objects must be
children of the Academy in the Unity scene hierarchy. Every Agent must be
assigned a Brain, but you can use the same Brain with more than one Agent. You
can also create several Brains and attach each of them to one or more Agents.
Use the Brain class directly, rather than a subclass. Brain behavior is
determined by the **Brain Type**. The ML-Agents toolkit defines four Brain
Types:
* [External](Learning-Environment-Design-External-Internal-Brains.md) — The
**External** and **Internal** types typically work together; set **External**
when training your Agents. You can also use the **External** Brain to
communicate with a Python script via the Python `UnityEnvironment` class
included in the Python portion of the ML-Agents SDK.
* [Internal](Learning-Environment-Design-External-Internal-Brains.md) – Set
**Internal** to make use of a trained model.
* [Heuristic](Learning-Environment-Design-Heuristic-Brains.md) – Set
**Heuristic** to hand-code the Agent's logic by extending the Decision class.
* [Player](Learning-Environment-Design-Player-Brains.md) – Set **Player** to map
keyboard keys to Agent actions, which can be useful to test your Agent code.
During training, set your Agent's Brain type to **External**. To use the trained
model, import the model file into the Unity project and change the Brain type to
**Internal**.
The Brain class has several important properties that you can set using the
Inspector window. These properties must be appropriate for the Agents using the
Brain. For example, the `Vector Observation Space Size` property must match the
length of the feature vector created by an Agent exactly. See
[Agents](Learning-Environment-Design-Agents.md) for information about creating
agents and setting up a Brain instance correctly.
The Brain Inspector window in the Unity Editor displays the properties assigned
to a Brain component:
* `Brain Parameters` - Define vector observations, visual observations, and
  vector actions for the Brain.
* `Vector Observation`
* `Space Size` - Length of vector observation for Brain.
* `Stacked Vectors` - The number of previous vector observations that will
be stacked and used collectively for decision making. This results in the
effective size of the vector observation being passed to the Brain being:
_Space Size_ x _Stacked Vectors_.
* `Visual Observations` - Describes height, width, and whether to grayscale
visual observations for the Brain.
* `Vector Action`
* `Space Type` - Corresponds to whether action vector contains a single
integer (Discrete) or a series of real-valued floats (Continuous).
* `Space Size` (Continuous) - Length of action vector for Brain.
* `Branches` (Discrete) - An array of integers, defines multiple concurrent
discrete actions. The values in the `Branches` array correspond to the
number of possible discrete values for each action branch.
* `Action Descriptions` - A list of strings used to name the available
actions for the Brain.
* `External` - Actions are decided by an external process, such as the PPO
training process.
* `Internal` - Actions are decided using an internal TensorFlowSharp model.
* `Player` - Actions are decided using keyboard input mappings.
* `Heuristic` - Actions are decided using a custom `Decision` script, which
must be attached to the Brain game object.
The Player, Heuristic and Internal Brains have been updated to support
broadcast. The broadcast feature allows you to collect data from your Agents
using a Python program without controlling them.
### How to use: Unity

### How to use: Python
When you launch your Unity Environment from a Python program, you can see what
the Agents connected to non-External Brains are doing. When calling `step` or
`reset` on your environment, you retrieve a dictionary mapping Brain names to
`BrainInfo` objects. The dictionary contains a `BrainInfo` object for each
non-External Brain set to broadcast as well as for any External Brains.
Just like with an External Brain, the `BrainInfo` object contains the fields for
`visual_observations`, `vector_observations`, `text_observations`,
`memories`, `rewards`, `local_done`, `max_reached`, `agents` and
`previous_actions`. Note that `previous_actions` corresponds to the actions that
were taken by the Agents at the previous step, not the current one.
Note that when you do a `step` on the environment, you cannot provide actions
for non-External Brains. If there are no External Brains in the scene, simply
call `step()` with no arguments.
You can use the broadcast feature to collect data generated by Player,
Heuristics or Internal Brains game sessions. You can then use this data to train
an agent in a supervised context.

118
docs/Learning-Environment-Design-External-Internal-Brains.md


# External and Internal Brains
The **External** and **Internal** types of Brains work in different phases of
training. When training your Agents, set their Brain types to **External**; when
using the trained models, set their Brain types to **Internal**.
When [running an ML-Agents training algorithm](Training-ML-Agents.md), at least
one Brain object in a scene must be set to **External**. This allows the
training process to collect the observations of Agents using that Brain and give
the Agents their actions.
In addition to using an External Brain for training using the ML-Agents learning
algorithms, you can use an External Brain to control Agents in a Unity
environment using an external Python program. See [Python API](Python-API.md)
for more information.
Unlike the other types, the External Brain has no properties to set in the Unity
Inspector window.
The Internal Brain type uses a
[TensorFlow model](https://www.tensorflow.org/get_started/get_started_for_beginners#models_and_training)
to make decisions. The Proximal Policy Optimization (PPO) and Behavioral Cloning
algorithms included with the ML-Agents SDK produce trained TensorFlow models
that you can use with the Internal Brain type.
A __model__ is a mathematical relationship mapping an agent's observations to
its actions. TensorFlow is a software library for performing numerical
computation through data flow graphs. A TensorFlow model, then, defines the
mathematical relationship between your Agent's observations and its actions
using a TensorFlow data flow graph.
The training algorithms included in the ML-Agents SDK produce TensorFlow graph
models as the end result of the training process. See
[Training ML-Agents](Training-ML-Agents.md) for instructions on how to train a
model.
1. Select the Brain GameObject in the **Hierarchy** window of the Unity Editor.
(The Brain GameObject must be a child of the Academy GameObject and must have
a Brain component.)
**Note:** In order to see the **Internal** Brain Type option, you must
[enable TensorFlowSharp](Using-TensorFlow-Sharp-in-Unity.md).
3. Import the `environment_run-id.bytes` file produced by the PPO training
program. (Where `environment_run-id` is the name of the model file, which is
constructed from the name of your Unity environment executable and the run-id
value you assigned when running the training process.)
You can
[import assets into Unity](https://docs.unity3d.com/Manual/ImportingAssets.html)
in various ways. The easiest way is to simply drag the file into the
**Project** window and drop it into an appropriate folder.
4. Once the `environment.bytes` file is imported, drag it from the **Project**
window to the **Graph Model** field of the Brain component.
If you are using a model produced by the ML-Agents `mlagents-learn` command, use
the default values for the other Internal Brain parameters.
The default values of the TensorFlow graph parameters work with the model
produced by the PPO and BC training code in the ML-Agents SDK. To use a default
ML-Agents model, the only parameter that you need to set is the `Graph Model`,
which must be set to the .bytes file containing the trained model itself.
* `Graph Model` : This must be the `bytes` file corresponding to the pre-trained
TensorFlow graph. (You must first drag this file into your Resources folder
and then from the Resources folder into the inspector)
Only change the following Internal Brain properties if you have created your own
TensorFlow model and are not using an ML-Agents model:
* `Graph Scope` : If you set a scope while training your TensorFlow model, all
  your placeholder names will have a prefix. You must specify that prefix here.
  Note that if more than one Brain was set to External during training, you
must give a `Graph Scope` to the Internal Brain corresponding to the name of
the Brain GameObject.
* `Batch Size Node Name` : If the batch size is one of the inputs of your
  graph, you must specify the name of the placeholder here. The Brain will make
the batch size equal to the number of Agents connected to the Brain
automatically.
* `State Node Name` : If your graph uses the state as an input, you must specify
the name of the placeholder here.
* `Recurrent Input Node Name` : If your graph uses a recurrent input / memory as
  input and outputs new recurrent input / memory, you must specify the name of
  the input placeholder here.
* `Recurrent Output Node Name` : If your graph uses a recurrent input / memory
  as input and outputs new recurrent input / memory, you must specify the name
  of the output placeholder here.
* `Observation Placeholder Name` : If your graph uses observations as input, you
must specify it here. Note that the number of observations is equal to the
length of `Camera Resolutions` in the Brain parameters.
* `Action Node Name` : Specify the name of the placeholder corresponding to the
  actions of the Brain in your graph. If the action space type is continuous,
  the output must be a one-dimensional tensor of floats of length `Action Space
  Size`; if the action space type is discrete, the output must be a
  one-dimensional tensor of ints of the same length as the `Branches` array.
* `Graph Placeholder` : If your graph takes additional inputs that are fixed
(example: noise level) you can specify them here. Note that in your graph,
these must correspond to one dimensional tensors of int or float of size 1.
* `Name` : Corresponds to the name of the placeholder.
* `Value Type` : Either Integer or Floating Point.
* `Min Value` and `Max Value` : Specify the range of the value here. The value
will be sampled from the uniform distribution ranging from `Min Value` to
`Max Value` inclusive.

34
docs/Learning-Environment-Design-Heuristic-Brains.md


# Heuristic Brain
The **Heuristic** Brain type allows you to hand code an Agent's decision making
process. A Heuristic Brain requires an implementation of the Decision interface
to which it delegates the decision making process.
When you set the **Brain Type** property of a Brain to **Heuristic**, you must
add a component implementing the Decision interface to the same GameObject as
the Brain.
When creating your Decision class, extend MonoBehaviour (so you can use the
class as a Unity component) and extend the Decision interface.
```csharp
public class HeuristicLogic : MonoBehaviour, Decision
```
The Decision interface defines two methods, `Decide()` and `MakeMemory()`.
The `Decide()` method receives an Agent's current state, consisting of the
Agent's observations, reward, memory and other aspects of the Agent's state, and
must return an array containing the action that the Agent should take. The
format of the returned action array depends on the **Vector Action Space Type**.
When using a **Continuous** action space, the action array is just a float array
with a length equal to the **Vector Action Space Size** setting. When using a
**Discrete** action space, the action array is an integer array with the same
size as the `Branches` array. In the discrete action space, the values of the
**Branches** array define the number of discrete values that your `Decide()`
function can return for each branch, which don't need to be consecutive
integers.
The `MakeMemory()` function allows you to pass data forward to the next
iteration of an Agent's decision making process. The array you return from
`MakeMemory()` is passed to the `Decide()` function in the next iteration. You
can use the memory to allow the Agent's decision process to take past actions
and observations into account when making the current decision. If your
heuristic logic does not require memory, just return an empty array.
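
For illustration, a hand-coded Decision for a platform-balancing task might
look like the sketch below. The observation indices and the control rule are
made up for this example; the method signatures follow the
`Decide()`/`MakeMemory()` pattern described above (matching the toolkit's
example Decision scripts), but may differ slightly between releases:

```csharp
using System.Collections.Generic;
using UnityEngine;
using MLAgents;

// Illustrative hand-coded logic: tilt the platform back toward level.
// Assumes a continuous action space of size 2 and that vectorObs[2] and
// vectorObs[3] hold the platform's rotation (hypothetical layout).
public class HeuristicLogic : MonoBehaviour, Decision
{
    public float[] Decide(List<float> vectorObs, List<Texture2D> visualObs,
                          float reward, bool done, List<float> memory)
    {
        // Counteract the observed rotation on both action axes.
        return new float[] { -vectorObs[2], -vectorObs[3] };
    }

    public List<float> MakeMemory(List<float> vectorObs, List<Texture2D> visualObs,
                                  float reward, bool done, List<float> memory)
    {
        // This heuristic does not need memory, so return an empty list.
        return new List<float>();
    }
}
```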

47
docs/Learning-Environment-Design-Player-Brains.md


# Player Brain
The **Player** Brain type allows you to control an Agent using keyboard
commands. You can use Player Brains to control a "teacher" Agent that trains
other Agents during [imitation learning](Training-Imitation-Learning.md). You
can also use Player Brains to test your Agents and environment before changing
their Brain types to **External** and running the training process.
The **Player** Brain properties allow you to assign one or more keyboard keys to
each action and a unique value to send when a key is pressed.
Note the differences between the discrete and continuous action spaces. When a
Brain uses the discrete action space, you can send one integer value as the
action per step. In contrast, when a Brain uses the continuous action space you
can send any number of floating point values (up to the **Vector Action Space
Size** setting).
| **Property** | | **Description** |
| :---------------------------- | :--------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Continuous Player Actions** | | The mapping for the continuous vector action space. Shown when the action space is **Continuous**. |
| | **Size** | The number of key commands defined. You can assign more than one command to the same action index in order to send different values for that action. (If you press both keys at the same time, deterministic results are not guaranteed.) |
| | **Element 0–N** | The mapping of keys to action values. |
| | **Key** | The key on the keyboard. |
| | **Index** | The element of the Agent's action vector to set when this key is pressed. The index value cannot exceed the size of the Action Space (minus 1, since it is an array index). |
| | **Value** | The value to send to the Agent as its action for the specified index when the mapped key is pressed. All other members of the action vector are set to 0. |
| **Discrete Player Actions** | | The mapping for the discrete vector action space. Shown when the action space is **Discrete**. |
| | **Size** | The number of key commands defined. |
| | **Element 0–N** | The mapping of keys to action values. |
| | **Key** | The key on the keyboard. |
| | **Branch Index** | The element of the Agent's action vector to set when this key is pressed. The index value cannot exceed the size of the Action Space (minus 1, since it is an array index). |
| | **Value** | The value to send to the Agent as its action when the mapped key is pressed. Cannot exceed the max value for the associated branch (minus 1, since it is an array index). |
For more information about the Unity input system, see
[Input](https://docs.unity3d.com/ScriptReference/Input.html).

203
docs/Learning-Environment-Design.md


# Reinforcement Learning in Unity
Reinforcement learning is an artificial intelligence technique that trains
_agents_ to perform tasks by rewarding desirable behavior. During reinforcement
learning, an agent explores its environment, observes the state of things, and,
based on those observations, takes an action. If the action leads to a better
state, the agent receives a positive reward. If it leads to a less desirable
state, then the agent receives no reward or a negative reward (punishment). As
the agent learns during training, it optimizes its decision making so that it
receives the maximum reward over time.
The ML-Agents toolkit uses a reinforcement learning technique called
[Proximal Policy Optimization (PPO)](https://blog.openai.com/openai-baselines-ppo/).
PPO uses a neural network to approximate the ideal function that maps an agent's
observations to the best action an agent can take in a given state. The
ML-Agents PPO algorithm is implemented in TensorFlow and runs in a separate
Python process (communicating with the running Unity application over a socket).
**Note:** if you aren't studying machine and reinforcement learning as a subject
and just want to train agents to accomplish tasks, you can treat PPO training as
a _black box_. There are a few training-related parameters to adjust inside
Unity as well as on the Python training side, but you do not need in-depth
knowledge of the algorithm itself to successfully create and train agents.
Step-by-step procedures for running the training process are provided in the
[Training section](Training-ML-Agents.md).
Training and simulation proceed in steps orchestrated by the ML-Agents Academy
class. The Academy works with Agent and Brain objects in the scene to step
through the simulation. When either the Academy has reached its maximum number
of steps or all Agents in the scene are _done_, one training episode is
finished.
During training, the external Python training process communicates with the
Academy to run a series of episodes while it collects data and optimizes its
neural network model. The type of Brain assigned to an Agent determines whether
it participates in training or not. The **External** Brain communicates with the
external process to train the TensorFlow model. When training is completed
successfully, you can add the trained model file to your Unity project for use
with an **Internal** Brain.
2. Calls the `AgentReset()` function for each Agent in the scene.
3. Calls the `CollectObservations()` function for each Agent in the scene.
4. Uses each Agent's Brain class to decide on the Agent's next action.
5. Calls your subclass's `AcademyStep()` function.
6. Calls the `AgentAction()` function for each Agent in the scene, passing in
the action chosen by the Agent's Brain. (This function is not called if the
Agent is done.)
7. Calls the Agent's `AgentOnDone()` function if the Agent has reached its `Max
Step` count or has otherwise marked itself as `done`. Optionally, you can set
an Agent to restart if it finishes before the end of an episode. In this
case, the Academy calls the `AgentReset()` function.
8. When the Academy reaches its own `Max Step` count, it starts the next episode
again by calling your Academy subclass's `AcademyReset()` function.
To create a training environment, extend the Academy and Agent classes to
implement the above methods. The `Agent.CollectObservations()` and
`Agent.AgentAction()` functions are required; the other methods are optional —
whether you need to implement them or not depends on your specific scenario.
**Note:** The API used by the Python PPO training process to communicate with
and control the Academy during training can be used for other purposes as well.
For example, you could use the API to use Unity as the simulation engine for
your own machine learning algorithms. See [Python API](Python-API.md) for more
information.
To train and use the ML-Agents toolkit in a Unity scene, the scene must contain
a single Academy subclass along with as many Brain objects and Agent subclasses
as you need. Any Brain instances in the scene must be attached to GameObjects
that are children of the Academy in the Unity Scene Hierarchy. Agent instances
should be attached to the GameObject representing that Agent.
You must assign a Brain to every Agent, but you can share Brains between
multiple Agents. Each Agent will make its own observations and act
independently, but will use the same decision-making logic and, for **Internal**
Brains, the same trained TensorFlow model.
The Academy object orchestrates Agents and their decision making processes. Only
place a single Academy object in a scene.
You must create a subclass of the Academy class (since the base class is
abstract). When you create your Academy subclass, you can implement the
following methods (all are optional):
* `AcademyReset()` — Prepare the environment and Agents for the next training
episode. Use this function to place and initialize entities in the scene as
necessary.
* `AcademyStep()` — Prepare the environment for the next simulation step. The
base Academy class calls this function before calling any `AgentAction()`
methods for the current step. You can use this function to update other
objects in the scene before the Agents take their actions. Note that the
Agents have already collected their observations and chosen an action before
the Academy invokes this method.
The base Academy class also defines several important properties that you can
set in the Unity Editor Inspector. For training, the most important of these
properties is `Max Steps`, which determines how long each training episode
lasts. Once the Academy's step counter reaches this value, it calls the
`AcademyReset()` function to start the next episode.
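
For example, a minimal Academy subclass might look like the following sketch;
the `obstacles` field and the reset logic are hypothetical:

```csharp
using UnityEngine;
using MLAgents;

// Illustrative Academy subclass: re-randomize the scene at the start of each
// episode and leave the per-step hook empty.
public class ExampleAcademy : Academy
{
    public GameObject[] obstacles;   // assigned in the Inspector (hypothetical)

    public override void AcademyReset()
    {
        // Move each obstacle to a new random position for the next episode.
        foreach (var obstacle in obstacles)
        {
            obstacle.transform.position = new Vector3(
                Random.Range(-5f, 5f), 0f, Random.Range(-5f, 5f));
        }
    }

    public override void AcademyStep()
    {
        // Update non-Agent objects here, before the Agents act this step.
    }
}
```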
See [Academy](Learning-Environment-Design-Academy.md) for a complete list of
the Academy properties and their uses.
The Brain encapsulates the decision making process. Brain objects must be
children of the Academy in the Unity scene hierarchy. Every Agent must be
assigned a Brain, but you can use the same Brain with more than one Agent.
Use the Brain class directly, rather than a subclass. Brain behavior is
determined by the Brain type. During training, set your Agent's Brain type to
**External**. To use the trained model, import the model file into the Unity
project and change the Brain type to **Internal**. See
[Brains](Learning-Environment-Design-Brains.md) for details on using the
different types of Brains. You can extend the CoreBrain class to create
different Brain types if the four built-in types don't do what you need.
The Brain class has several important properties that you can set using the
Inspector window. These properties must be appropriate for the Agents using the
Brain. For example, the `Vector Observation Space Size` property must match the
length of the feature vector created by an Agent exactly. See
[Agents](Learning-Environment-Design-Agents.md) for information about creating
agents and setting up a Brain instance correctly.
See [Brains](Learning-Environment-Design-Brains.md) for a complete list of the
Brain properties.
The Agent class represents an actor in the scene that collects observations and
carries out actions. The Agent class is typically attached to the GameObject in
the scene that otherwise represents the actor — for example, to a player object
in a football game or a car object in a vehicle simulation. Every Agent must be
assigned a Brain.
To create an Agent, extend the Agent class and implement the essential
`CollectObservations()` and `AgentAction()` methods:
* `CollectObservations()` — Collects the Agent's observation of its environment.
* `AgentAction()` — Carries out the action chosen by the Agent's Brain and
assigns a reward to the current state.
Your implementations of these functions determine how the properties of the
Brain assigned to this Agent must be set.
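
To make this concrete, here is a minimal, illustrative Agent sketch. The
`target` field, observation layout, action size, and reward values are
assumptions for the example rather than a shipped environment:

```csharp
using UnityEngine;
using MLAgents;

// Illustrative Agent: observe its own and a target's position, move in the
// X-Z plane, and get rewarded for reaching the target.
public class ExampleAgent : Agent
{
    public Transform target;   // assigned in the Inspector (hypothetical)

    public override void CollectObservations()
    {
        // The Brain's Vector Observation Space Size must match the number of
        // floats added here (two Vector3s = 6 in this sketch).
        AddVectorObs(transform.position);
        AddVectorObs(target.position);
    }

    public override void AgentAction(float[] vectorAction, string textAction)
    {
        // Continuous action space of size 2: move in the X-Z plane.
        var move = new Vector3(vectorAction[0], 0f, vectorAction[1]);
        transform.position += move * Time.deltaTime;

        // Reward reaching the target and end the episode.
        if (Vector3.Distance(transform.position, target.position) < 1.0f)
        {
            SetReward(1.0f);
            Done();
        }
    }
}
```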
You must also determine how an Agent finishes its task or times out. You can
manually set an Agent to done in your `AgentAction()` function when the Agent
has finished (or irrevocably failed) its task. You can also set the Agent's `Max
Steps` property to a positive value and the Agent will consider itself done
after it has taken that many steps. When the Academy reaches its own `Max Steps`
count, it starts the next episode. If you set an Agent's `ResetOnDone` property
to true, then the Agent can attempt its task several times in one episode. (Use
the `Agent.AgentReset()` function to prepare the Agent to start again.)
See [Agents](Learning-Environment-Design-Agents.md) for detailed information
about programming your own Agents.
An _environment_ in the ML-Agents toolkit can be any scene built in Unity. The
Unity scene provides the environment in which agents observe, act, and learn.
How you set up the Unity scene to serve as a learning environment really depends
on your goal. You may be trying to solve a specific reinforcement learning
problem of limited scope, in which case you can use the same scene for both
training and for testing trained agents. Or, you may be training agents to
operate in a complex game or simulation. In this case, it might be more
efficient and practical to create a purpose-built training scene.
Both training and testing (or normal game) scenes must contain an Academy object
to control the agent decision making process. The Academy defines several
properties that can be set differently for a training scene versus a regular
scene. The Academy's **Configuration** properties control rendering and time
scale. You can set the **Training Configuration** to minimize the time Unity
spends rendering graphics in order to speed up training. You may need to adjust
the other functional, Academy settings as well. For example, `Max Steps` should
be as short as possible for training — just long enough for the agent to
accomplish its task, with some extra time for "wandering" while it learns. In
regular scenes, you often do not want the Academy to reset the scene at all; if
so, `Max Steps` should be set to zero.
When you create a training environment in Unity, you must set up the scene so
that it can be controlled by the external training process. Considerations
include:
* The training scene must start automatically when your Unity application is
launched by the training process.
* The scene must include at least one **External** Brain.
* The Academy must reset the scene to a valid starting point for each episode of
training.
* A training episode must have a definite end — either using `Max Steps` or by
each Agent setting itself to `done`.

381
docs/Learning-Environment-Examples.md


# Example Learning Environments
The Unity ML-Agents toolkit contains an expanding set of example environments
which demonstrate various features of the platform. Environments are located in
`UnitySDK/Assets/ML-Agents/Examples` and summarized below. Additionally, our
[first ML Challenge](https://connect.unity.com/challenges/ml-agents-1) contains
environments created by the community.
This page only overviews the example environments we provide. To learn more on
how to design and build your own environments see our [Making a New Learning
Environment](Learning-Environment-Create-New.md) page.
Note: Environment scenes marked as _optional_ do not have accompanying
pre-trained model files, and are designed to serve as challenges for
researchers.
If you would like to contribute environments, please see our
[contribution guidelines](../CONTRIBUTING.md) page.
* Set-up: A linear movement task where the agent must move left or right to
rewarding states.
* Agents: The environment contains one agent linked to a single Brain.
* Agent Reward Function:
* +0.1 for arriving at suboptimal state.
* +1.0 for arriving at optimal state.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: One variable corresponding to current state.
* Vector Action space: (Discrete) Two possible actions (Move left, move
right).
* Visual Observations: None.
* Reset Parameters: None
* Benchmark Mean Reward: 0.94

* Set-up: A balance-ball task, where the agent controls the platform.
* Goal: The agent must balance the platform in order to keep the ball on it for
as long as possible.
* Agents: The environment contains 12 agents of the same kind, all linked to a
single Brain.
* Agent Reward Function:
* +0.1 for every step the ball remains on the platform.
* -1.0 if the ball falls from the platform.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 8 variables corresponding to rotation of platform,
and position, rotation, and velocity of ball.
* Vector Observation space (Hard Version): 5 variables corresponding to
rotation of platform and position and rotation of ball.
* Vector Action space: (Continuous) Size of 2, with one value corresponding to
X-rotation, and the other to Z-rotation.
* Visual Observations: None.
* Reset Parameters: None
* Benchmark Mean Reward: 100

* Set-up: A version of the classic grid-world task. Scene contains agent, goal,
and obstacles.
* Goal: The agent must navigate the grid to the goal while avoiding the
obstacles.
* Agents: The environment contains one agent linked to a single Brain.
* Agent Reward Function:
* -0.01 for every step.
* +1.0 if the agent navigates to the goal position of the grid (episode ends).
* -1.0 if the agent navigates to an obstacle (episode ends).
* Brains: One Brain with the following observation/action space.
* Vector Observation space: None
* Vector Action space: (Discrete) Size of 4, corresponding to movement in
cardinal directions. Note that for this environment,
[action masking](Learning-Environment-Design-Agents.md#masking-discrete-actions)
is turned on by default (this option can be toggled
using the `Mask Actions` checkbox within the `trueAgent` GameObject).
The trained model file provided was generated with action masking turned on.
* Visual Observations: One corresponding to top-down view of GridWorld.
* Reset Parameters: Three, corresponding to grid size, number of obstacles, and
number of goals.
* Benchmark Mean Reward: 0.8
## [Tennis](https://youtu.be/RDaIh7JX6RI)

* Set-up: Two-player game where agents control rackets to bounce ball over a
net.
* Goal: The agents must bounce ball between one another while not dropping or
sending ball out of bounds.
* Agents: The environment contains two agents linked to a single Brain named
  TennisBrain. After training you can attach another Brain named MyBrain to one
  of the agents to play against your trained model.
* Agent Reward Function (independent):
  * +0.1 to the agent when hitting the ball over the net.
  * -0.1 to the agent who lets the ball hit their ground, or hits the ball out of bounds.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 8 variables corresponding to position and velocity
of ball and racket.
* Vector Action space: (Continuous) Size of 2, corresponding to movement
toward net or away from net, and jumping.
* Visual Observations: None.
* Reset Parameters: One, corresponding to size of ball.
* Benchmark Mean Reward: 2.5
* Optional Imitation Learning scene: `TennisIL`.

* Set-up: A platforming environment where the agent can push a block around.
* Goal: The agent must push the block to the goal.
* Agents: The environment contains one agent linked to a single brain.
* Agent Reward Function:
* -0.0025 for every step.
* +1.0 if the block touches the goal.
* Vector Observation space: (Continuous) 70 variables corresponding to 14
ray-casts each detecting one of three possible objects (wall, goal, or
block).
* Vector Action space: (Discrete) Size of 6, corresponding to turn clockwise
and counterclockwise and move along four different face directions.
* Visual Observations (Optional): One first-person camera. Use
`VisualPushBlock` scene.
* Reset Parameters: None.
* Benchmark Mean Reward: 4.5
* Optional Imitation Learning scene: `PushBlockIL`.

* Set-up: A platforming environment where the agent can jump over a wall.
* Goal: The agent must use the block to scale the wall and reach the goal.
* Agents: The environment contains one agent linked to two different Brains. The
Brain the agent is linked to changes depending on the height of the wall.
* Agent Reward Function:
* -0.0005 for every step.
* +1.0 if the agent touches the goal.
* -1.0 if the agent falls off the platform.
* Brains: Two Brains, each with the following observation/action space.
  * Vector Observation space: Size of 74, corresponding to 14 ray casts each
    detecting 4 possible objects, plus the global position of the agent and
    whether or not the agent is grounded.
* Vector Action space: (Discrete) 4 Branches:
* Forward Motion (3 possible actions: Forward, Backwards, No Action)
* Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
* Side Motion (3 possible actions: Left, Right, No Action)
* Jump (2 possible actions: Jump, No Action)
* Visual Observations: None.
* Reset Parameters: 4, corresponding to the height of the possible walls.
* Benchmark Mean Reward (Big & Small Wall Brain): 0.8

* Set-up: Double-jointed arm which can move to target locations.
* Goal: Each agent must move its hand to the goal location, and keep it there.
* Agents: The environment contains 10 agents linked to a single Brain.
* Agent Reward Function (independent):
* +0.1 Each step agent's hand is in goal location.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 26 variables corresponding to position, rotation,
velocity, and angular velocities of the two arm Rigidbodies.
* Vector Action space: (Continuous) Size of 4, corresponding to torque
applicable to two joints.
* Visual Observations: None.
* Reset Parameters: Two, corresponding to goal size, and goal movement speed.
* Benchmark Mean Reward: 30

* Set-up: A creature with 4 arms and 4 forearms.
* Goal: The agents must move their bodies toward the goal direction without falling.
* `CrawlerStaticTarget` - Goal direction is always forward.
  * `CrawlerDynamicTarget` - Goal direction is randomized.
* Agents: The environment contains 3 agents linked to a single Brain.
* Agent Reward Function (independent):
* +0.03 times body velocity in the goal direction.
* +0.01 times body direction alignment with goal direction.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 117 variables corresponding to position, rotation,
velocity, and angular velocities of each limb plus the acceleration and
angular acceleration of the body.
* Vector Action space: (Continuous) Size of 20, corresponding to target
rotations for joints.
* Visual Observations: None.
* Reset Parameters: None
* Benchmark Mean Reward: 2000

* Set-up: A multi-agent environment where agents compete to collect bananas.
* Goal: The agents must learn to move to as many yellow bananas as possible
while avoiding blue bananas.
* Agents: The environment contains 5 agents linked to a single Brain.
* Agent Reward Function (independent):
* +1 for interaction with yellow banana
* -1 for interaction with blue banana.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 53 corresponding to velocity of agent (2), whether
agent is frozen and/or shot its laser (2), plus ray-based perception of
objects around agent's forward direction (49; 7 raycast angles with 7
measurements for each).
* Vector Action space: (Discrete) 4 Branches:
* Forward Motion (3 possible actions: Forward, Backwards, No Action)
* Side Motion (3 possible actions: Left, Right, No Action)
* Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
* Laser (2 possible actions: Laser, No Action)
* Visual Observations (Optional): First-person camera per-agent. Use
`VisualBanana` scene.
* Optional Imitation Learning scene: `BananaIL`.
* Set-up: Environment where the agent needs to find information in a room,
remember it, and use it to move to the correct goal.
* Goal: Move to the goal which corresponds to the color of the block in the
room.
* Agents: The environment contains one agent linked to a single Brain.
* +1 For moving to correct goal.
* -0.1 For moving to incorrect goal.
* -0.0003 Existential penalty.
* Brains: One Brain with the following observation/action space:
* Vector Observation space: 30 corresponding to local ray-casts detecting
objects, goals, and walls.
* Vector Action space: (Discrete) 1 Branch, 4 actions corresponding to agent
rotation and forward/backward movement.
* Visual Observations (Optional): First-person view for the agent. Use
`VisualHallway` scene.
* Reset Parameters: None.
* Benchmark Mean Reward: 0.7
* Optional Imitation Learning scene: `HallwayIL`.

![Bouncer](images/bouncer.png)
* Set-up: Environment where the agent needs on-demand decision making. The agent
  must decide how to perform its next bounce only when it touches the ground.
* Agents: The environment contains one agent linked to a single Brain.
* +1 For catching the banana.
* -1 For bouncing out of bounds.
* -0.05 Times the action squared. Energy expenditure penalty.
* Brains: One Brain with the following observation/action space:
* Vector Observation space: 6 corresponding to local position of agent and
banana.
* Vector Action space: (Continuous) 3 corresponding to agent force applied for
the jump.
* Visual Observations: None.
* Reset Parameters: None.
* Benchmark Mean Reward: 2.5

* Set-up: Environment where four agents compete in a 2 vs 2 toy soccer game.
* Striker: Get the ball into the opponent's goal.
* Goalie: Prevent the ball from entering its own goal.
* Agents: The environment contains four agents, with two linked to one Brain
(strikers) and two linked to another (goalies).
* Striker:
* +1 When ball enters opponent's goal.
* -0.1 When ball enters own team's goal.
* -0.001 Existential penalty.
* Goalie:
* -1 When ball enters team's goal.
    * +0.1 When ball enters opponent's goal.
    * +0.001 Existential bonus.
* Brains: Two Brains with the following observation/action space:
* Vector Observation space: 112 corresponding to local 14 ray casts, each
detecting 7 possible object types, along with the object's distance.
Perception is in 180 degree view from front of agent.
* Vector Action space: (Discrete) One Branch
* Striker: 6 actions corresponding to forward, backward, sideways movement,
as well as rotation.
* Goalie: 4 actions corresponding to forward, backward, sideways movement.
* Visual Observations: None.
* Benchmark Mean Reward (Striker & Goalie Brain): 0 (the means will be inverse
  of each other and criss-cross during training)
* Set-up: Physics-based Humanoid agents with 26 degrees of freedom. These DOFs
  correspond to articulation of the following body-parts: hips, chest, spine,
  head, thighs, shins, feet, arms, forearms and hands.
* Goal: The agents must move their bodies toward the goal direction as quickly
  as possible without falling.
* Agents: The environment contains 11 independent agents linked to a single
  Brain.
* Agent Reward Function (independent):
* +0.03 times body velocity in the goal direction.
* +0.01 times head y position.
* +0.01 times body direction alignment with goal direction.
* -0.01 times head velocity difference from body velocity.
* Brains: One Brain with the following observation/action space.
* Vector Observation space: 215 variables corresponding to position, rotation,
velocity, and angular velocities of each limb, along with goal direction.
* Vector Action space: (Continuous) Size of 39, corresponding to target
rotations applicable to the joints.
* Visual Observations: None.
* Reset Parameters: None.
* Benchmark Mean Reward: 1000

* Set-up: Environment where the agent needs to press a button to spawn a
pyramid, then navigate to the pyramid, knock it over, and move to the gold
brick at the top.
* Agents: The environment contains one agent linked to a single Brain.
* +2 For moving to golden brick (minus 0.001 per step).
* Brains: One Brain with the following observation/action space:
* Vector Observation space: 148 corresponding to local ray-casts detecting
switch, bricks, golden brick, and walls, plus variable indicating switch
state.
* Vector Action space: (Discrete) 4 corresponding to agent rotation and
forward/backward movement.
* Visual Observations (Optional): First-person camera per-agent. Use
  `VisualPyramids` scene.
* Reset Parameters: None.
* Optional Imitation Learning scene: `PyramidsIL`.
* Benchmark Mean Reward: 1.75

219
docs/Learning-Environment-Executable.md


# Using an Environment Executable
This section will help you create and use built environments rather than the
Editor to interact with an environment. Using an executable has some advantages
over using the Editor:
* You can exchange executables with other people without having to share your
  entire repository.
* You can put your executable on a remote machine for faster training.
* You can use `Headless` mode for faster training.
* You can keep using the Unity Editor for other tasks while the agents are
training.
## Building the 3DBall environment

1. Launch Unity.
2. On the Projects dialog, choose the **Open** option at the top of the window.
3. Using the file dialog that opens, locate the `UnitySDK` folder within the
ML-Agents project and click **Open**.
4. In the **Project** window, navigate to the folder
`Assets/ML-Agents/Examples/3DBall/`.
5. Double-click the `3DBall` file to load the scene containing the Balance Ball
environment.
Make sure the Brains in the scene have the right type. For example, if you want
to be able to control your agents from Python, you will need to set the
corresponding Brain to **External**.
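If you are unsure whether the scene actually exposes an External Brain, a quick check from Python is sketched below (assuming the v0.5 `mlagents` package; with `file_name=None` the API connects to the Editor once you press Play instead of launching an executable):

```python
from mlagents.envs import UnityEnvironment

# Connect to the Editor: press Play in Unity when prompted.
env = UnityEnvironment(file_name=None)
# Brains set to External show up here; an empty list means nothing can be trained.
print(env.external_brain_names)
env.close()
```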
1. In the **Scene** window, click the triangle icon next to the Ball3DAcademy
object.
Next, we want the set-up scene to play correctly when the training process
launches the environment executable. This means that:
* The environment application runs in the background.
* No dialogs require interaction.
* The correct scene loads automatically.
* Ensure that **Run in Background** is Checked.
* Ensure that **Display Resolution Dialog** is set to Disabled.
* (optional) Select “Development Build” to [log debug
messages](https://docs.unity3d.com/Manual/LogFiles.html).
5. If any scenes are shown in the **Scenes in Build** list, make sure that the
   3DBall Scene is the only one checked. (If the list is empty, then only the
   current scene is included in the build).
* In the File dialog, navigate to your ML-Agents directory.
* Assign a file name and click **Save**.
* (For Windows) With Unity 2018.1, it will ask you to select a folder instead
of a file name. Create a subfolder within the ML-Agents folder and select
that folder to build. In the following steps you will refer to this
subfolder's name as `env_name`.
Now that we have a Unity executable containing the simulation environment, we
can interact with it.
If you want to use the [Python API](Python-API.md) to interact with your
executable, you can pass the name of the executable with the argument
'file_name' of the `UnityEnvironment`. For instance:
from mlagents.envs import UnityEnvironment
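Building on that import, the sketch below shows one way to construct the environment and drive it with random actions. It assumes a 3DBall build named `3DBall` next to your working directory and the v0.5 Python API (`reset`/`step` return a dictionary of `BrainInfo` keyed by Brain name); treat it as a sketch rather than a reference implementation:

```python
import numpy as np
from mlagents.envs import UnityEnvironment

env = UnityEnvironment(file_name="3DBall")     # assumed executable name/path
brain_name = env.external_brain_names[0]

info = env.reset(train_mode=True)[brain_name]  # BrainInfo for our single Brain
for _ in range(100):
    # 3DBall expects 2 continuous actions per agent (X- and Z-rotation).
    actions = np.random.randn(len(info.agents), 2)
    info = env.step(actions)[brain_name]       # one external Brain: pass the array directly
env.close()
```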
1. Open a command or terminal window.
2. Navigate to the folder where you installed the ML-Agents Toolkit. If you
followed the default [installation](Installation.md), then navigate to the
`ml-agents/` folder.
3. Run
`mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train`
Where:
* `<trainer-config-file>` is the file path of the trainer configuration yaml
* `<env_name>` is the name and path to the executable you exported from Unity
(without extension)
* `<run-identifier>` is a string used to separate the results of different
training runs
* And the `--train` tells `mlagents-learn` to run a training session (rather
than inference)
For example, if you are training with a 3DBall executable you exported to the
directory where you installed the ML-Agents Toolkit, run:
```sh
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun --train
```
And you should see something like:
```console
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run --train
▄▄▄▓▓▓▓
╓▓▓▓▓▓▓█▓▓▓▓▓
,▄▄▄m▀▀▀' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌
▄▓▓▓▀' ▄▓▓▀ ▓▓▓ ▄▄ ▄▄ ,▄▄ ▄▄▄▄ ,▄▄ ▄▓▓▌▄ ▄▄▄ ,▄▄
▄▓▓▓▀ ▄▓▓▀ ▐▓▓▌ ▓▓▌ ▐▓▓ ▐▓▓▓▀▀▀▓▓▌ ▓▓▓ ▀▓▓▌▀ ^▓▓▌ ╒▓▓▌
▄▓▓▓▓▓▄▄▄▄▄▄▄▄▓▓▓ ▓▀ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▄ ▓▓▌
▀▓▓▓▓▀▀▀▀▀▀▀▀▀▀▓▓▄ ▓▓ ▓▓▌ ▐▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▌ ▐▓▓▐▓▓
^█▓▓▓ ▀▓▓▄ ▐▓▓▌ ▓▓▓▓▄▓▓▓▓ ▐▓▓ ▓▓▓ ▓▓▓ ▓▓▓▄ ▓▓▓▓`
'▀▓▓▓▄ ^▓▓▓ ▓▓▓ └▀▀▀▀ ▀▀ ^▀▀ `▀▀ `▀▀ '▀▀ ▐▓▓▌
▀▀▀▀▓▄▄▄ ▓▓▓▓▓▓, ▓▓▓▓▀
`▀█▓▓▓▓▓▓▓▓▓▌
¬`▀▀▀█▓
INFO:mlagents.learn:{'--curriculum': 'None',
'--docker-target-name': 'Empty',
'--env': '3DBall',
'--help': False,
'--keep-checkpoints': '5',
'--lesson': '0',
'--load': False,
'--no-graphics': False,
'--num-runs': '1',
'--run-id': 'firstRun',
'--save-freq': '50000',
'--seed': '-1',
'--slow': False,
'--train': True,
'--worker-id': '0',
'<trainer-config-path>': 'config/trainer_config.yaml'}
```
![Training command example](images/training-command-example.png)
**Note**: If you're using Anaconda, don't forget to activate the ml-agents
environment first.
If `mlagents-learn` runs correctly and starts training, you should see something
like this:
```console
CrashReporter: initialized
Mono path[0] = '/Users/dericp/workspace/ml-agents/3DBall.app/Contents/Resources/Data/Managed'
Mono config path = '/Users/dericp/workspace/ml-agents/3DBall.app/Contents/MonoBleedingEdge/etc'
INFO:mlagents.envs:
'Ball3DAcademy' started successfully!
Unity Academy name: Ball3DAcademy
Number of Brains: 1
Number of External Brains : 1
Reset Parameters :
Unity brain name: Ball3DBrain
Number of Visual Observations (per agent): 0
Vector Observation space size (per agent): 8
Number of stacked Vector Observation: 1
Vector Action space type: continuous
Vector Action space size (per agent): [2]
Vector Action descriptions: ,
INFO:mlagents.envs:Hyperparameters for the PPO Trainer of brain Ball3DBrain:
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
gamma: 0.995
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
max_steps: 5.0e4
normalize: True
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 1000
use_recurrent: False
graph_scope:
summary_path: ./summaries/first-run-0
memory_size: 256
use_curiosity: False
curiosity_strength: 0.01
curiosity_enc_size: 128
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 4000. Mean Reward: 2.151. Std of Reward: 1.432. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 5000. Mean Reward: 3.175. Std of Reward: 2.250. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 6000. Mean Reward: 4.898. Std of Reward: 4.019. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 7000. Mean Reward: 6.716. Std of Reward: 5.125. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 8000. Mean Reward: 12.124. Std of Reward: 11.929. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 9000. Mean Reward: 18.151. Std of Reward: 16.871. Training.
INFO:mlagents.trainers: first-run-0: Ball3DBrain: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
```
You can press Ctrl+C to stop the training, and your trained model will be at
`models/<run-identifier>/<env_name>_<run-identifier>.bytes`, which corresponds
to your model's latest checkpoint. You can now embed this trained model into
your Internal Brain by following the steps below:
1. Move your model file into
`UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/`.
5. Drag the `<env_name>_<run-identifier>.bytes` file from the Project window of
the Editor to the **Graph Model** placeholder in the **Ball3DBrain**
inspector window.
6. Press the Play button at the top of the editor.

27
docs/Limitations.md


# Limitations
If you enable Headless mode, you will not be able to collect visual observations
from your agents.
Currently the speed of the game physics can only be increased to 100x real-time.
The Academy also moves in time with FixedUpdate() rather than Update(), so game
behavior implemented in Update() may be out of sync with the agent decision
making. See
[Execution Order of Event Functions](https://docs.unity3d.com/Manual/ExecutionOrder.html)
for more information.
As of version 0.3, we no longer support Python 2.
### TensorFlow support
Currently the ML-Agents toolkit uses TensorFlow 1.7.1 due to the version of the
TensorFlowSharp plugin we are using.
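To confirm which TensorFlow version is installed in the Python environment you train from, a quick check:

```python
import tensorflow as tf

# The toolkit currently expects 1.7.1 (to match the TensorFlowSharp plugin).
print(tf.__version__)
```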

703
docs/ML-Agents-Overview.md


# ML-Agents Toolkit Overview
**The Unity Machine Learning Agents Toolkit** (ML-Agents Toolkit) is an
open-source Unity plugin that enables games and simulations to serve as
environments for training intelligent agents. Agents can be trained using
reinforcement learning, imitation learning, neuroevolution, or other machine
learning methods through a simple-to-use Python API. We also provide
implementations (based on TensorFlow) of state-of-the-art algorithms to enable
game developers and hobbyists to easily train intelligent agents for 2D, 3D and
VR/AR games. These trained agents can be used for multiple purposes, including
design decisions pre-release. The ML-Agents toolkit is mutually beneficial for
both game developers and AI researchers as it provides a central platform where
advances in AI can be evaluated on Unity’s rich environments and then made
accessible to the wider research and game developer communities.
Depending on your background (i.e. researcher, game developer, hobbyist), you
may have very different questions on your mind at the moment. To make your
transition to the ML-Agents toolkit easier, we provide several background pages
that include overviews and helpful resources on the [Unity
Engine](Background-Unity.md), [machine learning](Background-Machine-Learning.md)
and [TensorFlow](Background-TensorFlow.md). We **strongly** recommend browsing
the relevant background pages if you're not familiar with a Unity scene, basic
machine learning concepts or have not previously heard of TensorFlow.
components, different training modes and scenarios. By the end of it, you should
have a good sense of _what_ the ML-Agents toolkit allows you to do. The
subsequent documentation pages provide examples of _how_ to use ML-Agents.
hypothetical, running example throughout. We will explore the problem of
training the behavior of a non-playable character (NPC) in a game. (An NPC is a
game character that is never controlled by a human player and its behavior is
pre-defined by the game developer.) More specifically, let's assume we're
building a multi-player, war-themed game in which players control the soldiers.
In this game, we have a single NPC who serves as a medic, finding and reviving
wounded players. Lastly, let us assume that there are two teams, each with five
players and one NPC medic.
location. Second, it needs to be aware of which of its team members are injured
and require assistance. In the case of multiple injuries, it needs to assess the
degree of injury and decide who to help first. Lastly, a good medic will always
place itself in a position where it can quickly help its team members. Factoring
in all of these traits means that at every instance, the medic needs to measure
several attributes of the environment (e.g. position of team members, position
of enemies, which of its team members are injured and to what degree) and then
decide on an action (e.g. hide from enemy fire, move to help one of its
members). Given the large number of settings of the environment and the large
number of actions that the medic can take, defining and implementing such
complex behaviors by hand is challenging and prone to errors.
With ML-Agents, it is possible to _train_ the behaviors of such NPCs (called
**agents**) using a variety of methods. The basic idea is quite simple. We need
to define three entities at every moment of the game (called **environment**):
Observations can be numeric and/or visual. Numeric observations measure
attributes of the environment from the point of view of the agent. For our
medic this would be attributes of the battlefield that are visible to it. For
most interesting environments, an agent will require several continuous
numeric observations. Visual observations, on the other hand, are images
generated from the cameras attached to the agent and represent what the agent
is seeing at that point in time. It is common to confuse an agent's
observation with the environment (or game) **state**. The environment state
represents information about the entire scene containing all the game
characters. The agent's observation, however, only contains information that
the agent is aware of and is typically a subset of the environment state. For
example, the medic observation cannot include information about an enemy in
hiding that the medic is unaware of.
- **Actions** - what actions the medic can take. Similar to observations,
actions can either be continuous or discrete depending on the complexity of
the environment and agent. In the case of the medic, if the environment is a
simple grid world where only their location matters, then a discrete action
taking on one of four values (north, south, east, west) suffices. However, if
the environment is more complex and the medic can move freely then using two
continuous actions (one for direction and another for speed) is more
appropriate.
Note that the reward signal need not be provided at every moment, but only
when the medic performs an action that is good or bad. For example, it can
receive a large negative reward if it dies, a modest positive reward whenever
it revives a wounded team member, and a modest negative reward when a wounded
team member dies due to lack of assistance. Note that the reward signal is how
the objectives of the task are communicated to the agent, so they need to be
set up in a manner where maximizing reward generates the desired optimal
behavior.
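As a purely illustrative sketch (the event names and reward values here are invented for the hypothetical medic and are not part of the toolkit), such a sparse reward signal might look like this in code:

```python
def medic_reward(event: str) -> float:
    """Hypothetical sparse reward signal for the medic example above."""
    rewards = {
        "medic_died": -1.0,        # large negative reward
        "revived_teammate": +0.5,  # modest positive reward
        "teammate_died": -0.2,     # modest negative reward for an unattended death
    }
    # Most steps produce no reward at all; only notable events are rewarded.
    return rewards.get(event, 0.0)
```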
After defining these three entities (the building blocks of a **reinforcement
learning task**), we can now _train_ the medic's behavior. This is achieved by
simulating the environment for many trials where the medic, over time, learns
what is the optimal action to take for every observation it measures by
maximizing its future reward. The key is that by learning the actions that
maximize its reward, the medic is learning the behaviors that make it a good
medic (i.e. one who saves the most number of lives). In **reinforcement
learning** terminology, the behavior that is learned is called a **policy**,
which is essentially an (optimal) mapping from observations to actions. Note that
**training phase**, while playing the game with an NPC that is using its learned
policy is called the **inference phase**.
The ML-Agents toolkit provides all the necessary tools for using Unity as the
simulation engine for learning the policies of different objects in a Unity
environment. In the next few sections, we discuss how the ML-Agents toolkit
achieves this and what features it provides.
The ML-Agents toolkit is a Unity plugin that contains three high-level
components:
- **Learning Environment** - which contains the Unity scene and all the game
characters.
- **Python API** - which contains all the machine learning algorithms that are
used for training (learning a behavior or policy). Note that, unlike the
Learning Environment, the Python API is not part of Unity, but lives outside
and communicates with Unity through the External Communicator.
- **External Communicator** - which connects the Learning Environment with the
Python API. It lives within the Learning Environment.
<img src="images/learning_environment_basic.png"
alt="Simplified ML-Agents Scene Block Diagram"
width="700" border="10" />
<img src="images/learning_environment_basic.png"
alt="Simplified ML-Agents Scene Block Diagram"
width="700" border="10" />
organize the Unity scene:
- **Agents** - which is attached to a Unity GameObject (any character within a
scene) and handles generating its observations, performing the actions it
receives and assigning a reward (positive / negative) when appropriate. Each
Agent is linked to exactly one Brain.
- **Brains** - which encapsulates the logic for making decisions for the Agent.
In essence, the Brain is what holds on to the policy for each Agent and
determines which actions the Agent should take at each instance. More
specifically, it is the component that receives the observations and rewards
from the Agent and returns an action.
- **Academy** - which orchestrates the observation and decision making process.
Within the Academy, several environment-wide parameters such as the rendering
quality and the speed at which the environment is run can be specified. The
External Communicator lives within the Academy.
Every Learning Environment will always have one global Academy and one Agent for
every character in the scene. While each Agent must be linked to a Brain, it is
possible for Agents that have similar observations and actions to be linked to
the same Brain. In our sample game, we have two teams each with their own medic.
Thus we will have two Agents in our Learning Environment, one for each medic,
but both of these medics can be linked to the same Brain. Note that these two
medics are linked to the same Brain because their _space_ of observations and
actions are similar. This does not mean that at each instance they will have
identical observation and action _values_. In other words, the Brain defines the
space of all possible observations and actions, while the Agents connected to it
(in this case the medics) can each have their own, unique observation and action
values. If we expanded our game to include tank driver NPCs, then the Agent
attached to those characters cannot share a Brain with the Agent linked to the
medics (medics and drivers have different actions).
<img src="images/learning_environment_example.png"
alt="Example ML-Agents Scene Block Diagram"
border="10" />
<img src="images/learning_environment_example.png"
alt="Example ML-Agents Scene Block Diagram"
border="10" />
We have yet to discuss how the ML-Agents toolkit trains behaviors, and what role
the Python API and External Communicator play. Before we dive into those
details, let's summarize the earlier components. Each character is attached to
an Agent, and each Agent is linked to a Brain. The Brain receives observations
and rewards from the Agent and returns actions. The Academy ensures that all the
- **External** - where decisions are made using the Python API. Here, the
observations and rewards collected by the Brain are forwarded to the Python
API through the External Communicator. The Python API then returns the
corresponding action that needs to be taken by the Agent.
- **Internal** - where decisions are made using an embedded
[TensorFlow](Background-TensorFlow.md) model. The embedded TensorFlow model
represents a learned policy and the Brain directly uses this model to
determine the action for each Agent.
- **Player** - where decisions are made using real input from a keyboard or
controller. Here, a human player is controlling the Agent and the observations
and rewards collected by the Brain are not used to control the Agent.
- **Heuristic** - where decisions are made using hard-coded behavior. This
resembles how most character behaviors are currently defined and can be
helpful for debugging or comparing how an Agent with hard-coded rules compares
to an Agent whose behavior has been trained. In our example, once we have
trained a Brain for the medics we could assign a medic on one team to the
trained Brain and assign the medic on the other team a Heuristic Brain with
hard-coded behaviors. We can then evaluate which medic is more effective.
As currently described, it may seem that the External Communicator and Python
API are only leveraged by the External Brain. This is not true. It is possible
to configure the Internal, Player and Heuristic Brains to also send the
observations, rewards and actions to the Python API through the External
Communicator (a feature called _broadcasting_). As we will see shortly, this
enables additional training modes.
<img src="images/learning_environment.png"
alt="ML-Agents Scene Block Diagram"
border="10" />
<img src="images/learning_environment.png"
alt="ML-Agents Scene Block Diagram"
border="10" />
_An example of how a scene containing multiple Agents and Brains might be
configured._
## Training Modes

### Built-in Training and Inference
As mentioned previously, the ML-Agents toolkit ships with several
implementations of state-of-the-art algorithms for training intelligent agents.
In this mode, the Brain type is set to External during training and Internal
during inference. More specifically, during training, all the medics in the
scene send their observations to the Python API through the External
Communicator (this is the behavior with an External Brain). The Python API
processes these observations and sends back actions for each medic to take.
During training these actions are mostly exploratory to help the Python API
learn the best policy for each medic. Once training concludes, the learned
policy for each medic can be exported. Given that all our implementations are
based on TensorFlow, the learned policy is just a TensorFlow model file. Then
during the inference phase, we switch the Brain type to Internal and include the
TensorFlow model generated from the training phase. Now during the inference
phase, the medics still continue to generate their observations, but instead of
being sent to the Python API, they will be fed into their (internal, embedded)
model to generate the _optimal_ action for each medic to take at every point in
time.
To summarize: our built-in implementations are based on TensorFlow, thus, during
training the Python API uses the observations it receives to learn a TensorFlow
model. This model is then embedded within the Internal Brain during inference to
generate the optimal actions for all Agents linked to that Brain. **Note that
our Internal Brain is currently experimental as it is limited to TensorFlow
models and leverages the third-party
[TensorFlowSharp](https://github.com/migueldeicaza/TensorFlowSharp) library.**
The
[Getting Started with the 3D Balance Ball Example](Getting-Started-with-Balance-Ball.md)
tutorial covers this training mode with the **3D Balance Ball** sample
environment.

### Custom Training and Inference

In the previous mode, the External Brain type was used for training to generate
a TensorFlow model that the Internal Brain type can understand and use. However,
any user of the ML-Agents toolkit can leverage their own algorithms for both
training and inference. In this case, the Brain type would be set to External
for both the training and inference phases, and the behaviors of all the Agents
in the scene will be controlled within Python.
We do not currently have a tutorial highlighting this mode, but you can
learn more about the Python API [here](Python-API.md).
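As a rough illustration of this mode, the sketch below drives every Agent
connected to an External Brain from Python using the API described later in
this documentation; the environment binary name, Brain name, and action size
are placeholders, and the random policy stands in for your own algorithm.

```python
import numpy as np
from mlagents.envs import UnityEnvironment

def my_policy(observations):
    # Stand-in for your own training or inference algorithm:
    # one random 2-dimensional continuous action per Agent.
    return np.random.randn(observations.shape[0], 2)

env = UnityEnvironment(file_name="MyEnvironment")  # placeholder binary name
brain_name = "MyBrain"                             # placeholder External Brain name

info = env.reset(train_mode=True)[brain_name]
for _ in range(100):
    info = env.step(my_policy(info.vector_observations))[brain_name]
env.close()
```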

### Curriculum Learning

This mode is an extension of _Built-in Training and Inference_, and is
particularly helpful when training intricate behaviors for complex environments.
Curriculum learning is a way of training a machine learning model where more
difficult aspects of a problem are gradually introduced in such a way that the
model is always optimally challenged. This idea has been around for a long time,
and it is how we humans typically learn. If you imagine any childhood primary
school education, there is an ordering of classes and topics. Arithmetic is
taught before algebra, for example. Likewise, algebra is taught before calculus.
The skills and knowledge learned in the earlier subjects provide a scaffolding
for later lessons. The same principle can be applied to machine learning, where
training on easier tasks can provide a scaffolding for harder tasks in the
future.
<img src="images/math.png"
alt="Example Math Curriculum"
width="700"
border="10" />
<img src="images/math.png"
alt="Example Math Curriculum"
width="700"
border="10" />
_Example of a mathematics curriculum. Lessons progress from simpler topics to
more complex ones, with each building on the last._
When we think about how reinforcement learning actually works, the learning
signal is reward received occasionally throughout training. The starting point
when training an agent to accomplish a task will be a random policy. That
starting policy will have the agent running in circles, and in complex
environments it will likely never, or only very rarely, achieve the reward. Thus
by simplifying the environment at the beginning of training, we allow the agent
to quickly update the random policy to a more meaningful one that is
successively improved as the environment gradually increases in complexity. In
our example, we can imagine first training the medic when each team only
contains one player, and then iteratively increasing the number of players
(i.e. the environment complexity). The ML-Agents toolkit supports setting custom
environment parameters within the Academy. This allows elements of the
environment related to difficulty or complexity to be dynamically adjusted based
on training progress.
The [Training with Curriculum Learning](Training-Curriculum-Learning.md)
tutorial covers this training mode with the **Wall Jump** sample environment.

### Imitation Learning

It is often more intuitive to simply demonstrate the behavior we want an agent
to perform, rather than attempting to have it learn via trial-and-error methods.
For example, instead of training the medic by setting up its reward function,
this mode allows providing real examples from a game controller on how the medic
should behave. More specifically, in this mode, the Brain type during training
is set to Player and all the actions performed with the controller (in addition
to the agent observations) will be recorded and sent to the Python API. The
imitation learning algorithm will then use these pairs of observations and
actions from the human player to learn a policy. [Video
Link](https://youtu.be/kpb8ZkMBFYs).
The [Training with Imitation Learning](Training-Imitation-Learning.md) tutorial
covers this training mode with the **Banana Collector** sample environment.
## Flexible Training Scenarios

While the discussion so far has mostly focused on training a single agent, with
ML-Agents, several training scenarios are possible. We are excited to see what
kinds of novel and fun environments the community creates. For those new to
training intelligent agents, below are a few examples that can serve as
inspiration:
- Single-Agent. A single agent linked to a single Brain, with its own reward
signal. The traditional way of training an agent. An example is any
single-player game, such as Chicken. [Video
Link](https://www.youtube.com/watch?v=fiQsmdwEGT8&feature=youtu.be).
- Simultaneous Single-Agent. Multiple independent agents with independent reward
signals linked to a single Brain. A parallelized version of the traditional
training scenario, which can speed-up and stabilize the training process.
Helpful when you have multiple versions of the same character in an
environment who should learn similar behaviors. An example might be training a
dozen robot-arms to each open a door simultaneously. [Video
Link](https://www.youtube.com/watch?v=fq0JBaiCYNA).
- Adversarial Self-Play. Two interacting agents with inverse reward signals
linked to a single Brain. In two-player games, adversarial self-play can allow
an agent to become increasingly more skilled, while always having the
perfectly matched opponent: itself. This was the strategy employed when
training AlphaGo, and more recently used by OpenAI to train a human-beating
1-vs-1 Dota 2 agent.
- Cooperative Multi-Agent. Multiple interacting agents with a shared reward
signal linked to either a single or multiple different Brains. In this
scenario, all agents must work together to accomplish a task that cannot be
done alone. Examples include environments where each agent only has access to
partial information, which needs to be shared in order to accomplish the task
or collaboratively solve a puzzle.
- Competitive Multi-Agent. Multiple interacting Agents with inverse reward
signals linked to either a single or multiple different Brains. In this
scenario, agents must compete with one another to either win a competition, or
obtain some limited set of resources. All team sports fall into this scenario.
- Ecosystem. Multiple interacting Agents with independent reward signals linked
to either a single or multiple different Brains. This scenario can be thought
of as creating a small world in which animals with different goals all
interact, such as a savanna in which there might be zebras, elephants and
giraffes, or an autonomous driving simulation within an urban environment.
## Additional Features

Beyond the flexible training scenarios available, the ML-Agents toolkit includes
additional features which improve the flexibility and interpretability of the
training process:
- **On Demand Decision Making** - With the ML-Agents toolkit it is possible to
have agents request decisions only when needed as opposed to requesting
decisions at every step of the environment. This enables training of turn
based games, games where agents must react to events or games where agents can
take actions of variable duration. Switching between making decisions at every
step and making them on demand is one button click away. You can learn more about
the on-demand-decision feature
[here](Learning-Environment-Design-Agents.md#on-demand-decision-making).
- **Memory-enhanced Agents** - In some scenarios, agents must learn to remember
the past in order to make the best decision. When an agent only has partial
observability of the environment, keeping track of past observations can help
the agent learn. We provide an implementation of _Long Short-term Memory_
([LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory)) in our trainers
that enables agents to store memories to be used in future steps. You can
learn more about enabling LSTM during training [here](Feature-Memory.md).
- **Monitoring Agent’s Decision Making** - Since communication in ML-Agents is a
two-way street, we provide an Agent Monitor class in Unity which can display
aspects of the trained Agent, such as the Agent's perception of how well it is
doing (called **value estimates**) within the Unity environment itself. By
leveraging Unity as a visualization tool and providing these outputs in
real-time, researchers and developers can more easily debug an Agent’s
behavior. You can learn more about using the Monitor class
[here](Feature-Monitor.md).
- **Complex Visual Observations** - Unlike other platforms, where the agent’s
observation might be limited to a single vector or image, the ML-Agents
toolkit allows multiple cameras to be used for observations per agent. This
enables agents to learn to integrate information from multiple visual streams.
This can be helpful in several scenarios such as training a self-driving car
which requires multiple cameras with different viewpoints, or a navigational
agent which might need to integrate aerial and first-person visuals. You can
learn more about adding visual observations to an agent
[here](Learning-Environment-Design-Agents.md#multiple-visual-observations).
- **Broadcasting** - As discussed earlier, an External Brain sends the
observations for all its Agents to the Python API by default. This is helpful
for training or inference. Broadcasting is a feature which can be enabled for
the other three modes (Player, Internal, Heuristic) where the Agent
observations and actions are also sent to the Python API (despite the fact
that the Agent is **not** controlled by the Python API). This feature is
leveraged by Imitation Learning, where the observations and actions for a
Player Brain are used to learn the policies of an agent through demonstration.
However, this could also be helpful for the Heuristic and Internal Brains,
particularly when debugging agent behaviors. You can learn more about using
the broadcasting feature
[here](Learning-Environment-Design-Brains.md#using-the-broadcast-feature).
- **Docker Set-up (Experimental)** - To facilitate setting up ML-Agents without
installing Python or TensorFlow directly, we provide a
[guide](Using-Docker.md) on how to create and run a Docker container.
- **Cloud Training on AWS** - To facilitate using the ML-Agents toolkit on
Amazon Web Services (AWS) machines, we provide a
[guide](Training-on-Amazon-Web-Service.md) on how to set up EC2 instances in
addition to a public pre-configured Amazon Machine Image (AMI).
- **Cloud Training on Microsoft Azure** - To facilitate using the ML-Agents
toolkit on Azure machines, we provide a
[guide](Training-on-Microsoft-Azure.md) on how to set up virtual machine
instances in addition to a pre-configured data science image.
## Summary and Next Steps

To briefly summarize: The ML-Agents toolkit enables games and simulations built
in Unity to serve as the platform for training intelligent agents. It is
designed to enable a large variety of training modes and scenarios and comes
packed with several features to enable researchers and developers to leverage
(and enhance) machine learning within Unity.
To help you use ML-Agents, we've created several in-depth tutorials for
[installing ML-Agents](Installation.md),
[getting started](Getting-Started-with-Balance-Ball.md) with the 3D Balance Ball
environment (one of our many
[sample environments](Learning-Environment-Examples.md)) and
[making your own environment](Learning-Environment-Create-New.md).

135
docs/Migrating.md


# Migrating
## Migrating from ML-Agents toolkit v0.4 to v0.5
### Important
* The Unity project `unity-environment` has been renamed `UnitySDK`.
* The `python` folder has been renamed to `ml-agents`. It now contains two
packages, `mlagents.envs` and `mlagents.trainers`. `mlagents.envs` can be used
to interact directly with a Unity environment, while `mlagents.trainers`
contains the classes for training agents.
* The supported Unity version has changed from `2017.1 or later` to `2017.4
or later`. 2017.4 is an LTS (Long Term Support) version that helps us
maintain good quality and support. Earlier versions of Unity might still work,
but you may encounter an
[error](FAQ.md#instance-of-corebraininternal-couldnt-be-created) listed here.
### Unity API
* Discrete Actions now use [branches](https://arxiv.org/abs/1711.08946). You can
now specify concurrent discrete actions. You will need to update the Brain
Parameters in the Brain Inspector in all your environments that use discrete
actions. Refer to the
[discrete action documentation](Learning-Environment-Design-Agents.md#discrete-action-space)
for more information.
### Python API
* In order to run a training session, you can now use the command
`mlagents-learn` instead of `python3 learn.py` after installing the `mlagents`
packages. This change is documented
[here](Training-ML-Agents.md#training-with-mlagents-learn). For example,
if we previously ran
```sh
python3 learn.py 3DBall --train
```
from the `python/` directory, we now run
```sh
mlagents-learn config/trainer_config.yaml --env=3DBall --train
```
from the directory where we installed the ML-Agents Toolkit.
* It is now required to specify the path to the yaml trainer configuration file
when running `mlagents-learn`. For an example trainer configuration file, see
[trainer_config.yaml](../config/trainer_config.yaml). An example of passing
a trainer configuration to `mlagents-learn` is shown above.
* The environment name is now passed through the `--env` option.
* Curriculum learning has been changed. Refer to the
[curriculum learning documentation](Training-Curriculum-Learning.md)
for detailed information. In summary:
* Curriculum files for the same environment must now be placed into a folder.
Each curriculum file should be named after the brain whose curriculum it
specifies.
* `min_lesson_length` now specifies the minimum number of episodes in a lesson
and affects reward thresholding.
* It is no longer necessary to specify the `Max Steps` of the Academy to use
curriculum learning.
## Migrating from ML-Agents toolkit v0.3 to v0.4
### Unity API
* `using MLAgents;` needs to be added in all of the C# scripts that use
ML-Agents.
### Python API
* We've changed some of the Python package dependencies in the `requirements.txt`
file. Make sure to run `pip3 install .` within your `ml-agents/python` folder
to update your Python packages.
## Migrating from ML-Agents toolkit v0.2 to v0.3
There are a large number of new features and improvements in the ML-Agents
toolkit v0.3 which change both the training process and Unity API in ways which
will cause incompatibilities with environments made using older versions. This
page is designed to highlight those changes for users familiar with v0.1 or v0.2
in order to ensure a smooth transition.
### Important
* The ML-Agents toolkit is no longer compatible with Python 2.
### Python Training
* The training script `ppo.py` and `PPO.ipynb` Python notebook have been
replaced with a single `learn.py` script as the launching point for training
with ML-Agents. For more information on using `learn.py`, see
[here](Training-ML-Agents.md#training-with-mlagents-learn).
* Hyperparameters for training Brains are now stored in the
`trainer_config.yaml` file. For more information on using this file, see
[here](Training-ML-Agents.md#training-config-file).
### Unity API
* Modifications to an Agent's rewards must now be done using either
`AddReward()` or `SetReward()`.
* Setting an Agent to done now requires the use of the `Done()` method.
* `CollectStates()` has been replaced by `CollectObservations()`, which now no
longer returns a list of floats.
* To collect observations, call `AddVectorObs()` within `CollectObservations()`.
Note that you can call `AddVectorObs()` with floats, integers, lists and
arrays of floats, Vector3 and Quaternions.
* `AgentStep()` has been replaced by `AgentAction()`.
* `WaitTime()` has been removed.
* The `Frame Skip` field of the Academy is replaced by the Agent's `Decision
Frequency` field, enabling the Agent to make decisions at different frequencies.
* The names of the inputs in the Internal Brain have been changed. You must
replace `state` with `vector_observation` and `observation` with
`visual_observation`. In addition, you must remove the `epsilon` placeholder.
### Semantics
In order to more closely align with the terminology used in the Reinforcement
Learning field, and to be more descriptive, we have changed the names of some of
the concepts used in ML-Agents. The changes are highlighted in the table below.
| Old - v0.2 and earlier | New - v0.3 and later |
| --- | --- |

160
docs/Python-API.md


# Unity ML-Agents Python Interface and Trainers
The `mlagents` Python package is part of the [ML-Agents
Toolkit](https://github.com/Unity-Technologies/ml-agents). `mlagents` provides a
Python API that allows direct interaction with the Unity game engine as well as
a collection of trainers and algorithms to train agents in Unity environments.
The `mlagents` Python package contains two components: a low level API which
allows you to interact directly with a Unity Environment (`mlagents.envs`) and
an entry point to train (`mlagents-learn`) which allows you to train agents in
Unity Environments using our implementations of reinforcement learning or
imitation learning.
## mlagents.envs
The ML-Agents Toolkit provides a Python API for controlling the Agent simulation
loop of an environment or game built with Unity. This API is used by the
training algorithms inside the ML-Agent Toolkit, but you can also write your own
Python programs using this API.
The key objects in the Python API include:
- **UnityEnvironment** — the main interface between the Unity application and
your code. Use UnityEnvironment to start and control a simulation or training
session.
- **BrainInfo** — contains all the data from Agents in the simulation, such as
observations and rewards.
- **BrainParameters** — describes the data elements in a BrainInfo object. For
example, provides the array length of an observation in BrainInfo.
These classes are all defined in the `ml-agents/mlagents/envs` folder of
the ML-Agents SDK.

For a simple example of using the Python API to interact with a Unity
environment, see the Basic [Jupyter](Background-Jupyter.md) notebook
(`python/Basics.ipynb`), which opens an environment, runs a few simulation
steps taking random actions, and closes the environment.
To communicate with an Agent in a Unity environment from a Python program, the
Agent must either use an **External** Brain or use a Brain that is broadcasting
(has its **Broadcast** property set to true). Your code is expected to return
actions for Agents with external Brains, but can only observe broadcasting
Brains (the information you receive for an Agent is the same in both cases). See
[Using the Broadcast Feature](Learning-Environment-Design-Brains.md#using-the-broadcast-feature).
_Notice: Currently communication between Unity and Python takes place over an
open socket without authentication. As such, please make sure that the network
where training takes place is secure. This will be addressed in a future
release._
### Loading a Unity Environment
Python-side communication happens through `UnityEnvironment` which is located in
`ml-agents/mlagents/envs`. To load a Unity environment from a built binary
file, put the file in the same directory as `envs`. For example, if the filename
of your Unity environment is 3DBall.app, in python, run:
from mlagents.envs import UnityEnvironment
- `file_name` is the name of the environment binary (located in the root
directory of the python project).
- `worker_id` indicates which port to use for communication with the
environment. For use in parallel training regimes such as A3C.
- `seed` indicates the seed to use when generating random numbers during the
training process. In environments which do not involve physics calculations,
setting the seed enables reproducible experimentation by ensuring that the
environment and trainers utilize the same random seed.
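Assuming the 3DBall binary sits next to your script, a minimal construction call
might look like the sketch below; the `worker_id` and `seed` values are
illustrative.

```python
from mlagents.envs import UnityEnvironment

# Launch the 3DBall binary and open a communication channel to it.
env = UnityEnvironment(file_name="3DBall", worker_id=0, seed=1)
```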
If you want to directly interact with the Editor, you need to use
`file_name=None`, then press the :arrow_forward: button in the Editor when the
message _"Start training by pressing the Play button in the Unity Editor"_ is
displayed on the screen.
### Interacting with a Unity Environment
- **`visual_observations`** : A list of 4 dimensional numpy arrays. Matrix n of
the list corresponds to the n<sup>th</sup> observation of the Brain.
- **`vector_observations`** : A two dimensional numpy array of dimension `(batch
size, vector observation size)`.
- **`text_observations`** : A list of string corresponding to the Agents text
observations.
- **`memories`** : A two dimensional numpy array of dimension `(batch size,
memory size)` which corresponds to the memories sent at the previous step.
- **`rewards`** : A list as long as the number of Agents using the Brain
containing the rewards they each obtained at the previous step.
- **`local_done`** : A list as long as the number of Agents using the Brain
containing `done` flags (whether or not the Agent is done).
- **`max_reached`** : A list as long as the number of Agents using the Brain
containing true if the Agents reached their max steps.
- **`agents`** : A list of the unique ids of the Agents using the Brain.
- **`previous_actions`** : A two dimensional numpy array of dimension `(batch
size, vector action size)` if the vector action space is continuous and
`(batch size, number of branches)` if the vector action space is discrete.
Once loaded, your UnityEnvironment object, referenced by the variable `env` in
this example, can be used in the following ways:
Prints all parameters relevant to the loaded environment and the external
Brains.
Sends a reset signal to the environment, and provides a dictionary mapping
Brain names to BrainInfo objects.
- `train_model` indicates whether to run the environment in train (`True`) or
test (`False`) mode.
- `config` is an optional dictionary of configuration flags specific to the
environment. For generic environments, `config` can be ignored. `config` is
a dictionary of strings to floats where the keys are the names of the
`resetParameters` and the values are their corresponding float values.
Define the reset parameters on the Academy Inspector window in the Unity
Editor.
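For example, a reset that also overrides an Academy reset parameter might look
like the sketch below; `wall_height` is a hypothetical reset parameter defined
in the Academy Inspector.

```python
# Reset in training mode and override a hypothetical reset parameter.
brain_infos = env.reset(train_mode=True, config={"wall_height": 4.0})
```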
Sends a step signal to the environment using the actions. For each Brain:
- `action` can be a one dimensional array or a two dimensional array if you have
multiple Agents per Brain.
- `memory` is an optional input that can be used to send a list of floats per
Agent to be retrieved at the next step.
- `text_action` is an optional input that can be used to send a single string
per Agent.
Returns a dictionary mapping Brain names to BrainInfo objects.
For example, to access the BrainInfo belonging to a Brain called
'brain_name', and the BrainInfo field 'vector_observations':
```python
env_info = env.step(action)
brain_info = env_info["brain_name"]
observations = brain_info.vector_observations
```
Note that if you have more than one external Brain in the environment, you
must provide dictionaries from Brain names to arrays for `action`, `memory`
and `value`. For example: If you have two external Brains named `brain1` and
`brain2` each with one Agent taking two continuous actions, then you can
have:
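A minimal sketch of such a call, with illustrative action values, might look
like:

```python
import numpy as np

# One entry per External Brain; each array holds two continuous actions
# for the single Agent using that Brain.
action = {"brain1": np.array([1.0, 2.0]), "brain2": np.array([3.0, 4.0])}
brain_infos = env.step(action)
```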
Returns a dictionary mapping Brain names to BrainInfo objects.
- **Close : `env.close()`**
Sends a shutdown signal to the environment and closes the communication
socket.
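Putting these calls together, the sketch below runs a short random-action loop
against a single External Brain; the Brain name and the size of the continuous
action space are assumptions about the environment being loaded.

```python
import numpy as np
from mlagents.envs import UnityEnvironment

env = UnityEnvironment(file_name="3DBall")  # assumes the binary sits next to this script
brain_name = "Ball3DBrain"                  # assumed name of the External Brain
action_size = 2                             # assumed size of the continuous action space

info = env.reset(train_mode=True)[brain_name]
for _ in range(500):
    # One random continuous action per Agent currently using this Brain.
    actions = np.random.randn(len(info.agents), action_size)
    info = env.step(actions)[brain_name]
    # info.rewards and info.local_done report each Agent's reward and done flag.
env.close()
```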
## mlagents-learn
For more detailed documentation on using `mlagents-learn`, check out
[Training ML-Agents](Training-ML-Agents.md).

80
docs/Readme.md


# Unity ML-Agents Toolkit Documentation
## Installation & Set-up
* [Installation](Installation.md)
* [Background: Jupyter Notebooks](Background-Jupyter.md)
* [Docker Set-up](Using-Docker.md)
* [Basic Guide](Basic-Guide.md)
## Getting Started

* [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
  * [Background: Unity](Background-Unity.md)
  * [Background: Machine Learning](Background-Machine-Learning.md)
  * [Background: TensorFlow](Background-TensorFlow.md)
* [Getting Started with the 3D Balance Ball Environment](Getting-Started-with-Balance-Ball.md)
* [Example Environments](Learning-Environment-Examples.md)
## Creating Learning Environments

* [Making a New Learning Environment](Learning-Environment-Create-New.md)
* [Designing a Learning Environment](Learning-Environment-Design.md)
  * [Agents](Learning-Environment-Design-Agents.md)
  * [Academy](Learning-Environment-Design-Academy.md)
  * [Brains](Learning-Environment-Design-Brains.md):
    [Player](Learning-Environment-Design-Player-Brains.md),
    [Heuristic](Learning-Environment-Design-Heuristic-Brains.md),
    [Internal & External](Learning-Environment-Design-External-Internal-Brains.md)
* [Learning Environment Best Practices](Learning-Environment-Best-Practices.md)
* [Using the Monitor](Feature-Monitor.md)
* [Using an Executable Environment](Learning-Environment-Executable.md)
* [TensorFlowSharp in Unity (Experimental)](Using-TensorFlow-Sharp-in-Unity.md)
## Training

* [Training ML-Agents](Training-ML-Agents.md)
* [Training with Proximal Policy Optimization](Training-PPO.md)
* [Training with Curriculum Learning](Training-Curriculum-Learning.md)
* [Training with Imitation Learning](Training-Imitation-Learning.md)
* [Training with LSTM](Feature-Memory.md)
* [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md)
* [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md)
* [Using TensorBoard to Observe Training](Using-Tensorboard.md)
## Help

* [Migrating from earlier versions of ML-Agents](Migrating.md)
* [Frequently Asked Questions](FAQ.md)
* [ML-Agents Glossary](Glossary.md)
* [Limitations](Limitations.md)
## API Docs

* [API Reference](API-Reference.md)
* [How to use the Python API](Python-API.md)
* [Wrapping Learning Environment as a Gym](../gym-unity/README.md)

143
docs/Training-Curriculum-Learning.md


## Sample Environment
Imagine a task in which an agent needs to scale a wall to arrive at a goal. The
starting point when training an agent to accomplish this task will be a random
policy. That starting policy will have the agent running in circles, and will
likely never, or very rarely scale the wall properly to the achieve the reward.
If we start with a simpler task, such as moving toward an unobstructed goal,
then the agent can easily learn to accomplish the task. From there, we can
slowly add to the difficulty of the task by increasing the size of the wall,
until the agent can complete the initially near-impossible task of scaling the
wall. We are including just such an environment with the ML-Agents toolkit 0.2,
called __Wall Jump__.
_Demonstration of a curriculum training scenario in which a progressively taller
wall obstructs the path to the goal._
To see this in action, observe the two learning curves below. Each displays the
reward over time for an agent trained using PPO with the same set of training
hyperparameters. The difference is that one agent was trained using the
full-height wall version of the task, and the other agent was trained using the
curriculum version of the task. As you can see, without using curriculum
learning the agent has a lot of difficulty. We think that by using well-crafted
curricula, agents trained using reinforcement learning will be able to
accomplish tasks otherwise much more difficult.
Each Brain in an environment can have a corresponding curriculum. These
curriculums are held in what we call a metacurriculum. A metacurriculum allows
different Brains to follow different curriculums within the same environment.
### Specifying a Metacurriculum
We first create a folder inside `config/curricula/` for the environment we want
to use curriculum learning with. For example, if we were creating a
metacurriculum for Wall Jump, we would create the folder
`config/curricula/wall-jump/`. We will place our curriculums inside this folder.
### Specifying a Curriculum
In order to define a curriculum, the first step is to decide which parameters of
the environment will vary. In the case of the Wall Jump environment, what varies
is the height of the wall. We define this as a `Reset Parameter` in the Academy
object of our scene, and by doing so it becomes adjustable via the Python API.
Rather than adjusting it by hand, we will create a JSON file which
describes the structure of the curriculum. Within it, we can specify at which
points in the training process our wall height will change, either based on the
percentage of training steps which have taken place, or on the average reward
the agent has received in the recent past. Below is an example curriculum for
the BigWallBrain in the Wall Jump environment.
"min_lesson_length" : 2,
"signal_smoothing" : true,
"parameters" :
"min_lesson_length" : 100,
"signal_smoothing" : true,
"parameters" :
"big_wall_max_height" : [4.0, 7.0, 8.0, 8.0],
"small_wall_height" : [1.5, 2.0, 2.5, 4.0]
"big_wall_max_height" : [4.0, 7.0, 8.0, 8.0]
* `measure` - What to measure learning progress, and advancement in lessons by.
  * `reward` - Uses a measure of the received reward.
  * `progress` - Uses the ratio of steps/max_steps.
* `thresholds` (float array) - Points in value of `measure` where lesson should
be increased.
* `min_lesson_length` (int) - The minimum number of episodes that should be
completed before the lesson can change. If `measure` is set to `reward`, the
average cumulative reward of the last `min_lesson_length` episodes will be
used to determine if the lesson should change. Must be nonnegative.
__Important__: the average reward that is compared to the thresholds is
different than the mean reward that is logged to the console. For example,
if `min_lesson_length` is `100`, the lesson will increment after the average
cumulative reward of the last `100` episodes exceeds the current threshold.
The mean reward logged to the console is dictated by the `summary_freq`
parameter in the
[trainer configuration file](Training-ML-Agents.md#training-config-file).
* `signal_smoothing` (true/false) - Whether to weight the current progress
measure by previous values.
* If `true`, weighting will be 0.75 (new) 0.25 (old); a short sketch of this
update appears after this list.
* `parameters` (dictionary of key:string, value:float array) - Corresponds to
Academy reset parameters to control. Length of each array should be one
greater than number of thresholds.
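Putting the settings above together, the lesson-advancement logic can be
sketched roughly as follows. This is an illustrative Python sketch, not the
toolkit's actual implementation; the function and variable names are made up.

```python
# Illustrative sketch of how a lesson might advance; not the toolkit's code.
def maybe_advance_lesson(curriculum, lesson, smoothed_measure, new_measure,
                         episodes_in_lesson):
    """Return the (possibly incremented) lesson and the smoothed measure."""
    if curriculum["signal_smoothing"]:
        # Weight the newest measurement 0.75 and the previous value 0.25.
        smoothed_measure = 0.75 * new_measure + 0.25 * smoothed_measure
    else:
        smoothed_measure = new_measure

    thresholds = curriculum["thresholds"]
    if (lesson < len(thresholds)
            and episodes_in_lesson >= curriculum["min_lesson_length"]
            and smoothed_measure > thresholds[lesson]):
        lesson += 1  # the next set of reset parameter values takes effect
    return lesson, smoothed_measure


example = {"thresholds": [0.1, 0.3, 0.5],
           "min_lesson_length": 100,
           "signal_smoothing": True}
print(maybe_advance_lesson(example, 0, 0.0, 0.2, 120))  # -> (1, 0.15)
```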
Once our curriculum is defined, we have to use the reset parameters we defined
and modify the environment from the Agent's `AgentReset()` function. See
[WallJumpAgent.cs](https://github.com/Unity-Technologies/ml-agents/blob/master/UnitySDK/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs)
for an example. Note that if the Academy's __Max Steps__ is not set to some
positive number the environment will never be reset. The Academy must reset
for the environment to reset.
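For reference, any Academy reset parameter can also be set by hand through the
Python API when the environment is reset, which is what the curriculum trainer
does for you between lessons. A minimal sketch, assuming a built Wall Jump
executable (the file name below is hypothetical):

```python
from mlagents.envs import UnityEnvironment

# "WallJump" is a hypothetical executable name; point this at your own build.
env = UnityEnvironment(file_name="WallJump")

# Academy reset parameters can be overridden on each reset.
info = env.reset(train_mode=True, config={"big_wall_max_height": 4.0})

env.close()
```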
We will save this file into our metacurriculum folder with the name of its
corresponding Brain. For example, in the Wall Jump environment, there are two
Brains---BigWallBrain and SmallWallBrain. If we want to define a curriculum for
the BigWallBrain, we will save `BigWallBrain.json` into
`config/curricula/wall-jump/`.
### Training with a Curriculum
Once we have specified our metacurriculum and curriculums, we can launch
`mlagents-learn` using the `--curriculum` flag to point to the metacurriculum
folder and PPO will train using Curriculum Learning. For example, to train
agents in the Wall Jump environment with curriculum learning, we can run
```sh
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall-jump/ --run-id=wall-jump-curriculum --train
```
We can then keep track of the current lessons and progress via TensorBoard.

78      docs/Training-Imitation-Learning.md

# Imitation Learning
It is often more intuitive to simply demonstrate the behavior we want an agent
to perform, rather than attempting to have it learn via trial-and-error methods.
Consider our
[running example](ML-Agents-Overview.md#running-example-training-npc-behaviors)
of training a medic NPC: instead of indirectly training a medic with the help
of a reward function, we can give the medic real world examples of observations
from the game and actions from a game controller to guide the medic's behavior.
More specifically, in this mode, the Brain type during training is set to Player
and all the actions performed with the controller (in addition to the agent
observations) will be recorded and sent to the Python API. The imitation
learning algorithm will then use these pairs of observations and actions from
the human player to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs).
There are a variety of possible imitation learning algorithms which can be used;
the simplest of them is Behavioral Cloning. It works by collecting training data
from a teacher, and then simply using it to directly learn a policy, in the same
way that supervised learning for image classification or other traditional
machine learning tasks works.
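As a rough illustration of the idea (and not the toolkit's implementation),
behavioral cloning is ordinary supervised learning on recorded
(observation, action) pairs; a minimal NumPy sketch with made-up data:

```python
import numpy as np

rng = np.random.default_rng(0)

# Pretend these came from a human "teacher" playing the game:
# 8-dimensional vector observations and one of two discrete actions.
teacher_obs = rng.normal(size=(1000, 8))
teacher_actions = (teacher_obs[:, 0] > 0).astype(float)

# A logistic-regression "policy" fit by plain gradient descent.
weights = np.zeros(8)
for _ in range(500):
    probs = 1.0 / (1.0 + np.exp(-(teacher_obs @ weights)))
    grad = teacher_obs.T @ (probs - teacher_actions) / len(teacher_actions)
    weights -= 0.5 * grad

predicted = (teacher_obs @ weights > 0).astype(float)
print("agreement with teacher:", (predicted == teacher_actions).mean())
```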
1. In order to use imitation learning in a scene, the first thing you will need
is to create two Brains, one which will be the "Teacher," and the other which
will be the "Student." We will assume that the names of the Brain
`GameObject`s are "Teacher" and "Student" respectively.
2. Set the "Teacher" Brain to Player mode, and properly configure the inputs to
map to the corresponding actions. **Ensure that "Broadcast" is checked within
the Brain inspector window.**
3. Set the "Student" Brain to External mode.
4. Link the Brains to the desired Agents (one Agent as the teacher and at least
one Agent as a student).
5. In `config/trainer_config.yaml`, add an entry for the "Student" Brain. Set
the `trainer` parameter of this entry to `imitation`, and the
`brain_to_imitate` parameter to the name of the teacher Brain: "Teacher".
Additionally, set `batches_per_epoch`, which controls how much training to do
each moment. Increase the `max_steps` option if you'd like to keep training
the Agents for a longer period of time.
6. Launch the training process with `mlagents-learn config/trainer_config.yaml
--train --slow`, and press the :arrow_forward: button in Unity when the
message _"Start training by pressing the Play button in the Unity Editor"_ is
displayed on the screen.
7. From the Unity window, control the Agent with the Teacher Brain by providing
"teacher demonstrations" of the behavior you would like to see.
8. Watch as the Agent(s) with the student Brain attached begin to behave
similarly to the demonstrations.
9. Once the Student Agents are exhibiting the desired behavior, end the training
process with `CTRL+C` from the command line.
10. Move the resulting `*.bytes` file into the `TFModels` subdirectory of the
Assets folder (or a subdirectory within Assets of your choosing), and use it
with the `Internal` Brain.
We provide a convenience utility, the `BC Teacher Helper` component, that you
can add to the Teacher Agent.
<img src="images/bc_teacher_helper.png"
alt="BC Teacher Helper"
width="375" border="10" />
This utility enables you to use keyboard shortcuts to do the following:

1. To start and stop recording experiences. This is useful in case you'd like to
interact with the game _but not have the agents learn from these
interactions_. The default command to toggle this is to press `R` on the
keyboard.
2. Reset the training buffer. This enables you to instruct the agents to forget
their buffer of recent experiences. This is useful if you'd like to get them
to quickly learn a new behavior. The default command to reset the buffer is
to press `C` on the keyboard.

228      docs/Training-ML-Agents.md

# Training ML-Agents
The ML-Agents toolkit conducts training using an external Python training
process. During training, this external process communicates with the Academy
object in the Unity scene to generate a block of agent experiences. These
experiences become the training set for a neural network used to optimize the
agent's policy (which is essentially a mathematical function mapping
observations to actions). In reinforcement learning, the neural network
optimizes the policy by maximizing the expected rewards. In imitation learning,
the neural network optimizes the policy to achieve the smallest difference
between the actions chosen by the agent trainee and the actions chosen by the
expert in the same situation.
The output of the training process is a model file containing the optimized
policy. This model file is a TensorFlow data graph containing the mathematical
operations and the optimized weights selected during the training process. You
can use the generated model file with the Internal Brain type in your Unity
project to decide the best course of action for an agent.
Use the command `mlagents-learn` to train your agents. This command is installed
with the `mlagents` package and its implementation can be found at
`ml-agents/mlagents/trainers/learn.py`. The [configuration file](#training-config-file),
`config/trainer_config.yaml` specifies the hyperparameters used during training.
You can edit this file with a text editor to add a specific configuration for
each Brain.
For a broader overview of reinforcement learning, imitation learning and the
ML-Agents training process, see [ML-Agents Toolkit
Overview](ML-Agents-Overview.md).
## Training with mlagents-learn
Use the `mlagents-learn` command to train agents. `mlagents-learn` supports
training with
[reinforcement learning](Background-Machine-Learning.md#reinforcement-learning),
[curriculum learning](Training-Curriculum-Learning.md),
and [behavioral cloning imitation learning](Training-Imitation-Learning.md).
Run `mlagents-learn` from the command line to launch the training process. Use
the command line patterns and the `config/trainer_config.yaml` file to control
training options.
```sh
mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier> --train
```
where
* `<trainer-config-file>` is the file path of the trainer configuration yaml.
* `<env_name>` __(Optional)__ is the name (including path) of your Unity
executable containing the agents to be trained. If `<env_name>` is not passed,
the training will happen in the Editor. Press the :arrow_forward: button in
Unity when the message _"Start training by pressing the Play button in the
Unity Editor"_ is displayed on the screen.
* `<run-identifier>` is an optional identifier you can use to identify the
results of individual training runs.
For example, suppose you have a project in Unity named "CatsOnBicycles" which
contains agents ready to train. To perform the training:
1. [Build the project](Learning-Environment-Executable.md), making sure that you
only include the training scene.
2. Open a terminal or console window.
3. Navigate to the directory where you installed the ML-Agents Toolkit.
4. Run the following to launch the training process using the path to the Unity
environment you built in step 1:
```sh
mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1 --train
```
During a training session, the training program prints out and saves updates at
regular intervals (specified by the `summary_freq` option). The saved statistics
are grouped by the `run-id` value so you should assign a unique id to each
training run if you plan to view the statistics. You can view these statistics
using TensorBoard during or after training by running the following command:
```sh
tensorboard --logdir=summaries
```
When training is finished, you can find the saved model in the `models` folder
under the assigned run-id — in the cats example, the path to the model would be
`models/cob_1/CatsOnBicycles_cob_1.bytes`.
While this example used the default training hyperparameters, you can edit the
[trainer_config.yaml file](#training-config-file) with a text editor to set
different values.
In addition to passing the path of the Unity executable containing your training
environment, you can set the following command line options when invoking
`mlagents-learn`:
* `--env=<env>` - Specify an executable environment to train.
* `--curriculum=<file>` – Specify a curriculum JSON file for defining the
lessons for curriculum training. See [Curriculum
Training](Training-Curriculum-Learning.md) for more information.
* `--keep-checkpoints=<n>` – Specify the maximum number of model checkpoints to
keep. Checkpoints are saved after the number of steps specified by the
`save-freq` option. Once the maximum number of checkpoints has been reached,
the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
* `--lesson=<n>` – Specify which lesson to start with when performing curriculum
training. Defaults to 0.
* `--load` – If set, the training code loads an already trained model to
initialize the neural network before training. The learning code looks for the
model in `models/<run-id>/` (which is also where it saves models at the end of
training). When not set (the default), the neural network weights are randomly
initialized and an existing model is not loaded.
* `--num-runs=<n>` - Sets the number of concurrent training sessions to perform.
Defaults to 1. Set it to a higher value when benchmarking performance and
multiple training sessions are desired. Training sessions are independent, and
do not improve learning performance.
* `--run-id=<path>` – Specifies an identifier for each training run. This
identifier is used to name the subdirectories in which the trained model and
summary statistics are saved as well as the saved model itself. The default id
is "ppo". If you use TensorBoard to view the training statistics, always set a
unique run-id for each training run. (The statistics for all runs with the
same id are combined as if they were produced by the same session.)
* `--save-freq=<n>` – Specifies how often (in steps) to save the model during
training. Defaults to 50000.
* `--seed=<n>` – Specifies a number to use as a seed for the random number
generator used by the training code.
* `--slow` – Specify this option to run the Unity environment at normal, game
speed. The `--slow` mode uses the **Time Scale** and **Target Frame Rate**
specified in the Academy's **Inference Configuration**. By default, training
runs using the speeds specified in your Academy's **Training Configuration**.
See
[Academy Properties](Learning-Environment-Design-Academy.md#academy-properties).
* `--train` – Specifies whether to train the model or only run in inference
mode. When training, **always** use the `--train` option.
* `--worker-id=<n>` – When you are running more than one training environment at
the same time, assign each a unique worker-id number. The worker-id is added
to the communication port opened between the current instance of
`mlagents-learn` and the ExternalCommunicator object in the Unity environment.
Defaults to 0.
* `--docker-target-name=<dt>` – The Docker Volume on which to store curriculum,
executable and model files. See [Using Docker](Using-Docker.md).
* `--no-graphics` - Specify this option to run the Unity executable in
`-batchmode` without initializing the graphics driver. Use this only if your
training doesn't involve visual observations (reading from pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
The training config file, `config/trainer_config.yaml` specifies the training
method, the hyperparameters, and a few additional values to use during training.
The file is divided into sections. The **default** section defines the default
values for all the available settings. You can also add new sections to override
these defaults to train specific Brains. Name each of these override sections
after the GameObject containing the Brain component that should use these
settings. (This GameObject will be a child of the Academy in your scene.)
Sections for the example environments are included in the provided config file.
| **Setting** | **Description** | **Applies To Trainer\*** |
| :------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- |
| batch_size | The number of experiences in each iteration of gradient descent. | PPO, BC |
| batches_per_epoch | In imitation learning, the number of batches of training examples to collect before training the model. | BC |
| beta | The strength of entropy regularization. | PPO |
| brain\_to\_imitate | For imitation learning, the name of the GameObject containing the Brain component to imitate. | BC |
| buffer_size | The number of experiences to collect before updating the policy model. | PPO |
| curiosity\_enc\_size | The size of the encoding to use in the forward and inverse models in the Curiosity module.                                                                              | PPO                       |
| curiosity_strength | Magnitude of intrinsic reward generated by Intrinsic Curiosity Module. | PPO |
| epsilon | Influences how rapidly the policy can evolve during training. | PPO |
| gamma | The reward discount rate for the Generalized Advantage Estimator (GAE). | PPO |
| hidden_units | The number of units in the hidden layers of the neural network. | PPO, BC |
| lambd | The regularization parameter. | PPO |
| learning_rate | The initial learning rate for gradient descent. | PPO, BC |
| max_steps | The maximum number of simulation steps to run during a training session. | PPO, BC |
| memory_size | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC |
| normalize | Whether to automatically normalize observations. | PPO |
| num_epoch | The number of passes to make through the experience buffer when performing gradient descent optimization. | PPO |
| num_layers | The number of hidden layers in the neural network. | PPO, BC |
| sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC |
| summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, BC |
| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, BC |
| trainer | The type of training to perform: "ppo" or "imitation". | PPO, BC |
| use_curiosity | Train using an additional intrinsic reward signal generated from Intrinsic Curiosity Module. | PPO |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, BC |
\*PPO = Proximal Policy Optimization, BC = Behavioral Cloning (Imitation)
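To make the override behavior concrete, here is a small illustrative sketch of
how a Brain-specific section overrides the defaults. It assumes PyYAML is
installed; the configuration below is made up and is not the shipped
`config/trainer_config.yaml`.

```python
import yaml

raw = yaml.safe_load("""
default:
    trainer: ppo
    batch_size: 1024
    summary_freq: 1000
StudentBrain:
    trainer: imitation
    brain_to_imitate: TeacherBrain
    batches_per_epoch: 5
""")

def settings_for(brain_name):
    # Start from the defaults, then apply the Brain-specific overrides.
    merged = dict(raw["default"])
    merged.update(raw.get(brain_name, {}))
    return merged

print(settings_for("StudentBrain")["trainer"])    # imitation
print(settings_for("SomeOtherBrain")["trainer"])  # ppo
```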
For specific advice on setting hyperparameters based on the type of training you
are conducting, see:
* [Training with PPO](Training-PPO.md)
* [Using Recurrent Neural Networks](Feature-Memory.md)

You can also compare the
[example environments](Learning-Environment-Examples.md)
to the corresponding sections of the `config/trainer_config.yaml` file for each
example to see how the hyperparameters and other configuration variables have
been changed from the defaults.

218      docs/Training-PPO.md

# Training with Proximal Policy Optimization
ML-Agents uses a reinforcement learning technique called
[Proximal Policy Optimization (PPO)](https://blog.openai.com/openai-baselines-ppo/).
PPO uses a neural network to approximate the ideal function that maps an agent's
observations to the best action an agent can take in a given state. The
ML-Agents PPO algorithm is implemented in TensorFlow and runs in a separate
Python process (communicating with the running Unity application over a socket).
See [Training ML-Agents](Training-ML-Agents.md) for instructions on running the
training program, `mlagents-learn`.
If you are using the recurrent neural network (RNN) to utilize memory, see
[Using Recurrent Neural Networks](Feature-Memory.md) for RNN-specific training
details.
If you are using curriculum training to pace the difficulty of the learning task
presented to an agent, see [Training with Curriculum
Learning](Training-Curriculum-Learning.md).
For information about imitation learning, which uses a different training
algorithm, see
[Training with Imitation Learning](Training-Imitation-Learning.md).
Successfully training a Reinforcement Learning model often involves tuning the
training hyperparameters. This guide contains some best practices for tuning the
training process when the default parameters don't seem to be giving the level
of performance you would like.
### Gamma
`gamma` corresponds to the discount factor for future rewards. This can be
thought of as how far into the future the agent should care about possible
rewards. In situations when the agent should be acting in the present in order
to prepare for rewards in the distant future, this value should be large. In
cases when rewards are more immediate, it can be smaller.
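To get a feel for what this means, a reward `n` steps in the future is worth
`gamma ** n` of its value today; a quick illustrative calculation:

```python
# Illustrative only: how much a reward 100 steps away is worth now.
def discounted_value(reward, steps_in_future, gamma):
    return reward * gamma ** steps_in_future

for gamma in (0.8, 0.99, 0.995):
    print(gamma, discounted_value(1.0, 100, gamma))
# gamma=0.8  -> ~2e-10  (the agent effectively ignores distant rewards)
# gamma=0.99 -> ~0.37   (distant rewards still matter)
```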
### Lambda
`lambd` corresponds to the `lambda` parameter used when calculating the
Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This
can be thought of as how much the agent relies on its current value estimate
when calculating an updated value estimate. Low values correspond to relying
more on the current value estimate (which can be high bias), and high values
correspond to relying more on the actual rewards received in the environment
(which can be high variance). The parameter provides a trade-off between the
two, and the right value can lead to a more stable training process.
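For intuition, here is a minimal sketch of Generalized Advantage Estimation
(not the toolkit's implementation); the rewards and value estimates below are
made up:

```python
def gae(rewards, values, gamma=0.99, lambd=0.95):
    """values has one more entry than rewards (the bootstrap value)."""
    advantages = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD error
        # lambd close to 0 trusts the current value estimate; close to 1
        # trusts the actual rewards received further into the future.
        running = delta + gamma * lambd * running
        advantages[t] = running
    return advantages

print(gae(rewards=[0.0, 0.0, 1.0], values=[0.1, 0.3, 0.8, 0.0]))
```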
### Buffer Size
`buffer_size` corresponds to how many experiences (agent observations, actions
and rewards obtained) should be collected before we do any learning or updating
of the model. **This should be a multiple of `batch_size`**. Typically a larger
`buffer_size` corresponds to more stable training updates.
### Batch Size
`batch_size` is the number of experiences used for one iteration of a gradient
descent update. **This should always be a fraction of the `buffer_size`**. If
you are using a continuous action space, this value should be large (in the
order of 1000s). If you are using a discrete action space, this value should be
smaller (in order of 10s).
### Number of Epochs
`num_epoch` is the number of passes through the experience buffer during
gradient descent. The larger the `batch_size`, the larger it is acceptable to
make this. Decreasing this will ensure more stable updates, at the cost of
slower learning.
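The three settings above interact roughly as sketched below (illustrative
numbers; the real trainer shuffles actual experiences rather than integers):

```python
import random

buffer_size, batch_size, num_epoch = 2048, 512, 3
buffer = list(range(buffer_size))        # stand-in for collected experiences

gradient_steps = 0
for _ in range(num_epoch):               # num_epoch passes over the buffer
    random.shuffle(buffer)
    for start in range(0, buffer_size, batch_size):
        minibatch = buffer[start:start + batch_size]
        gradient_steps += 1              # one gradient-descent step per batch

print("gradient steps per policy update:", gradient_steps)  # 12
```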
### Learning Rate
`learning_rate` corresponds to the strength of each gradient descent update
step. This should typically be decreased if training is unstable, and the reward
does not consistently increase.
### Time Horizon
`time_horizon` corresponds to how many steps of experience to collect per-agent
before adding it to the experience buffer. When this limit is reached before the
end of an episode, a value estimate is used to predict the overall expected
reward from the agent's current state. As such, this parameter trades off
between a less biased, but higher variance estimate (long time horizon) and more
biased, but less varied estimate (short time horizon). In cases where there are
frequent rewards within an episode, or episodes are prohibitively large, a
smaller number can be more ideal. This number should be large enough to capture
all the important behavior within a sequence of an agent's actions.
### Max Steps
`max_steps` corresponds to how many steps of the simulation (multiplied by
frame-skip) are run during the training process. This value should be increased
for more complex problems.
### Beta
`beta` corresponds to the strength of the entropy regularization, which makes
the policy "more random." This ensures that agents properly explore the action
space during training. Increasing this will ensure more random actions are
taken. This should be adjusted such that the entropy (measurable from
TensorBoard) slowly decreases alongside increases in reward. If entropy drops
too quickly, increase `beta`. If entropy drops too slowly, decrease `beta`.
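Roughly speaking, for a discrete action space the entropy being tracked is the
Shannon entropy of the policy's action distribution; a quick illustration:

```python
import math

def entropy(probs):
    # Shannon entropy of a discrete probability distribution.
    return -sum(p * math.log(p) for p in probs if p > 0)

print(entropy([0.25, 0.25, 0.25, 0.25]))  # ~1.39, near-uniform: lots of exploration
print(entropy([0.97, 0.01, 0.01, 0.01]))  # ~0.17, nearly deterministic policy
```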
### Epsilon
`epsilon` corresponds to the acceptable threshold of divergence between the old
and new policies during gradient descent updating. Setting this value small will
result in more stable updates, but will also slow the training process.
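In PPO this threshold is the clipping range of the surrogate objective; a small
sketch of the effect (illustrative, not the toolkit's code):

```python
# The probability ratio between the new and old policy is clipped to
# [1 - epsilon, 1 + epsilon] before being multiplied by the advantage.
def clipped_objective(ratio, advantage, epsilon=0.2):
    clipped_ratio = max(min(ratio, 1 + epsilon), 1 - epsilon)
    return min(ratio * advantage, clipped_ratio * advantage)

print(clipped_objective(1.5, advantage=1.0))   # 1.2: large policy moves are capped
print(clipped_objective(0.5, advantage=-1.0))  # -0.8: the pessimistic value is used
```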
### Normalize
`normalize` corresponds to whether normalization is applied to the vector
observation inputs. This normalization is based on the running average and
variance of the vector observation. Normalization can be helpful in cases with
complex continuous control problems, but may be harmful with simpler discrete
control problems.
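A rough sketch of normalization based on a running mean and variance
(illustrative only, not the toolkit's implementation):

```python
class RunningNormalizer:
    """Normalize observations with a running mean and variance (Welford)."""

    def __init__(self, size):
        self.n = 0
        self.mean = [0.0] * size
        self.m2 = [0.0] * size           # running sum of squared deviations

    def update(self, obs):
        self.n += 1
        for i, x in enumerate(obs):
            delta = x - self.mean[i]
            self.mean[i] += delta / self.n
            self.m2[i] += delta * (x - self.mean[i])

    def normalize(self, obs):
        var = [m / max(self.n - 1, 1) for m in self.m2]
        return [(x - mu) / (v ** 0.5 + 1e-8)
                for x, mu, v in zip(obs, self.mean, var)]


norm = RunningNormalizer(2)
for observation in ([1.0, 100.0], [2.0, 110.0], [3.0, 90.0]):
    norm.update(observation)
print(norm.normalize([2.0, 100.0]))  # both components map close to 0.0
```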
### Number of Layers
`num_layers` corresponds to how many hidden layers are present after the
observation input, or after the CNN encoding of the visual observation. For
simple problems, fewer layers are likely to train faster and more efficiently.
More layers may be necessary for more complex control problems.
### Hidden Units
`hidden_units` correspond to how many units are in each fully connected layer of
the neural network. For simple problems where the correct action is a
straightforward combination of the observation inputs, this should be small. For
problems where the action is a very complex interaction between the observation
variables, this should be larger.
## (Optional) Recurrent Neural Network Hyperparameters
### Sequence Length
`sequence_length` corresponds to the length of the sequences of experience
passed through the network during training. This should be long enough to
capture whatever information your agent might need to remember over time. For
example, if your agent needs to remember the velocity of objects, then this can
be a small value. If your agent needs to remember a piece of information given
only once at the beginning of an episode, then this should be a larger value.
### Memory Size
`memory_size` corresponds to the size of the array of floating point numbers
used to store the hidden state of the recurrent neural network. This value must
be a multiple of 4, and should scale with the amount of information you expect
the agent will need to remember in order to successfully complete the task.
## (Optional) Intrinsic Curiosity Module Hyperparameters
### Curiosity Encoding Size
`curiosity_enc_size` corresponds to the size of the hidden layer used to encode
the observations within the intrinsic curiosity module. This value should be
small enough to encourage the curiosity module to compress the original
observation, but also not too small to prevent it from learning the dynamics of
the environment.
### Curiosity Strength
`curiosity_strength` corresponds to the magnitude of the intrinsic reward
generated by the intrinsic curiosity module. This should be scaled in order to
ensure it is large enough to not be overwhelmed by extrinsic reward signals in
the environment. Likewise it should not be too large to overwhelm the extrinsic
reward signal.
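Conceptually, the reward the policy is trained on combines both signals, with
`curiosity_strength` as the scale on the intrinsic part; a toy illustration
with made-up numbers:

```python
# Illustrative only: how curiosity_strength trades off the two reward signals.
curiosity_strength = 0.01
extrinsic_reward = 1.0     # from the environment's reward function
intrinsic_reward = 5.0     # e.g. the curiosity module's prediction error

total_reward = extrinsic_reward + curiosity_strength * intrinsic_reward
print(total_reward)  # 1.05
```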
## Training Statistics

To view training statistics, use TensorBoard. For information on launching and
using TensorBoard, see
[here](./Getting-Started-with-Balance-Ball.md#observing-training-progress).
### Cumulative Reward
The general trend in reward should consistently increase over time. Small ups
and downs are to be expected. Depending on the complexity of the task, a
significant increase in reward may not present itself until millions of steps
into the training process.
### Entropy
This corresponds to how random the decisions of a Brain are. This should
consistently decrease during training. If it decreases too soon or not at all,
`beta` should be adjusted (when using discrete action space).
### Learning Rate

This will decrease over time on a linear schedule.
### Policy Loss
These values will oscillate during training. Generally they should be less than
1.0.
### Value Estimate
These values should increase as the cumulative reward increases. They correspond
to how much future reward the agent predicts itself receiving at any given
point.
### Value Loss
These values will increase as the reward increases, and then should decrease
once reward becomes stable.

118      docs/Training-on-Amazon-Web-Service.md

# Training on Amazon Web Service
This page contains instructions for setting up an EC2 instance on Amazon Web
Service for training ML-Agents environments.
We've prepared a preconfigured AMI for you with the ID `ami-18642967` in the
`us-east-1` region. It was created as a modification of [Deep Learning AMI
(Ubuntu)](https://aws.amazon.com/marketplace/pp/B077GCH38C). If you want to do
training without the headless mode, you need to enable X Server on it. After
launching your EC2 instance using the AMI and ssh-ing into it, run the following
commands to enable it:
```console
$ sudo /usr/bin/X :0 &
$ nvidia-smi

# Make the ubuntu use X Server for display
$ export DISPLAY=:0
```
You could also choose to configure your own instance. To begin with, you will
need an EC2 instance which contains the latest Nvidia drivers, CUDA9, and cuDNN.
In this tutorial we used the
[Deep Learning AMI (Ubuntu)](https://aws.amazon.com/marketplace/pp/B077GCH38C)
listed under AWS Marketplace with a p2.xlarge instance.
### Installing the ML-Agents toolkit on the instance

2. Clone the ML-Agents repo and install the required Python packages

   ```sh
   git clone https://github.com/Unity-Technologies/ml-agents.git
   cd ml-agents/ml-agents/
   pip3 install -e .
   ```
X Server setup is only necessary if you want to do training that requires visual
observation input. _Instructions here are adapted from this
[Medium post](https://medium.com/towards-data-science/how-to-run-unity-on-amazon-cloud-or-without-monitor-3c10ce022639)
on running general Unity applications in the cloud._
Current limitations of the Unity Engine require that a screen be available to
render to when using visual observations. In order to make this possible when
training on a remote server, a virtual screen is required. We can do this by
installing Xorg and creating a virtual screen. Once installed and created, we
can display the Unity environment in the virtual environment, and train as we
would on a local machine. Ensure that `headless` mode is disabled when building
linux executables which use visual observations.
```console
$ sudo apt-get update
$ sudo apt-get install -y xserver-xorg mesa-utils
$ sudo nvidia-xconfig -a --use-display-device=None --virtual=1280x1024

$ nvidia-xconfig --query-gpu-info

$ sudo sed -i 's/ BoardName "Tesla K80"/ BoardName "Tesla K80"\n BusID "0:30:0"/g' /etc/X11/xorg.conf

# Edit /etc/X11/xorg.conf and remove the two lines that contain Section "Files" and EndSection
$ sudo vim /etc/X11/xorg.conf
```
```console
$ wget http://download.nvidia.com/XFree86/Linux-x86_64/390.67/NVIDIA-Linux-x86_64-390.67.run
$ sudo /bin/bash ./NVIDIA-Linux-x86_64-390.67.run --accept-license --no-questions --ui=none

$ sudo echo 'blacklist nouveau' | sudo tee -a /etc/modprobe.d/blacklist.conf
$ sudo echo 'options nouveau modeset=0' | sudo tee -a /etc/modprobe.d/blacklist.conf
$ sudo echo options nouveau modeset=0 | sudo tee -a /etc/modprobe.d/nouveau-kms.conf
$ sudo update-initramfs -u
```
3. Restart the EC2 instance:
4. Make sure there are no Xorg processes running:

```console
$ sudo killall Xorg
$ nvidia-smi
/*
* Thu Jun 14 20:21:11 2018
* +-----------------------------------------------------------------------------+

*/
```
5. Start X Server and make Ubuntu use X Server for display:
```console
$ sudo /usr/bin/X :0 &
$ nvidia-smi
$ export DISPLAY=:0
```
6. Ensure that Xorg is correctly configured:
```console
// For more information on glxgears, see ftp://www.x.org/pub/X11R6.8.1/doc/glxgears.1.html.
$ glxgears
// If Xorg is configured correctly, you should see the following message:
/*
* Running synchronized to the vertical refresh. The framerate should be
* approximately the same as the monitor refresh rate.
*/
```

## Training on EC2 instance
1. In the Unity Editor, load a project containing an ML-Agents environment (you
can use one of the example environments if you have not created your own).
6. Upload the executable to your EC2 instance within the `ml-agents` folder.
from unityagents import UnityEnvironment
from mlagents.envs import UnityEnvironment
```console
chmod +x <your_env>.x86_64
mlagents-learn <trainer-config-file> --env=<your_env> --train
```

(Previously you would `cd` into the `ml-agents/python` folder and run
`python learn.py <your_env> --train`.)
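For reference, here is a minimal end-to-end sketch of getting a build onto the
instance and starting a run. The `ubuntu` user name, the `3DBall` build name,
the key file, and the `config/trainer_config.yaml` path are assumptions; adjust
them for your own setup:

```sh
# From your local machine: copy the Linux build into the ml-agents folder on the instance
scp -i <your-key>.pem -r 3DBall.x86_64 3DBall_Data ubuntu@<instance-ip>:~/ml-agents/

# On the instance: make the build executable and start training
ssh -i <your-key>.pem ubuntu@<instance-ip>
chmod +x ~/ml-agents/3DBall.x86_64
mlagents-learn config/trainer_config.yaml --env=3DBall --train --run-id=ec2-run-1
```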

112
docs/Training-on-Microsoft-Azure-Custom-Instance.md


This page contains instructions for setting up a custom Virtual Machine on Microsoft Azure so you can run ML-Agents training in the cloud.
1. Start by
[deploying an Azure VM](https://docs.microsoft.com/azure/virtual-machines/linux/quick-create-portal)
with Ubuntu Linux (tests were done with 16.04 LTS). To use GPU support, use
an N-Series VM.
2. SSH into your VM.
3. Start with the following commands to install the Nvidia driver:
```sh
wget http://us.download.nvidia.com/tesla/375.66/nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
sudo dpkg -i nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
sudo apt-get update
sudo apt-get install cuda-drivers
sudo reboot
```
4. After a minute you should be able to reconnect to your VM and install the
CUDA toolkit:
```sh
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
sudo apt-get update
sudo apt-get install cuda-8-0
```
5. You'll next need to download cuDNN from the Nvidia developer site. This
requires a registered account.
6. Navigate to [http://developer.nvidia.com](http://developer.nvidia.com),
create an account, and verify it.
7. Download (to your own computer) cuDNN from [this URL](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v6/prod/8.0_20170307/Ubuntu16_04_x64/libcudnn6_6.0.20-1+cuda8.0_amd64-deb).
8. Copy the deb package to your VM:

```sh
scp libcudnn6_6.0.21-1+cuda8.0_amd64.deb <VMUserName>@<VMIPAddress>:libcudnn6_6.0.21-1+cuda8.0_amd64.deb
```

9. SSH back to your VM and execute the following:

```console
sudo dpkg -i libcudnn6_6.0.21-1+cuda8.0_amd64.deb
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
. ~/.profile
sudo reboot
```

10. After a minute, you should be able to SSH back into your VM. After doing
so, run the following:

```sh
sudo apt install python-pip
sudo apt install python3-pip
```

11. At this point, you need to install TensorFlow. The version you install
depends on whether you are using the GPU to train:

```sh
pip3 install tensorflow-gpu==1.4.0 keras==2.0.6
```

Or the CPU to train:

```sh
pip3 install tensorflow==1.4.0 keras==2.0.6
```

12. You'll then need to install additional dependencies:

```sh
pip3 install pillow
pip3 install numpy
pip3 install docopt
```

13. You can now return to the
[main Azure instruction page](Training-on-Microsoft-Azure.md); a quick GPU
sanity check is sketched after this list.
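Before returning to the main guide, it can be worth sanity-checking the GPU
setup. A quick check (a sketch; assumes an N-Series GPU VM and the TensorFlow
version installed above):

```sh
# Confirm the driver sees the GPU
nvidia-smi

# Confirm TensorFlow can enumerate the GPU (prints only CPU devices on non-GPU VMs)
python3 -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())"
```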

107
docs/Training-on-Microsoft-Azure.md


# Training on Microsoft Azure (works with ML-Agents toolkit v0.3)
This page contains instructions for setting up training on Microsoft Azure
through either
[Azure Container Instances](https://azure.microsoft.com/services/container-instances/)
or Virtual Machines. Non-"headless" training has not yet been tested to verify
support.
A pre-configured virtual machine image is available in the Azure Marketplace and
is nearly completely ready for training. You can start by deploying the
[Data Science Virtual Machine for Linux (Ubuntu)](https://azuremarketplace.microsoft.com/marketplace/apps/microsoft-ads.linux-data-science-vm-ubuntu)
into your Azure subscription. Once your VM is deployed, SSH into it and run the
following command to complete dependency installation:

```sh
pip3 install docopt
```
Note that, if you choose to deploy the image to an
[N-Series GPU optimized VM](https://docs.microsoft.com/azure/virtual-machines/linux/sizes-gpu),
training will, by default, run on the GPU. If you choose any other type of VM,
training will run on the CPU.
Setting up your own instance requires a number of package installations. Please
view the documentation for doing so
[here](Training-on-Microsoft-Azure-Custom-Instance.md).
1. [Move](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/copy-files-to-linux-vm-using-scp)
the `ml-agents` sub-folder of this ml-agents repo to the remote Azure
instance, and set it as the working directory (see the sketch after this list).
2. Install the required packages with `pip3 install .`.
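For the copy step, `scp` works well; a minimal sketch (the user name, IP
address, and destination path are placeholders):

```sh
# Copy the ml-agents sub-folder to the VM, then install it from the working directory
scp -r ml-agents <username>@<vm-ip-address>:~/ml-agents
ssh <username>@<vm-ip-address>
cd ~/ml-agents && pip3 install .
```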
## Testing

1. In the Unity Editor, load a project containing an ML-Agents environment (you
can use one of the example environments if you have not created your own).
2. Open the Build Settings window (menu: File > Build Settings).
3. Select Linux as the Target Platform, and x86_64 as the target architecture.
4. Check Headless Mode.

```python
from mlagents.envs import UnityEnvironment  # previously: from unityagents import UnityEnvironment

# Load the headless build you uploaded (path relative to the working directory)
env = UnityEnvironment(file_name="<your_app>")
```
You should receive a message confirming that the environment was loaded
successfully.
1. [Move](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/copy-files-to-linux-vm-using-scp)
your built Unity application to your Virtual Machine.
2. Set the directory where the ML-Agents Toolkit was installed as your working
directory.
3. Run the following command:
```sh
mlagents-learn <trainer_config> --env=<your_app> --run-id=<run_id> --train
```

(This replaces the older `python3 learn.py <your_app> --run-id=<run_id> --train`
invocation.)
Where `<your_app>` is the path to your app (e.g.
`~/unity-volume/3DBallHeadless`) and `<run_id>` is an identifier you would like
to use to identify your training run.
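Filled in with the example app above, the call might look like this (the
trainer-config path and run-id are placeholders for your own files and naming):

```sh
mlagents-learn ~/ml-agents/config/trainer_config.yaml \
    --env=~/unity-volume/3DBallHeadless \
    --run-id=azure-run-1 --train
```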
If you've selected to run on an N-Series VM with GPU support, you can verify that
the GPU is being used by running `nvidia-smi` from the command line.
Once you have started training, you can
[use TensorBoard to observe the training](Using-Tensorboard.md).
1. Start by [opening the appropriate port for web traffic to connect to your VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal).
   If you prefer the command line, see the CLI sketch after this list.
   * Note that you don't need to generate a new `Network Security Group` but
     instead, go to the **Networking** tab under **Settings** for your VM.
   * As an example, you could use the following settings to open the port with
     the following Inbound Rule settings:
     * Source: Any
     * Source Port Ranges: *
     * Destination: Any
     * Destination Port Ranges: 6006
     * Protocol: Any
     * Action: Allow
     * Priority: (Leave as default)
2. Unless you started the training as a background process, connect to your VM
   from another terminal instance.
3. Run the following command from your terminal:
   `tensorboard --logdir=summaries --host 0.0.0.0`
4. You should now be able to open a browser and navigate to
   `<Your_VM_IP_Address>:6006` to view the TensorBoard report.
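If you would rather open the port from the Azure CLI than the portal, a
one-line sketch (resource group and VM names are placeholders; assumes the `az`
CLI is installed and logged in):

```sh
# Open TCP port 6006 so TensorBoard is reachable from your browser
az vm open-port --resource-group <your-resource-group> --name <your-vm-name> --port 6006
```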
[Azure Container Instances](https://azure.microsoft.com/services/container-instances/)
allow you to spin up a container, on demand, that will run your training and
then be shut down. This ensures you aren't leaving a billable VM running when
it isn't needed. You can read more about
[the ML-Agents toolkit support for Docker containers here](Using-Docker.md).
Using ACI enables you to offload training of your models without needing to
install Python and TensorFlow on your own computer. You can find instructions,
including a pre-deployed image in DockerHub for you to use, available
[here](https://github.com/druttka/unity-ml-on-azure).

117
docs/Using-Docker.md


# Using Docker For ML-Agents
We currently offer a solution for Windows and Mac users who would like to do
training or inference using Docker. This option may be appealing to those who
would like to avoid installing Python and TensorFlow themselves. The current
setup forces both TensorFlow and Unity to _only_ rely on the CPU for
computations. Consequently, our Docker simulation does not use a GPU and uses
[`Xvfb`](https://en.wikipedia.org/wiki/Xvfb) to do visual rendering. `Xvfb` is a
utility that enables `ML-Agents` (or any other application) to do rendering
virtually, i.e. it does not assume that the machine running `ML-Agents` has a GPU
or a display attached to it. This means that rich environments which involve
agents using camera-based visual observations might be slower.
## Requirements
- [Download](https://unity3d.com/get-unity/download) the Unity Installer and add
the _Linux Build Support_ Component
- [Download](https://www.docker.com/community-edition#/download) and install
Docker if you don't have it set up on your machine.
- Since Docker runs a container in an environment that is isolated from the host
machine, a mounted directory in your host machine is used to share data, e.g.
the trainer configuration file, Unity executable, curriculum files and
TensorFlow graph. For convenience, we created an empty `unity-volume`
directory at the root of the repository for this purpose, but feel free to use
any other directory. The remainder of this guide assumes that the
`unity-volume` directory is the one used.
Using Docker for ML-Agents involves three steps: building the Unity environment
with specific flags, building a Docker container and, finally, running the
container. If you are not familiar with building a Unity environment for
ML-Agents, please read through our [Getting Started with the 3D Balance Ball
Example](Getting-Started-with-Balance-Ball.md) guide first.
Since Docker typically runs a container sharing a (Linux) kernel with the host
machine, the Unity environment **has** to be built for the **Linux platform**.
When building a Unity environment, please select the following options from the
Build Settings window:
- If the environment does not contain visual observations, you can select the
`headless` option here.
Then click `Build`, pick an environment name (e.g. `3DBall`) and set the output
directory to `unity-volume`. After building, ensure that the file
`<environment-name>.x86_64` and subdirectory `<environment-name>_Data/` are
created under `unity-volume`.
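A quick way to confirm the build landed in the right place (using the example
`3DBall` name above):

```sh
ls unity-volume/
# Expect to see 3DBall.x86_64 and the 3DBall_Data/ directory
```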
First, make sure the Docker engine is running on your machine. Then build the
Docker container by calling the following command at the top-level of the
repository:
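For reference, the build step has the form below (a sketch, assuming the
`Dockerfile` that ships at the repository root):

```sh
docker build -t <image-name> .
```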
Replace `<image-name>` with a name for the Docker image, e.g.
`balance.ball.v0.1`.

**Note**: if you modify the hyperparameters in `trainer_config.yaml`, you will
have to build a new Docker container before running.
Run the Docker container by calling the following command at the top-level of
the repository:
```sh
<image-name>:latest \
    <trainer-config-file> \
    --env=<environment-name> \
```
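Pieced together with the mount and the arguments described below, the full
invocation looks roughly like this (a sketch: the `-p 5005:5005` port mapping,
the mount syntax, and the exact flag spellings are assumptions based on the
argument descriptions in this guide):

```sh
docker run --name <container-name> \
    --mount type=bind,source="$(pwd)"/unity-volume,target=/unity-volume \
    -p 5005:5005 \
    <image-name>:latest \
    <trainer-config-file> \
    --docker-target-name=unity-volume \
    --env=<environment-name> \
    --train \
    --run-id=<run-id>
```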
- `<container-name>` is used to identify the container (in case you want to
interrupt and terminate it). This is optional and Docker will generate a
random name if this is not set. _Note that this must be unique for every run
of a Docker image._
- `<environment-name>` __(Optional)__: If you are training with a linux
executable, this is the name of the executable. If you are training in the
Editor, do not pass a `<environment-name>` argument and press the
:arrow_forward: button in Unity when the message _"Start training by pressing
the Play button in the Unity Editor"_ is displayed on the screen.
- `source`: Reference to the path in your host OS where you will store the Unity
executable.
- `target`: Tells Docker to mount the `source` path as a disk with this name.
- `docker-target-name`: Tells the ML-Agents Python package what the name of the
disk where it can read the Unity executable and store the graph. **This should
therefore be identical to `target`.**
- `trainer-config-file`, `train`, `run-id`: ML-Agents arguments passed to
`mlagents-learn`. `trainer-config-file` is the filename of the trainer config
file, `train` trains the algorithm, and `run-id` is used to tag each
experiment with a unique identifier. We recommend placing the trainer-config
file inside `unity-volume` so that the container has access to the file.
For example, with the `3DBall` build and the default `trainer_config.yaml`,
these arguments become:

```sh
trainer_config.yaml \
    --env=3DBall
```
For more detail on Docker mounts, check out
[these](https://docs.docker.com/storage/bind-mounts/) docs from Docker.
If you are satisfied with the training progress, you can stop the Docker
container while saving state by either using `Ctrl+C` or `⌘+C` (Mac) or by using
the following command:
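For reference, a sketch of the stop command (assuming the SIGINT-based graceful
shutdown described above, so that state is saved):

```sh
docker kill --signal=SIGINT <container-name>
```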
`<container-name>` is the name of the container specified in the earlier `docker
run` command. If you didn't specify one, you can find the randomly generated
identifier by running `docker container ls`.

179
docs/Using-TensorFlow-Sharp-in-Unity.md


# Using TensorFlowSharp in Unity (Experimental)
The ML-Agents toolkit allows you to use pre-trained
[TensorFlow graphs](https://www.tensorflow.org/programmers_guide/graphs)
inside your Unity games. This support is possible thanks to the
[TensorFlowSharp project](https://github.com/migueldeicaza/TensorFlowSharp).
The primary purpose for this support is to use the TensorFlow models produced by
the ML-Agents toolkit's own training programs, but a side benefit is that you
can use any TensorFlow model.
_Notice: This feature is still experimental. While it is possible to embed
trained models into Unity games, Unity Technologies does not officially support
this use-case for production games at this time. As such, no guarantees are
provided regarding the quality of experience. If you encounter issues regarding
battery life, or general performance (especially on mobile), please let us
know._
## Supported devices

* Linux 64 bits
* Mac OS X 64 bits
* Windows 64 bits
* iOS (Requires additional steps)
* Android
* Unity 2017.4 or above
* Unity TensorFlow Plugin ([Download here](https://s3.amazonaws.com/unity-ml-agents/0.5/TFSharpPlugin.unitypackage))
## Using TensorFlowSharp with ML-Agents
Go to `Edit` -> `Player Settings` and add `ENABLE_TENSORFLOW` to the `Scripting
Define Symbols` for each type of device you want to use (**`PC, Mac and Linux
Standalone`**, **`iOS`** or **`Android`**).
Set the Brain you used for training to `Internal`. Drag `your_name_graph.bytes`
into Unity and then drag it into the `Graph Model` field in the Brain.
The TensorFlow data graphs produced by the ML-Agents training programs work
without any additional settings.
In order to use a TensorFlow data graph in Unity, make sure the nodes of your
graph have appropriate names. You can assign names to nodes in TensorFlow:
```python
variable = tf.identity(variable, name="variable_name")
```

* Name the batch size input placeholder `batch_size`
* Name the input vector observation placeholder `state`
* Name the output node `action`
* Name the recurrent vector (memory) input placeholder `recurrent_in` (if any)
* Name the recurrent vector (memory) output node `recurrent_out` (if any)
* Name the visual observation input placeholders `visual_observation_i`,
where `i` is the index of the observation (starting at 0)
You can have additional placeholders for floats or integers but they must be
placed in placeholders of dimension 1 and size 1. (Be sure to name them.)
It is important that the inputs and outputs of the graph are exactly the ones
you receive and return when training your model with an `External` Brain. This
means you cannot have any operations such as reshaping outside of the graph. The
object you get by calling `step` or `reset` has fields `vector_observations`,
`visual_observations` and `memories` which must correspond to the placeholders
of your graph. Similarly, the arguments `action` and `memory` you pass to `step`
must correspond to the output nodes of your graph.
While training your Agent using the Python API, you can save your graph at any
point of the training. Note that the argument `output_node_names` must be the
name of the tensor your graph outputs (separated by a comma if using multiple
outputs). In this case, it will be either `action` or `action,recurrent_out` if
you have recurrent outputs.
```python
from tensorflow.python.tools import freeze_graph
# Example call; model_path and last_checkpoint are placeholders for your own run.
freeze_graph.freeze_graph(
    input_graph=model_path + '/raw_graph_def.pb', input_binary=True,
    input_checkpoint=last_checkpoint, output_node_names="action",
    output_graph=model_path + '/your_name_graph.bytes', clear_devices=True,
    initializer_nodes="", input_saver="",
    restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")
```
Your model will be saved with the name `your_name_graph.bytes` and will contain
both the graph and associated weights. Note that you must save your graph as a
.bytes file so Unity can load it.
In the Unity Editor, you must specify the names of the nodes used by your graph
in the **Internal** Brain Inspector window. If you used a scope when defining
your graph, specify it in the `Graph Scope` field.
See
[Internal Brain](Learning-Environment-Design-External-Internal-Brains.md#internal-brain)
for more information about using Internal Brains.
If you followed these instructions correctly, the Agents in your environment that
use this Brain will use your fully trained network to make decisions.
## iOS additional instructions for building
* Before building your game for the iOS platform, make sure you've set the
`ENABLE_TENSORFLOW` flag for it.
* Once you build the project for iOS in the editor, open the .xcodeproj file
within the project folder using Xcode.
* Set up your iOS account following the
[iOS Account setup page](https://docs.unity3d.com/Manual/iphone-accountsetup.html).
* Drag the library `libtensorflow-core.a` from the **Project Navigator** on
the left under `Libraries/ML-Agents/Plugins/iOS` into the flag list, after
`-force_load`.
## Using TensorFlowSharp without ML-Agents
Beyond controlling an in-game agent, you can also use TensorFlowSharp for more
general computation. The following instructions describe how to generally embed
TensorFlow models without using the ML-Agents framework.
You must have a TensorFlow graph, such as `your_name_graph.bytes`, made using
TensorFlow's `freeze_graph.py`. The process to create such a graph is explained
in the [Using your own trained graphs](#using-your-own-trained-graphs) section.
### Inside of Unity
To load and use a TensorFlow data graph in Unity:

```csharp
using TensorFlow;
```
3. If you will be building for Android, you must add this block at the start of
your code:
```csharp
#if UNITY_ANDROID && !UNITY_EDITOR
TensorFlowSharp.Android.NativeBinding.Init();
#endif
```
```csharp
TextAsset graphModel = Resources.Load ("your_name_graph") as TextAsset;
```
```csharp
graph = new TFGraph ();
graph.Import (graphModel.bytes);
session = new TFSession (graph);
```
6. Assign the input tensors for the graph. For example, the following code
assigns a one-dimensional input tensor of size 2:
```csharp
var runner = session.GetRunner ();
runner.AddInput (graph ["input_placeholder_name"] [0], new float[]{ placeholder_value1, placeholder_value2 });
```
You must provide all required inputs to the graph. Supply one input per
TensorFlow placeholder.
```csharp
runner.Fetch (graph["output_placeholder_name"][0]);
float[,] recurrent_tensor = runner.Run () [0].GetValue () as float[,];
```
Note that this example assumes the output array is a two-dimensional tensor of
floats. Cast to a long array if your outputs are integers.

79
docs/Using-Tensorboard.md


# Using TensorBoard to Observe Training
The ML-Agents toolkit saves statistics during learning sessions that you can view
with a TensorFlow utility named
[TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard).
The `mlagents-learn` command saves training statistics to a folder named
`summaries`, organized by the `run-id` value you assign to a training session.
In order to observe the training process, either during training or afterward,
2. Navigate to the directory where the ML-Agents Toolkit is installed.
```sh
tensorboard --logdir=summaries
```
**Note:** If you don't assign a `run-id` identifier, `mlagents-learn` uses the
default string, "ppo". All the statistics will be saved to the same sub-folder
and displayed as one session in TensorBoard. After a few runs, the displays can
become difficult to interpret in this situation. You can delete the folders
under the `summaries` directory to clear out old statistics.
On the left side of the TensorBoard window, you can select which of the training
runs you want to display. You can select multiple run-ids to compare statistics.
The TensorBoard window also provides options for how to display and smooth
graphs.

When you run the training program, `mlagents-learn`, you can use the
`--save-freq` option to specify how frequently to save the statistics.
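For example, a run that saves statistics every 50,000 steps and is then
inspected with TensorBoard might look like this (the trainer-config path,
environment name, and run-id are placeholders):

```sh
# Train with an explicit run-id and a custom statistics save frequency
mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run --save-freq=50000 --train

# In a second terminal, point TensorBoard at the summaries folder
tensorboard --logdir=summaries
```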
The ML-Agents training program saves the following statistics:

* Lesson - Plots the progress from lesson to lesson. Only interesting when
performing [curriculum training](Training-Curriculum-Learning.md).
* Cumulative Reward - The mean cumulative episode reward over all agents. Should
increase during a successful training session.
* Entropy - How random the decisions of the model are. Should slowly decrease
during a successful training process. If it decreases too quickly, the `beta`
hyperparameter should be increased.
* Episode Length - The mean length of each episode in the environment for all
agents.
* Learning Rate - How large a step the training algorithm takes as it searches
for the optimal policy. Should decrease over time.
* Policy Loss - The mean magnitude of the policy loss function. Correlates to how
much the policy (process for deciding actions) is changing. The magnitude of
this should decrease during a successful training session.
* Value Estimate - The mean value estimate for all states visited by the agent.
Should increase during a successful training session.
* Value Loss - The mean loss of the value function update. Correlates to how
well the model is able to predict the value of each state. This should
increase while the agent is learning, and then decrease once the reward
stabilizes.
* _(Curiosity-Specific)_ Intrinsic Reward - This corresponds to the mean
cumulative intrinsic reward generated per-episode.
* _(Curiosity-Specific)_ Forward Loss - The mean magnitude of the forward model
loss function. Corresponds to how well the model is able to predict the new
observation encoding.
* _(Curiosity-Specific)_ Inverse Loss - The mean magnitude of the inverse model
loss function. Corresponds to how well the model is able to predict the action
taken between two observations.

8
docs/dox-ml-agents.conf


# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
INPUT = ../unity-environment/Assets/ML-Agents/Scripts/Academy.cs \
../unity-environment/Assets/ML-Agents/Scripts/Agent.cs \
../unity-environment/Assets/ML-Agents/Scripts/Monitor.cs \
../unity-environment/Assets/ML-Agents/Scripts/Decision.cs
INPUT = ../UnitySDK/Assets/ML-Agents/Scripts/Academy.cs \
../UnitySDK/Assets/ML-Agents/Scripts/Agent.cs \
../UnitySDK/Assets/ML-Agents/Scripts/Monitor.cs \
../UnitySDK/Assets/ML-Agents/Scripts/Decision.cs
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

611
docs/images/banner.png

Before / After (Width: 1702 | Height: 666 | Size: 167 KiB)

129
docs/images/player_brain.png

Before / After (Width: 486 | Height: 394 | Size: 44 KiB)

79
docs/images/scene-hierarchy.png

Before / After (Width: 254 | Height: 270 | Size: 21 KiB)

309
docs/images/unity-logo-rgb.png

Before / After (Width: 2412 | Height: 899 | Size: 87 KiB)

5
docs/localized/zh-CN/README.md


[贡献准则](/CONTRIBUTING.md)和
[行为准则](/CODE_OF_CONDUCT.md)。
您可以通过 Unity Connect 和 GitHub 与我们
以及更广泛的社区进行交流:
您可以通过 Unity Connect 和 GitHub 与我们以及更广泛的社区进行交流:
* 加入我们的
[Unity 机器学习频道](https://connect.unity.com/messages/c/035fba4f88400000)
与使用 ML-Agents 的其他人以及对机器学习充满热情的 Unity 开发者

确保提供尽可能多的详细信息。
对于任何其他问题或反馈,请直接与 ML-Agents 团队联系,
地址为 ml-agents@unity3d.com。
电子邮件地址为 ml-agents@unity3d.com。
## 许可证

26
docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md


在打开 3D Balance Ball 场景后,您可能会首先注意到它包含的
不是一个平台,而是多个平台。场景中的每个平台都是
独立的 agent,但它们全部共享同一个 brain。3D Balance Ball 通过
独立的 agent,但它们全部共享同一个 Brain。3D Balance Ball 通过
这种方式可以加快训练速度,因为所有 12 个 agent 可以并行参与训练任务。
### Academy

Brain 不存储关于 agent 的任何信息,
只是将 agent 收集的观测结果发送到决策过程,
然后将所选的动作返回给 agent。因此,所有 agent 可共享
同一个 brain,但会独立行动。Brain 设置可以提供很多
同一个 Brain,但会独立行动。Brain 设置可以提供很多
**Heuristic** brain 允许您通过扩展 Decision 类来对 agent 的逻辑进行
手动编码。最后,**Player** brain 可让您将键盘命令
**Heuristic** Brain 允许您通过扩展 Decision 类来对 agent 的逻辑进行
手动编码。最后,**Player** Brain 可让您将键盘命令
会非常有用。如果这些类型的 brain 都不能满足您的需求,您可以
会非常有用。如果这些类型的 Brain 都不能满足您的需求,您可以
实现自己的 CoreBrain 来创建自有的类型。
在本教程中,进行训练时,需要将 **Brain Type** 设置为 **External**

**向量运动空间**
brain 以*动作*的形式向 agent 提供指令。与状态
Brain 以*动作*的形式向 agent 提供指令。与状态
`RigidBody` 上的力或扭矩。**Discrete** 向量运动空间将其动作
`Rigidbody` 上的力或扭矩。**Discrete** 向量运动空间将其动作
定义为一个表。提供给 agent 的具体动作是这个表的
索引。

平台游戏对象上。基础 Agent 对象有一些影响其行为的
属性:
* **Brain** — 每个 Agent 必须有一个 Brain。brain 决定了 agent 如何
* **Brain** — 每个 Agent 必须有一个 Brain。Brain 决定了 agent 如何
brain。
Brain。
* **Visual Observations** — 定义 agent 用来观测其环境的
任何 Camera 对象。3D Balance Ball 不使用摄像机观测。
* **Max Step** — 定义在 agent 决定自己完成之前可以发生多少个

agent 的 Brain 实例设置为状态大小为 8 的连续向量观测空间,
因此 `CollectObservations()` 必须调用 8 次
`AddVectorObs`
* Agent.AgentAction() — 在每个模拟步骤调用。接收 brain 选择的
* Agent.AgentAction() — 在每个模拟步骤调用。接收 Brain 选择的
动作。Ball3DAgent 示例可以处理连续和离散
运动空间类型。在此环境中,两种状态类型之间实际上
没有太大的差别:这两种向量运动空间在每一步都会

![3DBall 场景](images/mlagents-Open3DBall.png)
由于我们要建立此环境来进行训练,因此我们需要
将 agent 使用的 brain 设置为 **External**。这样 agent 在
将 agent 使用的 Brain 设置为 **External**。这样 agent 在
进行决策时能够与外部训练过程进行通信。
1. 在 **Scene** 窗口中,单击 Ball3DAcademy 对象旁边的三角形

一旦训练过程完成,并且训练过程保存了模型
(通过 `Saved Model` 消息可看出),您便可以将该模型添加到 Unity 项目中,
然后将其用于 brain 类型为 **Internal** 的 agent。
然后将其用于 Brain 类型为 **Internal** 的 agent。
### 设置 TensorFlowSharp 支持

1. 确保 TensorFlowSharp 插件位于 `Assets` 文件夹中。
可在
[此处](https://s3.amazonaws.com/unity-ml-agents/0.3/TFSharpPlugin.unitypackage)下载一个包含 TF# 的 Plugins 文件夹。
[此处](https://s3.amazonaws.com/unity-ml-agents/0.5/TFSharpPlugin.unitypackage)下载一个包含 TF# 的 Plugins 文件夹。
下载后,双击并将其导入。您可以在 Project 选项卡中
(位于 `Assets` > `ML-Agents` > `Plugins` > `Computer` 下)
检查 TensorFlow 的相关文件来查看是否安装成功

4
docs/localized/zh-CN/docs/Installation.md


### Mac 和 Unix 用户
如果您的 Python 环境不包括 `pip`,请参阅这些
如果您的 Python 环境不包括 `pip3`,请参阅这些
[说明](https://packaging.python.org/guides/installing-using-linux-tools/#installing-pip-setuptools-wheel-with-linux-package-managers)
以了解其安装方法。

## Unity 包
您可以通过 Unity 包的形式下载TensorFlowSharp 插件([AWS S3链接](https://s3.amazonaws.com/unity-ml-agents/0.3/TFSharpPlugin.unitypackage),[百度盘链接](https://pan.baidu.com/s/1s0mJN8lvuxTcYbs2kL2FqA))
您可以通过 Unity 包的形式下载TensorFlowSharp 插件 ([AWS S3链接](https://s3.amazonaws.com/unity-ml-agents/0.5/TFSharpPlugin.unitypackage),[百度盘链接](https://pan.baidu.com/s/1s0mJN8lvuxTcYbs2kL2FqA))
## 帮助

4
docs/localized/zh-CN/docs/Learning-Environment-Create-New.md


**动作**
Brain 的决策以动作数组的形式传递给 `AgentAction()` 函数。此数组中的元素数量由 agent 的 Brain 的 `Vector Action Space Type``Vector Action Space Size` 设置确定。RollerAgent 使用连续向量运动空间,并需要 brain 提供的两个连续控制信号。因此,我们要将 Brain `Vector Action Size` 设置为 2。第一个元素 `action[0]` 确定沿 x 轴施加的力;`action[1]` 确定沿 z 轴施加的力。(如果我们允许 agent 以三维方式移动,那么我们需要将 `Vector Action Size` 设置为 3。)注意,Brain 并不知道动作数组中的值是什么意思。训练过程只是根据观测输入来调整动作值,然后看看会得到什么样的奖励。
Brain 的决策以动作数组的形式传递给 `AgentAction()` 函数。此数组中的元素数量由 agent 的 Brain 的 `Vector Action Space Type``Vector Action Space Size` 设置确定。RollerAgent 使用连续向量运动空间,并需要 Brain 提供的两个连续控制信号。因此,我们要将 Brain `Vector Action Size` 设置为 2。第一个元素 `action[0]` 确定沿 x 轴施加的力;`action[1]` 确定沿 z 轴施加的力。(如果我们允许 agent 以三维方式移动,那么我们需要将 `Vector Action Size` 设置为 3。)注意,Brain 并不知道动作数组中的值是什么意思。训练过程只是根据观测输入来调整动作值,然后看看会得到什么样的奖励。
RollerAgent 使用 `Rigidbody.AddForce` 函数将 action[] 数组中的值应用到其 Rigidbody 组件 `rBody`

1. 选择 Brain 游戏对象以便在 Inspector 中查看该对象的属性。
2. 将 **Brain Type** 设置为 **Player**
3. 展开 **Continuous Player Actions**(仅在使用 **Player* brain 时可见)。
3. 展开 **Continuous Player Actions**(仅在使用 **Player* Brain 时可见)。
4. 将 **Size** 设置为 4。
5. 设置以下映射:

14
docs/localized/zh-CN/docs/Learning-Environment-Design.md


训练和模拟过程以 ML-Agents Academy 类编排的步骤进行。Academy 与场景中的 Agent 和 Brain 对象一起协作逐步完成模拟。当 Academy 已达到其最大步数或场景中的所有 agent 均_完成_时,一个训练场景即完成。
在训练期间,处于外部的 Python 进程会在训练过程中与 Academy 不断进行通信以便运行一系列场景,同时会收集数据并优化其神经网络模型。分配给 agent 的 Brain 类型决定了我们是否进行训练。**External** brain 会与外部过程进行通信以训练 TensorFlow 模型。成功完成训练后,您可以将经过训练的模型文件添加到您的 Unity 项目中,以便提供给 **Internal** brain 来控制agent的行为。
在训练期间,处于外部的 Python 进程会在训练过程中与 Academy 不断进行通信以便运行一系列场景,同时会收集数据并优化其神经网络模型。分配给 agent 的 Brain 类型决定了我们是否进行训练。**External** Brain 会与外部过程进行通信以训练 TensorFlow 模型。成功完成训练后,您可以将经过训练的模型文件添加到您的 Unity 项目中,以便提供给 **Internal** Brain 来控制agent的行为。
ML-Agents Academy 类按如下方式编排 agent 模拟循环:

4. 使用每个 agent 的 Brain 类来决定 agent 的下一动作。
5. 调用您的子类的 `AcademyAct()` 函数。
6. 对场景中的每个 agent 调用 `AgentAction()` 函数,传入由 agent 的 brain 选择的动作。(如果 agent 已完成,则不调用此函数。)
6. 对场景中的每个 agent 调用 `AgentAction()` 函数,传入由 agent 的 Brain 选择的动作。(如果 agent 已完成,则不调用此函数。)
7. 如果 agent 已达到其 `Max Step` 计数或者已将其自身标记为 `done`,则调用 agent 的 `AgentOnDone()` 函数。或者,如果某个 agent 在场景结束之前已完成,您可以将其设置为重新开始。在这种情况下,Academy 会调用 `AgentReset()` 函数。
8. 当 Academy 达到其自身的 `Max Step` 计数时,它会通过调用您的 Academy 子类的 `AcademyReset()` 函数来再次开始下一场景。

[Screenshot of scene hierarchy]
您必须为每个 agent 分配一个 brain,但可以在多个 agent 之间共享 brain。每个 agent 都将进行自己的观测并独立行动,但会使用相同的决策逻辑,而对于 **Internal** brain,则会使用相同的经过训练的 TensorFlow 模型。
您必须为每个 agent 分配一个 Brain,但可以在多个 agent 之间共享 Brain。每个 agent 都将进行自己的观测并独立行动,但会使用相同的决策逻辑,而对于 **Internal** Brain,则会使用相同的经过训练的 TensorFlow 模型。
### Academy

Brain 内部封装了决策过程。Brain 对象必须放在 Hierarchy 视图中的 Academy 的子级。我们必须为每个 Agent 分配一个 Brain,但可以在多个 Agent 之间共享同一个 Brain。
当我们使用 Brain 类的时候不需要使用其子类,而应该直接使用 Brain 这个类。Brain 的行为取决于 brain 的类型。在训练期间,应将 agent 上连接的 Brain 的 Brain Type 设置为 **External**。要使用经过训练的模型,请将模型文件导入 Unity 项目,并将对应 Brain 的 Brain Type 更改为 **Internal**。请参阅 [Brain](/docs/Learning-Environment-Design-Brains.md) 以了解有关使用不同类型的 Brain 的详细信息。如果四种内置的类型不能满足您的需求,您可以扩展 CoreBrain 类以创建其它的 Brain 类型。
当我们使用 Brain 类的时候不需要使用其子类,而应该直接使用 Brain 这个类。Brain 的行为取决于 Brain 的类型。在训练期间,应将 agent 上连接的 Brain 的 Brain Type 设置为 **External**。要使用经过训练的模型,请将模型文件导入 Unity 项目,并将对应 Brain 的 Brain Type 更改为 **Internal**。请参阅 [Brain](/docs/Learning-Environment-Design-Brains.md) 以了解有关使用不同类型的 Brain 的详细信息。如果四种内置的类型不能满足您的需求,您可以扩展 CoreBrain 类以创建其它的 Brain 类型。
Brain 类有若干可以使用 Inspector 窗口进行设置的重要属性。对于使用 brain 的 agent,这些属性必须恰当。例如,`Vector Observation Space Size` 属性必须与 agent 创建的特征向量的长度完全匹配。请参阅 [Agent](/docs/Learning-Environment-Design-Agents.md) 以获取有关创建 agent 和正确设置 Brain 实例的信息。
Brain 类有若干可以使用 Inspector 窗口进行设置的重要属性。对于使用 Brain 的 agent,这些属性必须恰当。例如,`Vector Observation Space Size` 属性必须与 agent 创建的特征向量的长度完全匹配。请参阅 [Agent](/docs/Learning-Environment-Design-Agents.md) 以获取有关创建 agent 和正确设置 Brain 实例的信息。
请参阅 [Brain](/docs/Learning-Environment-Design-Brains.md) 以查看 Brain 属性的完整列表。

要创建 agent,请扩展 Agent 类并实现基本的 `CollectObservations()``AgentAction()` 方法:
* `CollectObservations()` — 收集 agent 对其环境的观测结果。
* `AgentAction()` — 执行由 agent 的 brain 选择的动作,并为当前状态分配奖励。
* `AgentAction()` — 执行由 agent 的 Brain 选择的动作,并为当前状态分配奖励。
这些函数的实现决定了分配给此 agent 的 Brain 的属性要如何设置。

在 Unity 中创建训练环境时,必须设置场景以便可以通过外部训练过程来控制场景。注意以下几点:
* 在训练程序启动后,Unity 可执行文件会被自动打开,然后训练场景会自动开始训练。
* 场景中至少须包括一个 **External** brain。
* 场景中至少须包括一个 **External** Brain。
* Academy 必须在每一轮训练后将场景重置为有效的初始状态。
* 训练场景必须有明确的结束状态,为此需要使用 `Max Steps`,或让每个 agent 将自身设置为 `done`

42
docs/localized/zh-CN/docs/Learning-Environment-Examples.md


* 训练环境:一种线性移动任务,在此任务中 agent 必须向左或向右移动到奖励状态。
* 目标:移动到最高奖励状态。
* Agent设置:环境包含一个 agent,上面附带了单个 brain。
* Agent设置:环境包含一个 agent,上面附带了单个 Brain。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(离散变量)一个变量,对应于当前状态。
* 向量运动空间:(离散变量)两个可能的动作(向左移动、向右移动)。
* 视觉观测:0

* 训练环境:一种平衡球任务,在此任务中 agent 需要控制平台。
* 目标:agent 必须平衡平台,以尽可能长时间在平台上保持球不掉落。
* Agent设置:环境包含 12 个全部链接到单个 brain 的同类 agent。
* Agent设置:环境包含 12 个全部链接到单个 Brain 的同类 agent。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)8 个,对应于平台的旋转以及球的位置、旋转和速度。
* 向量观测空间(困难版本,因为观测到的信息减少了):(连续变量)5 个变量,对应于平台的旋转以及球的位置和旋转。
* 向量运动空间:(连续变量)2 个,其中一个值对应于 X 旋转,而另一个值对应于 Z 旋转。

* 训练环境:某一个典型版本的的grid-world任务。场景包含 agent、目标和障碍。
* 目标:agent 必须在网格中避开障碍的同时移动到目标。
* Agent设置:环境包含一个链接到单个 brain 的 agent。
* Agent设置:环境包含一个链接到单个 Brain 的 agent。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:无
* 向量运动空间:(离散变量)4 个,对应于基本方向的移动。
* 视觉观测:一个对应于 GridWorld 自上而下的视图。

* 训练环境:agent 控制球拍将球弹过球网的双人游戏。
* 目标:agent 必须在彼此之间弹起网球,同时不能丢球或击球出界。
* Agent设置:环境包含两个链接到单个 brain(名为 TennisBrain)的 agent。在训练之后,您可以将另一个名为 MyBrain 的 brain 附加到其中一个 agent,从而与经过训练的模型进行游戏比赛。
* Agent设置:环境包含两个链接到单个 Brain TennisBrain)的 agent。在训练之后,您可以将另一个名为 MyBrain 的 Brain 附加到其中一个 agent,从而与经过训练的模型进行游戏比赛。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)8 个,分别对应于球和球拍的位置和速度。
* 向量运动空间:(连续变量)2 个,分别对应于朝向球网或远离球网的运动,以及上下的运动。
* 视觉观测:无

* 训练环境:一个平台,agent 可以在该平台上推动方块。
* 目标:agent 必须将方块推向目标。
* Agent设置:环境包含一个链接到单个 brain 的 agent。
* Agent设置:环境包含一个链接到单个 Brain 的 agent。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)15 个,分别对应于 agent、方块和目标的位置和速度。
* 向量运动空间:(连续变量)2 个,分别对应于 X 和 Z 方向的移动。
* 视觉观测:无。

* 训练环境:一个平台环境,agent 可以在该环境中跳过墙。
* 目标:agent 必须利用一个方块越过墙并到达目标。
* Agent设置:环境包含一个链接到两个不同 brain 的 agent。agent 链接到的 brain 根据墙的高度而变化。
* Agent设置:环境包含一个链接到两个不同 Brain 的 agent。agent 链接到的 Brain 根据墙的高度而变化。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)16 个,分别对应于 agent、方块和目标的位置和速度以及墙的高度。
* 向量运动空间:(离散变量)74 个,分别对应于 14 个射线投射,每个射线投射可检测 4 个可能的物体,加上 agent 的全局位置以及 agent 是否落地。
* 视觉观测:无。

* 训练环境:可以移动到目标位置的双关节臂。
* 目标:agent 必须将手移动到目标位置,并保持在此处。
* Agent设置:环境包含 32 个链接到单个 brain 的 agent。
* Agent设置:环境包含 32 个链接到单个 Brain 的 agent。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)26 个,对应于两个机械臂 Rigidbody 的位置、旋转、速度和角速度。
* 向量运动空间:(连续变量)4 个,对应于两个关节的两个方向上的转动。
* 视觉观测:无

* 训练环境:一种有 4 个手臂的生物,每个手臂分两节
* 目标:agent 必须沿 x 轴移动其身体,并且保持不跌倒。
* Agent设置:环境包含 3 个链接到单个 brain 的 agent。
* Agent设置:环境包含 3 个链接到单个 Brain 的 agent。
* Agent 奖励函数设置(agent互相之间独立):
* +1 乘以 x 方向的速度
* 跌倒时 -1。

* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)117 个,对应于每个肢体的位置、旋转、速度和角速度以及身体的加速度和角速度。
* 向量运动空间:(连续变量)12 个,对应于适用于 12 个关节的扭矩。
* 视觉观测:无

* 训练环境:一个包含多个 agent 的环境,这些 agent 争相收集香蕉。
* 目标:agent 必须学习尽可能接近更多的黄色香蕉,同时避开红色香蕉。
* Agent设置:环境包含 10 个链接到单个 brain 的 agent。
* Agent设置:环境包含 10 个链接到单个 Brain 的 agent。
* Brain 设置:一个有以下观测/运动空间的 brain。
* Brain 设置:一个有以下观测/运动空间的 Brain。
* 向量观测空间:(连续变量)51 个,对应于 agent 的速度, agent 前进方向,以及 agent 对周围物体进行基于射线的感知。
* 向量运动空间:(连续变量)3 个,对应于向前移动,绕 y 轴旋转,以及是否使用激光使其他 agent 瘫痪。
* 视觉观测(可选):每个 agent 的第一人称视图。

* 训练环境:在一个环境中,agent 需要在房间内查找信息、记住信息并使用信息移动到正确目标。
* 目标:移动到与房间内的方块的颜色相对应的目标。
* Agent设置:环境包含一个链接到单个 brain 的 agent。
* Agent设置:环境包含一个链接到单个 Brain 的 agent。
* Agent 奖励函数设置(agent互相之间独立):
* 移动到正确目标时 +1。
* 移动到错误目标时 -0.1。

* 训练环境:在一个环境中,agent 需要按需决策。agent 必须决定在接触地面时如何进行下一次弹跳。
* 目标:抓住漂浮的香蕉。跳跃次数有限。
* Agent设置:环境包含一个链接到单个 brain 的 agent。
* Agent设置:环境包含一个链接到单个 Brain 的 agent。
* Agent 奖励函数设置(agent互相之间独立):
* 抓住香蕉时 +1。
* 弹跳出界时 -1。

* 目标:
* 前锋:让球进入对手的球门。
* 守门员:防止球进入自己的球门。
* Agent设置:环境包含四个 agent,其中两个链接到一个 brain(前锋),两个链接到另一个 brain(守门员)。
* Agent设置:环境包含四个 agent,其中两个链接到一个 Brain(前锋),两个链接到另一个 Brain(守门员)。
* Agent 奖励函数设置(agent互相之间非独立):
* 前锋:
* 球进入对手球门时 +1。

2
docs/localized/zh-CN/docs/ML-Agents-Overview.md


我们将 Brain 类型切换为 Internal,并加入从训练阶段
生成的 TensorFlow 模型。现在,在预测阶段,军医
仍然继续生成他们的观测结果,但不再将结果发送到
Python API,而是送入他们的嵌入了的 Tensorflow 模型,
Python API,而是送入他们的嵌入了的 TensorFlow 模型,
以便生成每个军医在每个时间点上要采取的_最佳_动作。
总结一下:我们的实现是基于 TensorFlow 的,因此,

2
config/curricula/push-block/PushBlockBrain.json


{
"measure" : "reward",
"thresholds" : [0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
"min_lesson_length" : 2,
"min_lesson_length" : 100,
"signal_smoothing" : true,
"parameters" :
{

2
config/curricula/test/TestBrain.json


{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"min_lesson_length" : 100,
"signal_smoothing" : true,
"parameters" :
{

2
ml-agents/requirements.txt


pytest>=3.2.2
docopt
pyyaml
protobuf==3.5.2
protobuf==3.6.0
grpcio==1.11.0

15
ml-agents/mlagents/trainers/buffer.py


import numpy as np
from unityagents.exception import UnityException
from mlagents.envs.exception import UnityException
class BufferException(UnityException):

def shuffle(self, key_list=None):
"""
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.
"""

np.random.shuffle(s)
for key in key_list:
self[key][:] = [self[key][i] for i in s]
def make_mini_batch(self, start, end):
"""
Creates a mini-batch from buffer.
:param start: Starting index of buffer.
:param end: Ending index of buffer.
:return: Dict of mini batch.
"""
mini_batch = {}
for key in self:
mini_batch[key] = np.array(self[key][start:end])
return mini_batch
def __init__(self):
self.update_buffer = self.AgentBuffer()
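The shuffle above draws one permutation and applies it to every field so that rows stay aligned across keys, and make_mini_batch slices every field with the same [start:end] window. The sketch below is a standalone illustration of that idea, not the AgentBuffer class itself.

```python
import numpy as np

# One permutation applied to all fields keeps rows aligned across keys.
fields = {
    "observations": list(range(5)),
    "actions": [10, 11, 12, 13, 14],
}

perm = np.arange(len(fields["observations"]))
np.random.shuffle(perm)                       # single permutation for every field
for key in fields:
    fields[key] = [fields[key][i] for i in perm]

# Mini-batch: the same [start:end] window over every field.
start, end = 0, 2
mini_batch = {key: np.array(values[start:end]) for key, values in fields.items()}
print(mini_batch)  # row i of "observations" still matches row i of "actions"
```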

1
ml-agents/mlagents/envs/__init__.py


from .environment import *
from .brain import *
from .exception import *
from .curriculum import *

4
ml-agents/mlagents/envs/exception.py


import logging
logger = logging.getLogger("unityagents")
logger = logging.getLogger("mlagents.envs")
class UnityException(Exception):
"""

"You can check the logfile for more information at {}".format(log_file_path))
except:
logger.error("An error might have occured in the environment. "
"No unity-environment.log file could be found.")
"No UnitySDK.log file could be found.")
super(UnityTimeOutException, self).__init__(message)

7
ml-agents/mlagents/envs/communicator.py


import logging
from communicator_objects import UnityOutput, UnityInput
from .communicator_objects import UnityOutput, UnityInput
logger = logging.getLogger("unityagents")
logger = logging.getLogger("mlagents.envs")
def __init__(self, worker_id=0,
base_port=5005):
def __init__(self, worker_id=0, base_port=5005):
"""
Python side of the communication. Must be used in pair with the right Unity Communicator equivalent.

4
ml-agents/mlagents/envs/socket_communicator.py


import struct
from .communicator import Communicator
from communicator_objects import UnityMessage, UnityOutput, UnityInput
from .communicator_objects import UnityMessage, UnityOutput, UnityInput
logger = logging.getLogger("unityagents")
logger = logging.getLogger("mlagents.envs")
class SocketCommunicator(Communicator):

6
ml-agents/mlagents/envs/rpc_communicator.py


from concurrent.futures import ThreadPoolExecutor
from .communicator import Communicator
from communicator_objects import UnityToExternalServicer, add_UnityToExternalServicer_to_server
from communicator_objects import UnityMessage, UnityInput, UnityOutput
from .communicator_objects import UnityToExternalServicer, add_UnityToExternalServicer_to_server
from .communicator_objects import UnityMessage, UnityInput, UnityOutput
logger = logging.getLogger("unityagents")
logger = logging.getLogger("mlagents.envs")
class UnityToExternalServicerImplementation(UnityToExternalServicer):
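These environment-side modules now log under the "mlagents.envs" logger instead of "unityagents", so any logging configuration on the user side should target the new name. A minimal example; the handler and level choices are only an illustration:

```python
import logging

# The environment modules in this release log under "mlagents.envs"
# (previously "unityagents"); attach levels/handlers to the new name.
logging.basicConfig(level=logging.INFO)
logging.getLogger("mlagents.envs").setLevel(logging.DEBUG)
logging.getLogger("mlagents.envs").debug("verbose environment logging enabled")
```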

1
UnitySDK/ProjectSettings/EditorBuildSettings.asset


m_ObjectHideFlags: 0
serializedVersion: 2
m_Scenes: []
m_configObjects: {}

57
UnitySDK/ProjectSettings/ProjectSettings.asset


--- !u!129 &1
PlayerSettings:
m_ObjectHideFlags: 0
serializedVersion: 15
serializedVersion: 14
AndroidEnableSustainedPerformanceMode: 0
defaultScreenOrientation: 4
targetDevice: 2
useOnDemandResources: 0

preserveFramebufferAlpha: 0
disableDepthAndStencilBuffers: 0
androidBlitType: 0
defaultIsFullScreen: 0
defaultIsNativeResolution: 1
macRetinaSupport: 1
runInBackground: 1

visibleInBackground: 0
allowFullscreenSwitch: 1
graphicsJobMode: 0
fullscreenMode: -1
macFullscreenMode: 2
d3d11FullscreenMode: 1
xboxSpeechDB: 0
xboxEnableHeadOrientation: 0
xboxEnableGuest: 0

xboxOneLoggingLevel: 1
xboxOneDisableEsram: 0
xboxOnePresentImmediateThreshold: 0
switchQueueCommandMemory: 0
wiiUTVResolution: 0
wiiUGamePadMSAA: 1
wiiUSupportsNunchuk: 0
wiiUSupportsClassicController: 0
wiiUSupportsBalanceBoard: 0
wiiUSupportsMotionPlus: 0
wiiUSupportsProController: 0
wiiUAllowScreenCapture: 1
wiiUControllerCount: 0
m_SupportedAspectRatios:
4:3: 1
5:4: 1

hololens:
depthFormat: 1
depthBufferSharingEnabled: 0
enable360StereoCapture: 0
oculus:
sharedDepthBuffer: 0
dashSupport: 0

APKExpansionFiles: 0
keepLoadedShadersAlive: 0
StripUnusedMeshComponents: 0
VertexChannelCompressionMask: 214
VertexChannelCompressionMask:
serializedVersion: 2
m_Bits: 4294901998
iPhoneSdkVersion: 988
iOSTargetOSVersionString: 8.0
tvOSSdkVersion: 0

tvOSSmallIconLayers: []
tvOSSmallIconLayers2x: []
tvOSLargeIconLayers: []
tvOSLargeIconLayers2x: []
tvOSTopShelfImageLayers: []
tvOSTopShelfImageLayers2x: []
tvOSTopShelfImageWideLayers: []

appleDeveloperTeamID:
iOSManualSigningProvisioningProfileID:
tvOSManualSigningProvisioningProfileID:
iOSManualSigningProvisioningProfileType: 0
tvOSManualSigningProvisioningProfileType: 0
iOSRequireARKit: 0
appleEnableProMotion: 0
templatePackageId:
templateDefaultScene:
AndroidTargetArchitectures: 5
AndroidTargetDevice: 0
AndroidSplashScreenScale: 0
androidSplashScreen: {fileID: 0}
AndroidKeystoreName:

androidGamepadSupportLevel: 0
resolutionDialogBanner: {fileID: 0}
m_BuildTargetIcons: []
m_BuildTargetPlatformIcons: []
m_BuildTargetBatching: []
m_BuildTargetGraphicsAPIs:
- m_BuildTarget: MacStandaloneSupport

m_EncodingQuality: 1
- m_BuildTarget: PS4
m_EncodingQuality: 1
wiiUTitleID: 0005000011000000
wiiUGroupID: 00010000
wiiUCommonSaveSize: 4096
wiiUAccountSaveSize: 2048
wiiUOlvAccessKey: 0
wiiUTinCode: 0
wiiUJoinGameId: 0
wiiUJoinGameModeMask: 0000000000000000
wiiUCommonBossSize: 0
wiiUAccountBossSize: 0
wiiUAddOnUniqueIDs: []
wiiUMainThreadStackSize: 3072
wiiULoaderThreadStackSize: 1024
wiiUSystemHeapSize: 128
wiiUTVStartupScreen: {fileID: 0}
wiiUGamePadStartupScreen: {fileID: 0}
wiiUDrcBufferDisabled: 0
wiiUProfilerLibPath:
runPlayModeTestAsEditModeTest: 0
actionOnDotNetUnhandledException: 1
enableInternalProfiler: 0
logObjCUncaughtExceptions: 1

ps4pnFriends: 1
ps4pnGameCustomData: 1
playerPrefsSupport: 0
enableApplicationExit: 0
restrictedAudioUsageRights: 0
ps4UseResolutionFallback: 0
ps4ReprojectionSupport: 0

psp2InfoBarOnStartup: 0
psp2InfoBarColor: 0
psp2ScriptOptimizationLevel: 0
psmSplashimage: {fileID: 0}
splashScreenBackgroundSourceLandscape: {fileID: 0}
splashScreenBackgroundSourcePortrait: {fileID: 0}
spritePackerPolicy:

webGLTemplate: APPLICATION:Default
webGLAnalyzeBuildSize: 0
webGLUseEmbeddedResources: 0
webGLUseWasm: 0
webGLLinkerTarget: 0
scriptingDefineSymbols:
1:
7: UNITY_POST_PROCESSING_STACK_V2

27: UNITY_POST_PROCESSING_STACK_V2
platformArchitecture: {}
scriptingBackend: {}
il2cppCompilerConfiguration: {}
allowUnsafeCode: 0
additionalIl2CppArgs:
scriptingRuntimeVersion: 1
apiCompatibilityLevelPerPlatform:

4
UnitySDK/ProjectSettings/UnityConnectSettings.asset


--- !u!310 &1
UnityConnectSettings:
m_ObjectHideFlags: 0
m_Enabled: 0
m_Enabled: 1
m_TestMode: 0
m_TestEventUrl:
m_TestConfigUrl:

m_NativeEventUrl: https://perf-events.cloud.unity3d.com/symbolicate
m_Enabled: 0
m_CaptureEditorExceptions: 1
UnityPurchasingSettings:

m_Enabled: 0
m_InitializeOnStartup: 1
m_TestMode: 0
m_EnabledPlatforms: 4294967295
m_IosGameId:
m_AndroidGameId:
m_GameIds: {}

2
UnitySDK/ProjectSettings/ProjectVersion.txt


m_EditorVersion: 2018.1.0f2
m_EditorVersion: 2017.4.10f1

12
UnitySDK/Assets/ML-Agents/Scripts/CoreBrainInternal.cs.meta


fileFormatVersion: 2
guid: 8b23992c8eb17439887f5e944bf04a40
timeCreated: 1504070347
licenseType: Free
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

28
UnitySDK/Assets/ML-Agents/Scripts/Academy.cs


"docs/Learning-Environment-Design-Academy.md")]
public abstract class Academy : MonoBehaviour
{
private const string kApiVersion = "API-4";
private const string kApiVersion = "API-5";
// Fields provided in the Inspector

(MLAgents.CommunicatorObjects.BrainTypeProto)
brain.brainType));
}
academyParameters.EnvironmentParameters =
new MLAgents.CommunicatorObjects.EnvironmentParametersProto();

var pythonParameters = brainBatcher.SendAcademyParameters(academyParameters);
Random.InitState(pythonParameters.Seed);
Application.logMessageReceived += HandleLog;
logPath = Path.GetFullPath(".") + "/unity-environment.log";
logPath = Path.GetFullPath(".") + "/UnitySDK.log";
logWriter = new StreamWriter(logPath, false);
logWriter.WriteLine(System.DateTime.Now.ToString());
logWriter.WriteLine(" ");

ConfigureEnvironment();
}
private void UpdateResetParameters()
{
var newResetParameters = brainBatcher.GetEnvironmentParameters();
if (newResetParameters != null)
{
foreach (var kv in newResetParameters.FloatParameters)
{
resetParameters[kv.Key] = kv.Value;
}
}
}
void HandleLog(string logString, string stackTrace, LogType type)
{
logWriter = new StreamWriter(logPath, true);

if (brainBatcher.GetCommand() ==
MLAgents.CommunicatorObjects.CommandProto.Reset)
{
// Update reset parameters.
var newResetParameters = brainBatcher.GetEnvironmentParameters();
if (newResetParameters != null)
{
foreach (var kv in newResetParameters.FloatParameters)
{
resetParameters[kv.Key] = kv.Value;
}
}
UpdateResetParameters();
SetIsInference(!brainBatcher.GetIsTraining());

}
else if (!firstAcademyReset)
{
UpdateResetParameters();
ForcedFullReset();
}
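On the Unity side, the new UpdateResetParameters() helper copies the FloatParameters sent by the trainer into resetParameters at reset time. From the Python side of this era those values travel as a dict of floats passed with the reset call; the sketch below assumes the `config` keyword of UnityEnvironment.reset, and the executable name and parameter key ("wall_height") are made up for illustration.

```python
# Hypothetical usage: send float reset parameters from Python; they arrive in
# the Academy as EnvironmentParametersProto.FloatParameters and are applied by
# UpdateResetParameters().
from mlagents.envs import UnityEnvironment

env = UnityEnvironment(file_name="WallJump")   # placeholder executable name
try:
    info = env.reset(train_mode=True, config={"wall_height": 4.0})
finally:
    env.close()
```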

17
UnitySDK/Assets/ML-Agents/Scripts/Batcher.cs


{
StackedVectorObservation = { info.stackedVectorObservation },
StoredVectorActions = { info.storedVectorActions },
Memories = { info.memories },
StoredTextActions = info.storedTextActions,
TextObservation = info.textObservation,
Reward = info.reward,

};
if (info.memories != null)
{
agentInfoProto.Memories.Add(info.memories);
}
if (info.actionMasks != null)
{
agentInfoProto.ActionMask.AddRange(info.actionMasks);
}
ByteString.CopyFrom(obs.EncodeToJPG())
ByteString.CopyFrom(obs.EncodeToPNG())
);
}
return agentInfoProto;

{
VectorObservationSize = brainParameters.vectorObservationSize,
NumStackedVectorObservations = brainParameters.numStackedVectorObservations,
VectorActionSize = brainParameters.vectorActionSize,
VectorActionSize = {brainParameters.vectorActionSize},
VectorObservationSpaceType =
(CommunicatorObjects.SpaceTypeProto)brainParameters.vectorObservationSpaceType,
BrainName = name,
BrainType = type
};

action.Memories.ToList());
agent.UpdateTextAction(
action.TextActions);
agent.UpdateValueAction(
action.Value);
}
}
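Visual observations are now serialized with EncodeToPNG (lossless) rather than EncodeToJPG before being sent over the wire. The sketch below shows one way to turn such PNG bytes back into a float array on the receiving side; the use of Pillow here is an assumption for illustration, not how the shipped Python package decodes observations.

```python
import io
import numpy as np
from PIL import Image  # Pillow; assumed available for this sketch

def png_bytes_to_array(png_bytes: bytes) -> np.ndarray:
    """Decode PNG-encoded observation bytes into an HxWxC float array in [0, 1]."""
    image = Image.open(io.BytesIO(png_bytes))
    return np.asarray(image, dtype=np.float32) / 255.0

# Round-trip a tiny synthetic image through PNG to exercise the helper.
buf = io.BytesIO()
Image.fromarray(np.zeros((2, 2, 3), dtype=np.uint8)).save(buf, format="PNG")
print(png_bytes_to_array(buf.getvalue()).shape)  # (2, 2, 3)
```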

24
UnitySDK/Assets/ML-Agents/Scripts/Brain.cs


[Range(1, 50)] public int numStackedVectorObservations = 1;
public int vectorActionSize = 1;
public int[] vectorActionSize = new int[1]{1};
/**< \brief If continuous : The length of the float vector that represents
* the action
* <br> If discrete : The number of possible values the action can take*/

public SpaceType vectorActionSpaceType = SpaceType.discrete;
/**< \brief Defines if the action is discrete or continuous */
public SpaceType vectorObservationSpaceType = SpaceType.continuous;
/**< \brief Defines if the state is discrete or continuous */
}
[HelpURL("https://github.com/Unity-Technologies/ml-agents/blob/master/" +

*/
public class Brain : MonoBehaviour
{
private bool isInitialized = false;
private bool isInitialized;
private Dictionary<Agent, AgentInfo> agentInfos =
new Dictionary<Agent, AgentInfo>(1024);

public BrainType brainType;
//[HideInInspector]
///**< \brief Keeps track of the agents which subscribe to this brain*/
/// Keeps track of the agents which subscribe to this brain*/
// public Dictionary<int, Agent> agents = new Dictionary<int, Agent>();
[SerializeField] ScriptableObject[] CoreBrains;

{
CoreBrains[(int) bt] =
ScriptableObject.CreateInstance(
"CoreBrain" + bt.ToString());
"CoreBrain" + bt);
CoreBrains[(int) bt] =
ScriptableObject.Instantiate(CoreBrains[(int) bt]);
CoreBrains[(int) bt] = Instantiate(CoreBrains[(int) bt]);
}
}

if (!gameObject.activeSelf)
{
throw new UnityAgentsException(
string.Format("Agent {0} tried to request an action " +
"from brain {1} but it is not active.",
agent.gameObject.name, gameObject.name));
$"Agent {agent.gameObject.name} tried to request an action " +
$"from brain {gameObject.name} but it is not active.");
string.Format("Agent {0} tried to request an action " +
"from brain {1} but it was not initialized.",
agent.gameObject.name, gameObject.name));
$"Agent {agent.gameObject.name} tried to request an action " +
$"from brain {gameObject.name} but it was not initialized.");
}
else
{
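Here vectorActionSize changes from a single int to an int[], one entry per discrete action branch (or a single entry holding the action-vector length for continuous control). Under that convention an action for a branched discrete Brain supplies one chosen index per branch. The sketch below only builds random actions in that shape with NumPy; it does not use or assume any ML-Agents API.

```python
import numpy as np

# Branched discrete actions: one size per branch, one chosen index per branch.
vector_action_size = [3, 2]   # e.g. branch 0 has 3 options, branch 1 has 2
n_agents = 4

rng = np.random.default_rng(0)
actions = np.stack(
    [rng.integers(0, size, size=n_agents) for size in vector_action_size],
    axis=1,
)
print(actions.shape)  # (4, 2): one row per agent, one column per branch
```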

2
UnitySDK/Assets/ML-Agents/Scripts/CoreBrain.cs


void SetBrain(Brain b);
/// Implement this method to initialize CoreBrain
void InitializeCoreBrain(MLAgents.Batcher brainBatcher);
void InitializeCoreBrain(Batcher brainBatcher);
/// Implement this method to define the logic for deciding actions
void DecideAction(Dictionary<Agent, AgentInfo> agentInfo);

9
UnitySDK/Assets/ML-Agents/Scripts/CoreBrainHeuristic.cs


/**< Reference to the brain that uses this CoreBrainHeuristic */
public Brain brain;
MLAgents.Batcher brainBatcher;
Batcher brainBatcher;
/**< Reference to the Decision component used to decide the actions */
public Decision decision;

}
/// Create the reference to decision
public void InitializeCoreBrain(MLAgents.Batcher brainBatcher)
public void InitializeCoreBrain(Batcher brainBatcher)
{
decision = brain.gameObject.GetComponent<Decision>();

/// Uses the Decision Component to decide that action to take
public void DecideAction(Dictionary<Agent, AgentInfo> agentInfo)
{
if (brainBatcher != null)
{
brainBatcher.SendBrainInfo(brain.gameObject.name, agentInfo);
}
brainBatcher?.SendBrainInfo(brain.gameObject.name, agentInfo);
if (decision == null)
{

17
UnitySDK/Assets/ML-Agents/Scripts/CoreBrainPlayer.cs


private struct DiscretePlayerAction
{
public KeyCode key;
public int branchIndex;
public int value;
}

[Tooltip("The list of keys and the value they correspond to for discrete control.")]
/// Contains the mapping from input to discrete actions
private DiscretePlayerAction[] discretePlayerActions;
[SerializeField]
private int defaultAction = 0;
/// Reference to the brain that uses this CoreBrainPlayer
public Brain brain;

{
foreach (Agent agent in agentInfo.Keys)
{
var action = new float[brain.brainParameters.vectorActionSize];
var action = new float[brain.brainParameters.vectorActionSize[0]];
foreach (KeyContinuousPlayerAction cha in keyContinuousPlayerActions)
{
if (Input.GetKey(cha.key))

{
foreach (Agent agent in agentInfo.Keys)
{
var action = new float[1] {defaultAction};
var action = new float[brain.brainParameters.vectorActionSize.Length];
action[0] = (float) dha.value;
break;
action[dha.branchIndex] = (float) dha.value;
agent.UpdateVectorAction(action);
}

}
foreach (KeyContinuousPlayerAction action in keyContinuousPlayerActions)
{
if (action.index >= brain.brainParameters.vectorActionSize)
if (action.index >= brain.brainParameters.vectorActionSize[0])
{
EditorGUILayout.HelpBox(
string.Format(

}
foreach (AxisContinuousPlayerAction action in axisContinuousPlayerActions)
{
if (action .index >= brain.brainParameters.vectorActionSize)
if (action .index >= brain.brainParameters.vectorActionSize[0])
{
EditorGUILayout.HelpBox(
string.Format(

else
{
GUILayout.Label("Edit the discrete inputs for your actions", EditorStyles.boldLabel);
defaultAction = EditorGUILayout.IntField("Default Action", defaultAction);
var dhas = serializedBrain.FindProperty("discretePlayerActions");
serializedBrain.Update();
EditorGUILayout.PropertyField(dhas, true);

43
UnitySDK/Assets/ML-Agents/Scripts/Monitor.cs


static bool isInstantiated;
static GameObject canvas;
static Dictionary<Transform, Dictionary<string, DisplayValue>> displayTransformValues;
/// <summary>
/// Camera used to calculate GUI screen position relative to the target
/// transform.
/// </summary>
static Dictionary<Transform, Camera> transformCamera;
static Color[] barColors;
struct DisplayValue

/// <param name="value">The string value you want to display.</param>
/// <param name="target">The transform you want to attach the information to.
/// </param>
/// <param name="camera">Camera used to calculate GUI position relative to
/// the target. If null, `Camera.main` will be used.</param>
Transform target = null)
Transform target = null,
Camera camera = null)
{
if (!isInstantiated)
{

{
target = canvas.transform;
}
transformCamera[target] = camera;
if (!displayTransformValues.Keys.Contains(target))
{

/// <param name="value">The float value you want to display.</param>
/// <param name="target">The transform you want to attach the information to.
/// </param>
/// <param name="camera">Camera used to calculate GUI position relative to
/// the target. If null, `Camera.main` will be used.</param>
Transform target = null)
Transform target = null,
Camera camera = null)
{
if (!isInstantiated)
{

{
target = canvas.transform;
}
transformCamera[target] = camera;
if (!displayTransformValues.Keys.Contains(target))
{

/// <param name="displayType">The type of display.</param>
/// <param name="target">The transform you want to attach the information to.
/// </param>
/// <param name="camera">Camera used to calculate GUI position relative to
/// the target. If null, `Camera.main` will be used.</param>
DisplayType displayType = DisplayType.INDEPENDENT
DisplayType displayType = DisplayType.INDEPENDENT,
Camera camera = null
)
{
if (!isInstantiated)

{
target = canvas.transform;
}
transformCamera[target] = camera;
if (!displayTransformValues.Keys.Contains(target))
{

displayTransformValues = new Dictionary<Transform,
Dictionary<string, DisplayValue>>();
transformCamera = new Dictionary<Transform, Camera>();
}
/// <summary> <inheritdoc/> </summary>

continue;
}
// get camera
Camera cam = transformCamera[target];
if (cam == null)
{
cam = Camera.main;
}
float widthScaler = (Screen.width / 1000f);
float keyPixelWidth = 100 * widthScaler;
float keyPixelHeight = 20 * widthScaler;

Screen.width / 2 - keyPixelWidth, Screen.height);
if (!(target == canvas.transform))
{
Vector3 cam2obj = target.position - Camera.main.transform.position;
Vector3 cam2obj = target.position - cam.transform.position;
20f / (Vector3.Dot(cam2obj, Camera.main.transform.forward)));
Vector3 worldPosition = Camera.main.WorldToScreenPoint(
20f / (Vector3.Dot(cam2obj, cam.transform.forward)));
Vector3 worldPosition = cam.WorldToScreenPoint(
target.position + new Vector3(0, verticalOffset, 0));
origin = new Vector3(
worldPosition.x - keyPixelWidth * scale, Screen.height - worldPosition.y);

20
UnitySDK/Assets/ML-Agents/Scripts/RpcCommunicator.cs


/// <param name="communicatorParameters">Communicator parameters.</param>
public RPCCommunicator(CommunicatorParameters communicatorParameters)
{
this.m_communicatorParameters = communicatorParameters;
m_communicatorParameters = communicatorParameters;
}
/// <summary>

var result = m_client.Exchange(WrapMessage(unityOutput, 200));
unityInput = m_client.Exchange(WrapMessage(null, 200)).UnityInput;
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
#else
EditorApplication.playmodeStateChanged += HandleOnPlayModeChanged;
#endif
#endif
return result.UnityInput;
}

}
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>

Close();
}
}
#else
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>
private void HandleOnPlayModeChanged()
{
// This method is run whenever the playmode state is changed.
if (!EditorApplication.isPlayingOrWillChangePlaymode)
{
Close();
}
}
#endif
#endif
}

22
UnitySDK/Assets/ML-Agents/Scripts/SocketCommunicator.cs


unityInput = UnityMessage.Parser.ParseFrom(Receive()).UnityInput;
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
#else
EditorApplication.playmodeStateChanged += HandleOnPlayModeChanged;
#endif
#endif
return initializationInput.UnityInput;

}
#if UNITY_EDITOR
#if UNITY_2017_2_OR_NEWER
void HandleOnPlayModeChanged(PlayModeStateChange state)
private void HandleOnPlayModeChanged(PlayModeStateChange state)
if (state == PlayModeStateChange.ExitingPlayMode)
if (state==PlayModeStateChange.ExitingPlayMode)
#else
/// <summary>
/// When the editor exits, the communicator must be closed
/// </summary>
private void HandleOnPlayModeChanged()
{
// This method is run whenever the playmode state is changed.
if (!EditorApplication.isPlayingOrWillChangePlaymode)
{
Close();
}
}
#endif
#endif
}

2
UnitySDK/Assets/ML-Agents/Scripts/UnityAgentsException.cs


namespace MLAgents
{
[System.Serializable]
[Serializable]
/// Contains exceptions specific to ML-Agents.
public class UnityAgentsException : System.Exception
{

165
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs


using System.Collections.Generic;
using System.Linq;
using UnityEngine;

public string storedTextActions;
/// <summary>
/// For discrete control, specifies the actions that the agent cannot take. Is true if
/// the action is masked.
/// </summary>
public bool[] actionMasks;
/// <summary>
/// Used by the Trainer to store information about the agent. This data
/// structure is not consumed or modified by the agent directly, they are
/// just the owners of their trainer's memory. Currently, however, the

public float[] vectorActions;
public string textActions;
public List<float> memories;
public float value;
}
/// <summary>

/// Unique identifier each agent receives at initialization. It is used
/// to separate between different agents in the environment.
int id;
/// Keeps track of the actions that are masked at each step.
private ActionMasker actionMasker;
/// Array of Texture2D used to render to from render buffer before
/// transforming into float tensor.

}
BrainParameters param = brain.brainParameters;
actionMasker = new ActionMasker(param);
action.vectorActions = new float[param.vectorActionSize];
info.storedVectorActions = new float[param.vectorActionSize];
action.vectorActions = new float[param.vectorActionSize[0]];
info.storedVectorActions = new float[param.vectorActionSize[0]];
action.vectorActions = new float[1];
info.storedVectorActions = new float[1];
action.vectorActions = new float[param.vectorActionSize.Length];
info.storedVectorActions = new float[param.vectorActionSize.Length];
}
if (info.textObservation == null)

action.memories = new List<float>();
if (param.vectorObservationSpaceType == SpaceType.continuous)
{
info.vectorObservation =
new List<float>(param.vectorObservationSize);
info.stackedVectorObservation =
new List<float>(param.vectorObservationSize
* brain.brainParameters.numStackedVectorObservations);
info.stackedVectorObservation.AddRange(
new float[param.vectorObservationSize
* param.numStackedVectorObservations]);
}
else
{
info.vectorObservation = new List<float>(1);
info.stackedVectorObservation =
new List<float>(param.numStackedVectorObservations);
info.stackedVectorObservation.AddRange(
new float[param.numStackedVectorObservations]);
}
info.vectorObservation =
new List<float>(param.vectorObservationSize);
info.stackedVectorObservation =
new List<float>(param.vectorObservationSize
* brain.brainParameters.numStackedVectorObservations);
info.stackedVectorObservation.AddRange(
new float[param.vectorObservationSize
* param.numStackedVectorObservations]);
info.visualObservations = new List<Texture2D>();
}

info.storedVectorActions = action.vectorActions;
info.storedTextActions = action.textActions;
info.vectorObservation.Clear();
actionMasker.ResetMask();
info.actionMasks = actionMasker.GetMask();
if (param.vectorObservationSpaceType == SpaceType.continuous)
if (info.vectorObservation.Count != param.vectorObservationSize)
if (info.vectorObservation.Count != param.vectorObservationSize)
{
throw new UnityAgentsException(string.Format(
"Vector Observation size mismatch between continuous " +
"agent {0} and brain {1}. " +
"Was Expecting {2} but received {3}. ",
gameObject.name, brain.gameObject.name,
brain.brainParameters.vectorObservationSize,
info.vectorObservation.Count));
}
info.stackedVectorObservation.RemoveRange(
0, param.vectorObservationSize);
info.stackedVectorObservation.AddRange(info.vectorObservation);
throw new UnityAgentsException(string.Format(
"Vector Observation size mismatch between continuous " +
"agent {0} and brain {1}. " +
"Was Expecting {2} but received {3}. ",
gameObject.name, brain.gameObject.name,
brain.brainParameters.vectorObservationSize,
info.vectorObservation.Count));
else
{
if (info.vectorObservation.Count != 1)
{
throw new UnityAgentsException(string.Format(
"Vector Observation size mismatch between discrete agent" +
" {0} and brain {1}. Was Expecting {2} but received {3}. ",
gameObject.name, brain.gameObject.name,
1, info.vectorObservation.Count));
}
info.stackedVectorObservation.RemoveRange(0, 1);
info.stackedVectorObservation.AddRange(info.vectorObservation);
}
info.stackedVectorObservation.RemoveRange(
0, param.vectorObservationSize);
info.stackedVectorObservation.AddRange(info.vectorObservation);
info.visualObservations.Clear();
if (param.cameraResolutions.Length > agentParameters.agentCameras.Count)

/// - <see cref="AddVectorObs(float[])"/>
/// - <see cref="AddVectorObs(List{float})"/>
/// - <see cref="AddVectorObs(Quaternion)"/>
/// - <see cref="AddVectorObs(bool)"/>
/// - <see cref="AddVectorObs(int, int)"/>
/// Depending on your environment, any combination of these helpers can
/// be used. They just need to be used in the exact same order each time
/// this method is called and the resulting size of the vector observation

}
/// <summary>
/// Sets an action mask for discrete control agents. When used, the agent will not be
/// able to perform the action passed as argument at the next decision. If no branch is
/// specified, the default branch will be 0. The actionIndex or actionIndices correspond
/// to the action the agent will be unable to perform.
/// </summary>
/// <param name="actionIndices">The indices of the masked actions on branch 0</param>
protected void SetActionMask(IEnumerable<int> actionIndices)
{
actionMasker.SetActionMask(0, actionIndices);
}
/// <summary>
/// Sets an action mask for discrete control agents. When used, the agent will not be
/// able to perform the action passed as argument at the next decision. If no branch is
/// specified, the default branch will be 0. The actionIndex or actionIndices correspond
/// to the action the agent will be unable to perform.
/// </summary>
/// <param name="actionIndex">The index of the masked action on branch 0</param>
protected void SetActionMask(int actionIndex)
{
actionMasker.SetActionMask(0, new int[1]{actionIndex});
}
/// <summary>
/// Sets an action mask for discrete control agents. When used, the agent will not be
/// able to perform the action passed as argument at the next decision. If no branch is
/// specified, the default branch will be 0. The actionIndex or actionIndices correspond
/// to the action the agent will be unable to perform.
/// </summary>
/// <param name="branch">The branch for which the actions will be masked</param>
/// <param name="actionIndex">The index of the masked action</param>
protected void SetActionMask(int branch, int actionIndex)
{
actionMasker.SetActionMask(branch, new int[1]{actionIndex});
}
/// <summary>
/// Modifies an action mask for discrete control agents. When used, the agent will not be
/// able to perform the action passed as argument at the next decision. If no branch is
/// specified, the default branch will be 0. The actionIndex or actionIndices correspond
/// to the action the agent will be unable to perform.
/// </summary>
/// <param name="branch">The branch for which the actions will be masked</param>
/// <param name="actionIndices">The indices of the masked actions</param>
protected void SetActionMask(int branch, IEnumerable<int> actionIndices)
{
actionMasker.SetActionMask(branch, actionIndices);
}
/// <summary>
/// Adds a float observation to the vector observations of the agent.
/// Increases the size of the agents vector observation by 1.
/// </summary>

/// <param name="observation">Observation.</param>
protected void AddVectorObs(int observation)
{
info.vectorObservation.Add((float) observation);
info.vectorObservation.Add(observation);
}
/// <summary>

info.vectorObservation.Add(observation ? 1f : 0f);
}
protected void AddVectorObs(int observation, int range)
{
float[] oneHotVector = new float[range];
oneHotVector[observation] = 1;
info.vectorObservation.AddRange(oneHotVector);
}
/// <summary>
/// Sets the text observation.
/// </summary>

{
action.textActions = textActions;
}
/// <summary>
/// Updates the value of the agent.
/// </summary>
/// <param name="textActions">Text actions.</param>
public void UpdateValueAction(float value)
{
action.value = value;
}
protected float GetValueEstimate()
{
return action.value;
}
/// <summary>
/// Scales continous action from [-1, 1] to arbitrary range.

/// The agent must set maxStepReached.</param>
/// <param name="academyDone">If set to <c>true</c>
/// The agent must set done.</param>
/// <param name="academyStepCounter">Number of current steps in episode</param>
void SetStatus(bool academyMaxStep, bool academyDone, int academyStepCounter)
{
if (academyDone)
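AddVectorObs(int observation, int range) appends a one-hot encoding of an integer observation, and the SetActionMask overloads mark branch/action pairs the agent may not take at its next decision. The following is a small Python mimic of both ideas, standalone and not the C# API itself; all names in it are illustrative.

```python
import numpy as np

# Mimic of AddVectorObs(int observation, int range): append a one-hot vector.
def one_hot(observation: int, size: int) -> np.ndarray:
    vec = np.zeros(size, dtype=np.float32)
    vec[observation] = 1.0
    return vec

# Mimic of SetActionMask(branch, indices): True marks an action as forbidden
# for the next decision; the remaining indices per branch stay valid.
def masked_valid_actions(branch_sizes, masked_pairs):
    masks = [np.zeros(size, dtype=bool) for size in branch_sizes]
    for branch, index in masked_pairs:
        masks[branch][index] = True
    return [np.flatnonzero(~mask) for mask in masks]

print(one_hot(2, 4))                           # [0. 0. 1. 0.]
print(masked_valid_actions([3, 2], [(0, 1)]))  # [array([0, 2]), array([0, 1])]
```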

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/AgentActionProto.cs.meta


fileFormatVersion: 2
guid: 93eec67e32dc3484ca9b8e3ea98909c7
guid: 4482f127d4a874cf8a11da2b2cc27dc9
MonoImporter:
externalObjects: {}
serializedVersion: 2

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/AgentInfoProto.cs.meta


fileFormatVersion: 2
guid: 9a2cd47d5b7a84d45b66748c405edf5a
guid: 791522439b8324bff85f84309db90ecc
MonoImporter:
externalObjects: {}
serializedVersion: 2

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/BrainParametersProto.cs.meta


fileFormatVersion: 2
guid: 91e3353985a4c4c08a8004648a81de4f
guid: 7b41acc4d406e4a3c94df3399b2a6471
MonoImporter:
externalObjects: {}
serializedVersion: 2

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/BrainTypeProto.cs.meta


fileFormatVersion: 2
guid: d2e4f3cea300049b7a4cd65fbee2ee95
guid: 8a44faf5235584f06ae45eb976a247a9
MonoImporter:
externalObjects: {}
serializedVersion: 2

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/CommandProto.cs.meta


fileFormatVersion: 2
guid: 19e8be280f78249c188fde36f0855094
guid: 6b2ff9fe2c38b4e79aba78908cc5492c
MonoImporter:
externalObjects: {}
serializedVersion: 2

19
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EngineConfigurationProto.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/engine_configuration_proto.proto
// source: mlagents/envs/communicator_objects/engine_configuration_proto.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/engine_configuration_proto.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/engine_configuration_proto.proto</summary>
/// <summary>File descriptor for communicator_objects/engine_configuration_proto.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/engine_configuration_proto.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"CjVjb21tdW5pY2F0b3Jfb2JqZWN0cy9lbmdpbmVfY29uZmlndXJhdGlvbl9w",
"cm90by5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMilQEKGEVuZ2luZUNv",
"bmZpZ3VyYXRpb25Qcm90bxINCgV3aWR0aBgBIAEoBRIOCgZoZWlnaHQYAiAB",
"KAUSFQoNcXVhbGl0eV9sZXZlbBgDIAEoBRISCgp0aW1lX3NjYWxlGAQgASgC",
"EhkKEXRhcmdldF9mcmFtZV9yYXRlGAUgASgFEhQKDHNob3dfbW9uaXRvchgG",
"IAEoCEIfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
"CkNtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2VuZ2luZV9j",
"b25maWd1cmF0aW9uX3Byb3RvLnByb3RvEhRjb21tdW5pY2F0b3Jfb2JqZWN0",
"cyKVAQoYRW5naW5lQ29uZmlndXJhdGlvblByb3RvEg0KBXdpZHRoGAEgASgF",
"Eg4KBmhlaWdodBgCIAEoBRIVCg1xdWFsaXR5X2xldmVsGAMgASgFEhIKCnRp",
"bWVfc2NhbGUYBCABKAISGQoRdGFyZ2V0X2ZyYW1lX3JhdGUYBSABKAUSFAoM",
"c2hvd19tb25pdG9yGAYgASgIQh+qAhxNTEFnZW50cy5Db21tdW5pY2F0b3JP",
"YmplY3RzYgZwcm90bzM="));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EngineConfigurationProto.cs.meta


fileFormatVersion: 2
guid: fac934345fc664df8823b494ea9b1ca8
guid: 2cebeb1263d7846b4b3c7c6e5d5e193f
MonoImporter:
externalObjects: {}
serializedVersion: 2

21
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EnvironmentParametersProto.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/environment_parameters_proto.proto
// source: mlagents/envs/communicator_objects/environment_parameters_proto.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/environment_parameters_proto.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/environment_parameters_proto.proto</summary>
/// <summary>File descriptor for communicator_objects/environment_parameters_proto.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/environment_parameters_proto.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"Cjdjb21tdW5pY2F0b3Jfb2JqZWN0cy9lbnZpcm9ubWVudF9wYXJhbWV0ZXJz",
"X3Byb3RvLnByb3RvEhRjb21tdW5pY2F0b3Jfb2JqZWN0cyK1AQoaRW52aXJv",
"bm1lbnRQYXJhbWV0ZXJzUHJvdG8SXwoQZmxvYXRfcGFyYW1ldGVycxgBIAMo",
"CzJFLmNvbW11bmljYXRvcl9vYmplY3RzLkVudmlyb25tZW50UGFyYW1ldGVy",
"c1Byb3RvLkZsb2F0UGFyYW1ldGVyc0VudHJ5GjYKFEZsb2F0UGFyYW1ldGVy",
"c0VudHJ5EgsKA2tleRgBIAEoCRINCgV2YWx1ZRgCIAEoAjoCOAFCH6oCHE1M",
"QWdlbnRzLkNvbW11bmljYXRvck9iamVjdHNiBnByb3RvMw=="));
"CkVtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2Vudmlyb25t",
"ZW50X3BhcmFtZXRlcnNfcHJvdG8ucHJvdG8SFGNvbW11bmljYXRvcl9vYmpl",
"Y3RzIrUBChpFbnZpcm9ubWVudFBhcmFtZXRlcnNQcm90bxJfChBmbG9hdF9w",
"YXJhbWV0ZXJzGAEgAygLMkUuY29tbXVuaWNhdG9yX29iamVjdHMuRW52aXJv",
"bm1lbnRQYXJhbWV0ZXJzUHJvdG8uRmxvYXRQYXJhbWV0ZXJzRW50cnkaNgoU",
"RmxvYXRQYXJhbWV0ZXJzRW50cnkSCwoDa2V5GAEgASgJEg0KBXZhbHVlGAIg",
"ASgCOgI4AUIfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJv",
"dG8z"));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/EnvironmentParametersProto.cs.meta


fileFormatVersion: 2
guid: 312dc062dfab44416a31b8b273cda29a
guid: be8c5f75bdcff41488a8e85748541100
MonoImporter:
externalObjects: {}
serializedVersion: 2

14
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/Header.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/header.proto
// source: mlagents/envs/communicator_objects/header.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/header.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/header.proto</summary>
/// <summary>File descriptor for communicator_objects/header.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/header.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"CiFjb21tdW5pY2F0b3Jfb2JqZWN0cy9oZWFkZXIucHJvdG8SFGNvbW11bmlj",
"YXRvcl9vYmplY3RzIikKBkhlYWRlchIOCgZzdGF0dXMYASABKAUSDwoHbWVz",
"c2FnZRgCIAEoCUIfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IG",
"cHJvdG8z"));
"Ci9tbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2hlYWRlci5w",
"cm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMiKQoGSGVhZGVyEg4KBnN0YXR1",
"cxgBIAEoBRIPCgdtZXNzYWdlGAIgASgJQh+qAhxNTEFnZW50cy5Db21tdW5p",
"Y2F0b3JPYmplY3RzYgZwcm90bzM="));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/Header.cs.meta


fileFormatVersion: 2
guid: e582b089dfedc438d9cbce9d4017b807
guid: 8bb8aabfab48b408381733bccccd5af9
MonoImporter:
externalObjects: {}
serializedVersion: 2

15
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/ResolutionProto.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/resolution_proto.proto
// source: mlagents/envs/communicator_objects/resolution_proto.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/resolution_proto.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/resolution_proto.proto</summary>
/// <summary>File descriptor for communicator_objects/resolution_proto.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/resolution_proto.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"Citjb21tdW5pY2F0b3Jfb2JqZWN0cy9yZXNvbHV0aW9uX3Byb3RvLnByb3Rv",
"EhRjb21tdW5pY2F0b3Jfb2JqZWN0cyJECg9SZXNvbHV0aW9uUHJvdG8SDQoF",
"d2lkdGgYASABKAUSDgoGaGVpZ2h0GAIgASgFEhIKCmdyYXlfc2NhbGUYAyAB",
"KAhCH6oCHE1MQWdlbnRzLkNvbW11bmljYXRvck9iamVjdHNiBnByb3RvMw=="));
"CjltbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3Jlc29sdXRp",
"b25fcHJvdG8ucHJvdG8SFGNvbW11bmljYXRvcl9vYmplY3RzIkQKD1Jlc29s",
"dXRpb25Qcm90bxINCgV3aWR0aBgBIAEoBRIOCgZoZWlnaHQYAiABKAUSEgoK",
"Z3JheV9zY2FsZRgDIAEoCEIfqgIcTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2Jq",
"ZWN0c2IGcHJvdG8z"));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/ResolutionProto.cs.meta


fileFormatVersion: 2
guid: ca2454611610e4136a412b5cd6afee4d
guid: eae234f817240444a9d18b3d7366f260
MonoImporter:
externalObjects: {}
serializedVersion: 2

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/SpaceTypeProto.cs.meta


fileFormatVersion: 2
guid: bf7e44e20999448ef846526541819077
guid: 3e61637749b07412284363ff304da763
MonoImporter:
externalObjects: {}
serializedVersion: 2

27
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityInput.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/unity_input.proto
// source: mlagents/envs/communicator_objects/unity_input.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/unity_input.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/unity_input.proto</summary>
/// <summary>File descriptor for communicator_objects/unity_input.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/unity_input.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"CiZjb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9pbnB1dC5wcm90bxIUY29t",
"bXVuaWNhdG9yX29iamVjdHMaKWNvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5",
"X3JsX2lucHV0LnByb3RvGjhjb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9y",
"bF9pbml0aWFsaXphdGlvbl9pbnB1dC5wcm90byKVAQoKVW5pdHlJbnB1dBI0",
"CghybF9pbnB1dBgBIAEoCzIiLmNvbW11bmljYXRvcl9vYmplY3RzLlVuaXR5",
"UkxJbnB1dBJRChdybF9pbml0aWFsaXphdGlvbl9pbnB1dBgCIAEoCzIwLmNv",
"bW11bmljYXRvcl9vYmplY3RzLlVuaXR5UkxJbml0aWFsaXphdGlvbklucHV0",
"Qh+qAhxNTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
"CjRtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X2lu",
"cHV0LnByb3RvEhRjb21tdW5pY2F0b3Jfb2JqZWN0cxo3bWxhZ2VudHMvZW52",
"cy9jb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9ybF9pbnB1dC5wcm90bxpG",
"bWxhZ2VudHMvZW52cy9jb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9ybF9p",
"bml0aWFsaXphdGlvbl9pbnB1dC5wcm90byKVAQoKVW5pdHlJbnB1dBI0Cghy",
"bF9pbnB1dBgBIAEoCzIiLmNvbW11bmljYXRvcl9vYmplY3RzLlVuaXR5UkxJ",
"bnB1dBJRChdybF9pbml0aWFsaXphdGlvbl9pbnB1dBgCIAEoCzIwLmNvbW11",
"bmljYXRvcl9vYmplY3RzLlVuaXR5UkxJbml0aWFsaXphdGlvbklucHV0Qh+q",
"AhxNTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.UnityRlInputReflection.Descriptor, global::MLAgents.CommunicatorObjects.UnityRlInitializationInputReflection.Descriptor, },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public UnityInput(UnityInput other) : this() {
RlInput = other.rlInput_ != null ? other.RlInput.Clone() : null;
RlInitializationInput = other.rlInitializationInput_ != null ? other.RlInitializationInput.Clone() : null;
rlInput_ = other.rlInput_ != null ? other.rlInput_.Clone() : null;
rlInitializationInput_ = other.rlInitializationInput_ != null ? other.rlInitializationInput_.Clone() : null;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityInput.cs.meta


fileFormatVersion: 2
guid: c97e6e2cde58d404cba31008c0489454
guid: 25e46cd9eca204e19a08fa938802ef9d
MonoImporter:
externalObjects: {}
serializedVersion: 2

32
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityMessage.cs


// <auto-generated>
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: communicator_objects/unity_message.proto
// source: mlagents/envs/communicator_objects/unity_message.proto
// </auto-generated>
#pragma warning disable 1591, 0612, 3021
#region Designer generated code

using scg = global::System.Collections.Generic;
namespace MLAgents.CommunicatorObjects {
/// <summary>Holder for reflection information generated from communicator_objects/unity_message.proto</summary>
/// <summary>Holder for reflection information generated from mlagents/envs/communicator_objects/unity_message.proto</summary>
/// <summary>File descriptor for communicator_objects/unity_message.proto</summary>
/// <summary>File descriptor for mlagents/envs/communicator_objects/unity_message.proto</summary>
public static pbr::FileDescriptor Descriptor {
get { return descriptor; }
}

byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"Cihjb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9tZXNzYWdlLnByb3RvEhRj",
"b21tdW5pY2F0b3Jfb2JqZWN0cxonY29tbXVuaWNhdG9yX29iamVjdHMvdW5p",
"dHlfb3V0cHV0LnByb3RvGiZjb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9p",
"bnB1dC5wcm90bxohY29tbXVuaWNhdG9yX29iamVjdHMvaGVhZGVyLnByb3Rv",
"IqwBCgxVbml0eU1lc3NhZ2USLAoGaGVhZGVyGAEgASgLMhwuY29tbXVuaWNh",
"dG9yX29iamVjdHMuSGVhZGVyEjcKDHVuaXR5X291dHB1dBgCIAEoCzIhLmNv",
"bW11bmljYXRvcl9vYmplY3RzLlVuaXR5T3V0cHV0EjUKC3VuaXR5X2lucHV0",
"GAMgASgLMiAuY29tbXVuaWNhdG9yX29iamVjdHMuVW5pdHlJbnB1dEIfqgIc",
"TUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
"CjZtbGFnZW50cy9lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X21l",
"c3NhZ2UucHJvdG8SFGNvbW11bmljYXRvcl9vYmplY3RzGjVtbGFnZW50cy9l",
"bnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3VuaXR5X291dHB1dC5wcm90bxo0",
"bWxhZ2VudHMvZW52cy9jb21tdW5pY2F0b3Jfb2JqZWN0cy91bml0eV9pbnB1",
"dC5wcm90bxovbWxhZ2VudHMvZW52cy9jb21tdW5pY2F0b3Jfb2JqZWN0cy9o",
"ZWFkZXIucHJvdG8irAEKDFVuaXR5TWVzc2FnZRIsCgZoZWFkZXIYASABKAsy",
"HC5jb21tdW5pY2F0b3Jfb2JqZWN0cy5IZWFkZXISNwoMdW5pdHlfb3V0cHV0",
"GAIgASgLMiEuY29tbXVuaWNhdG9yX29iamVjdHMuVW5pdHlPdXRwdXQSNQoL",
"dW5pdHlfaW5wdXQYAyABKAsyIC5jb21tdW5pY2F0b3Jfb2JqZWN0cy5Vbml0",
"eUlucHV0Qh+qAhxNTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3RzYgZwcm90",
"bzM="));
descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData,
new pbr::FileDescriptor[] { global::MLAgents.CommunicatorObjects.UnityOutputReflection.Descriptor, global::MLAgents.CommunicatorObjects.UnityInputReflection.Descriptor, global::MLAgents.CommunicatorObjects.HeaderReflection.Descriptor, },
new pbr::GeneratedClrTypeInfo(null, new pbr::GeneratedClrTypeInfo[] {

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public UnityMessage(UnityMessage other) : this() {
Header = other.header_ != null ? other.Header.Clone() : null;
UnityOutput = other.unityOutput_ != null ? other.UnityOutput.Clone() : null;
UnityInput = other.unityInput_ != null ? other.UnityInput.Clone() : null;
header_ = other.header_ != null ? other.header_.Clone() : null;
unityOutput_ = other.unityOutput_ != null ? other.unityOutput_.Clone() : null;
unityInput_ = other.unityInput_ != null ? other.unityInput_.Clone() : null;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

2
UnitySDK/Assets/ML-Agents/Scripts/CommunicatorObjects/UnityMessage.cs.meta


fileFormatVersion: 2
guid: 10dca984632854b079476d5fb6df329c
guid: d270bf9ce3d564bb48b2095802c15ff9
MonoImporter:
externalObjects: {}
serializedVersion: 2

Some files were not shown because too many files changed in this diff.
