
0.2 Update

* added broadcast to the Player and Heuristic brains.
Allows the Python API to record actions taken along with the states and rewards

* removed the broadcast checkbox
Added a Handshake method for the communicator
The academy will try to handshake regardless of the brains present
Player and Heuristic brains will send their information through the communicator but will not receive commands

* bug fix : The environment only requests actions from external brains when unique

* added warning in case no brains are set to external

* fix on the instantiation of coreBrains,
fix on the conversion of actions to arrays in the BrainInfo received from step

* default discrete action is now 0
bug fix for discrete broadcast action (the action size should be one in Agents.cs)
modified Tennis so that the default action is no action
modified the TemplateDecision.cs to ensure non-null values are sent from Decide() and MakeMemory()

* minor fixes

* need to convert the s...
/develop-generalizationTraining-TrainerController
Arthur Juliani 7 years ago
Current commit
51f23cd2
199 files changed, with 10,977 additions and 1,264 deletions
  1. 5
      .gitignore
  2. 85
      docs/Example-Environments.md
  3. 4
      docs/Getting-Started-with-Balance-Ball.md
  4. 40
      docs/Making-a-new-Unity-Environment.md
  5. 25
      docs/Readme.md
  6. 14
      docs/Using-TensorFlow-Sharp-in-Unity-(Experimental).md
  7. 45
      docs/best-practices-ppo.md
  8. 11
      docs/best-practices.md
  9. 56
      python/PPO.ipynb
  10. 89
      python/ppo.py
  11. 134
      python/ppo/models.py
  12. 85
      python/ppo/trainer.py
  13. 2
      python/setup.py
  14. 200
      python/test_unityagents.py
  15. 1
      python/unityagents/__init__.py
  16. 3
      python/unityagents/brain.py
  17. 247
      python/unityagents/environment.py
  18. 31
      python/unityagents/exception.py
  19. 29
      unity-environment/Assets/ML-Agents/Examples/3DBall/Prefabs/Game.prefab
  20. 183
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scene.unity
  21. 12
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
  22. 96
      unity-environment/Assets/ML-Agents/Examples/Basic/Scripts/BasicAgent.cs
  23. 21
      unity-environment/Assets/ML-Agents/Examples/Basic/Scripts/BasicDecision.cs
  24. 100
      unity-environment/Assets/ML-Agents/Examples/GridWorld/GridWorld.unity
  25. 1
      unity-environment/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs
  26. 13
      unity-environment/Assets/ML-Agents/Examples/Tennis/Materials/ballMat.physicMaterial
  27. 2
      unity-environment/Assets/ML-Agents/Examples/Tennis/Materials/racketMat.physicMaterial
  28. 16
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAcademy.cs
  29. 62
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
  30. 34
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/hitWall.cs
  31. 256
      unity-environment/Assets/ML-Agents/Examples/Tennis/TFModels/Tennis.bytes
  32. 929
      unity-environment/Assets/ML-Agents/Examples/Tennis/Tennis.unity
  33. 49
      unity-environment/Assets/ML-Agents/Scripts/Academy.cs
  34. 31
      unity-environment/Assets/ML-Agents/Scripts/Agent.cs
  35. 36
      unity-environment/Assets/ML-Agents/Scripts/Brain.cs
  36. 13
      unity-environment/Assets/ML-Agents/Scripts/Communicator.cs
  37. 32
      unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs
  38. 22
      unity-environment/Assets/ML-Agents/Scripts/CoreBrainHeuristic.cs
  39. 56
      unity-environment/Assets/ML-Agents/Scripts/CoreBrainInternal.cs
  40. 28
      unity-environment/Assets/ML-Agents/Scripts/CoreBrainPlayer.cs
  41. 129
      unity-environment/Assets/ML-Agents/Scripts/ExternalCommunicator.cs
  42. 19
      unity-environment/Assets/ML-Agents/Template/Scripts/TemplateDecision.cs
  43. 8
      unity-environment/ProjectSettings/TagManager.asset
  44. 18
      unity-environment/README.md
  45. 12
      docs/broadcast.md
  46. 87
      docs/curriculum.md
  47. 18
      docs/monitor.md
  48. 213
      images/broadcast.png
  49. 1001
      images/crawler.png
  50. 488
      images/curriculum.png
  51. 260
      images/curriculum_progress.png
  52. 173
      images/math.png
  53. 563
      images/monitor.png
  54. 495
      images/push.png
  55. 1001
      images/reacher.png
  56. 695
      images/wall.png
  57. 81
      python/unityagents/curriculum.py
  58. 9
      unity-environment/Assets/ML-Agents/Examples/Area.meta
  59. 9
      unity-environment/Assets/ML-Agents/Examples/Crawler.meta
  60. 9
      unity-environment/Assets/ML-Agents/Examples/Reacher.meta
  61. 10
      unity-environment/Assets/ML-Agents/Examples/Tennis/Prefabs.meta
  62. 40
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisArea.cs
  63. 13
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisArea.cs.meta
  64. 380
      unity-environment/Assets/ML-Agents/Scripts/Monitor.cs
  65. 12
      unity-environment/Assets/ML-Agents/Scripts/Monitor.cs.meta
  66. 12
      python/curricula/push.json
  67. 12
      python/curricula/test.json
  68. 11
      python/curricula/wall.json
  69. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Materials.meta
  70. 76
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/agent.mat
  71. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/agent.mat.meta
  72. 76
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/block.mat
  73. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/block.mat.meta
  74. 76
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/goal.mat
  75. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/goal.mat.meta
  76. 77
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/wall.mat
  77. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Materials/wall.mat.meta
  78. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs.meta
  79. 224
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/Agent.prefab
  80. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/Agent.prefab.meta
  81. 111
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/Block.prefab
  82. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/Block.prefab.meta
  83. 190
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/GoalHolder.prefab
  84. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/GoalHolder.prefab.meta
  85. 641
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/PushArea.prefab
  86. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/PushArea.prefab.meta
  87. 757
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/WallArea.prefab
  88. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/WallArea.prefab.meta
  89. 1001
      unity-environment/Assets/ML-Agents/Examples/Area/Push.unity
  90. 8
      unity-environment/Assets/ML-Agents/Examples/Area/Push.unity.meta
  91. 9
      unity-environment/Assets/ML-Agents/Examples/Area/Scripts.meta
  92. 20
      unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Area.cs

5
.gitignore


/unity-environment/[Oo]bj/
/unity-environment/[Bb]uild/
/unity-environment/[Bb]uilds/
/unity-environment/[Pp]ackages/
/unity-environment/[Uu]nity[Pp]ackage[Mm]anager/
# Environment logfile
*unity-environment.log
# Visual Studio 2015 cache directory
/unity-environment/.vs/

85
docs/Example-Environments.md


# Example Learning Environments
### About Example Environments
Unity ML Agents currently contains three example environments which demonstrate various features of the platform. In the coming months more will be added. We are also actively open to adding community contributed environments as examples, as long as they are small, simple, demonstrate a unique feature of the platform, and provide a unique non-trivial challenge to modern RL algorithms. Feel free to submit these environments with a Pull-Request explaining the nature of the environment and task.
Unity ML Agents contains a set of example environments which demonstrate various features of the platform. In the coming months more will be added. We are also actively open to adding community contributed environments as examples, as long as they are small, simple, demonstrate a unique feature of the platform, and provide a unique non-trivial challenge to modern RL algorithms. Feel free to submit these environments with a Pull-Request explaining the nature of the environment and task.
## Basic
* Set-up: A linear movement task where the agent must move left or right to rewarding states.
* Goal: Move to the most rewarding state.
* Agents: The environment contains one agent linked to a single brain.
* Agent Reward Function:
* +0.1 for arriving at suboptimal state.
* +1.0 for arriving at optimal state.
* Brains: One brain with the following state/action space.
* State space: (Discrete) One variable corresponding to current state.
* Action space: (Discrete) Two possible actions (Move left, move right).
* Observations: 0
* Reset Parameters: None
## 3DBall

* Observations: None
* Reset Parameters: One, corresponding to size of ball.
## Area
### Push Area
![Push](../images/push.png)
* Set-up: A platforming environment where the agent can push a block around.
* Goal: The agent must push the block to the goal.
* Agents: The environment contains one agent linked to a single brain.
* Agent Reward Function:
* -0.01 for every step.
* +1.0 if the block touches the goal.
* -1.0 if the agent falls off the platform.
* Brains: One brain with the following state/action space.
* State space: (Continuous) 15 variables corresponding to position and velocities of agent, block, and goal.
* Action space: (Discrete) Size of 6, corresponding to movement in cardinal directions, jumping, and no movement.
* Observations: None.
* Reset Parameters: One, corresponding to number of steps in training. Used to adjust size of elements for Curriculum Learning.
### Wall Area
![Wall](../images/wall.png)
* Set-up: A platforming environment where the agent can jump over a wall.
* Goal: The agent must use the block to scale the wall and reach the goal.
* Agents: The environment contains one agent linked to a single brain.
* Agent Reward Function:
* -0.01 for every step.
* +1.0 if the agent touches the goal.
* -1.0 if the agent falls off the platform.
* Brains: One brain with the following state/action space.
* State space: (Continuous) 16 variables corresponding to position and velocities of agent, block, and goal, plus the height of the wall.
* Action space: (Discrete) Size of 6, corresponding to movement in cardinal directions, jumping, and no movement.
* Observations: None.
* Reset Parameters: One, corresponding to number of steps in training. Used to adjust size of the wall for Curriculum Learning.
## Reacher
![Reacher](../images/reacher.png)
* Set-up: Double-jointed arm which can move to target locations.
* Goal: The agent must move its hand to the goal location, and keep it there.
* Agents: The environment contains 32 agents linked to a single brain.
* Agent Reward Function (independent):
* +0.1 Each step agent's hand is in goal location.
* Brains: One brain with the following state/action space.
* State space: (Continuous) 26 variables corresponding to position, rotation, velocity, and angular velocities of the two arm rigidbodies.
* Action space: (Continuous) Size of 4, corresponding to torque applicable to two joints.
* Observations: None
* Reset Parameters: Two, corresponding to goal size, and goal movement speed.
## Crawler
![Crawler](../images/crawler.png)
* Set-up: A creature with 4 arms and 4 forearms.
* Goal: The agents must move their bodies along the x axis without falling.
* Agents: The environment contains 3 agents linked to a single brain.
* Agent Reward Function (independent):
* +1 times velocity in the x direction
* -1 for falling.
* -0.01 times the action squared
* -0.05 times y position change
* -0.05 times velocity in the z direction
* Brains: One brain with the following state/action space.
* State space: (Continuous) 117 variables corresponding to position, rotation, velocity, and angular velocities of each limb plus the acceleration and angular acceleration of the body.
* Action space: (Continuous) Size of 12, corresponding to torque applicable to 12 joints.
* Observations: None
* Reset Parameters: None

4
docs/Getting-Started-with-Balance-Ball.md


Because TensorFlowSharp support is still experimental, it is disabled by default. In order to enable it, you must follow these steps. Please note that the `Internal` Brain mode will only be available once completing these steps.
1. Make sure you are using Unity 2017.1 or newer.
2. Make sure the TensorFlowSharp plugin is in your `Assets` folder. A Plugins folder which includes TF# can be downloaded [here](https://s3.amazonaws.com/unity-agents/TFSharpPlugin.unitypackage). Double click and import it once downloaded.
2. Make sure the TensorFlowSharp plugin is in your `Assets` folder. A Plugins folder which includes TF# can be downloaded [here](https://s3.amazonaws.com/unity-agents/0.2/TFSharpPlugin.unitypackage). Double click and import it once downloaded.
4. For each of the platforms you target (**`PC, Mac and Linux Standalone`**, **`iOS`** or **`Android`**):
1. Go into `Other Settings`.
2. Select `Scripting Runtime Version` to `Experimental (.NET 4.6 Equivalent)`
3. In `Scripting Defined Symbols`, add the flag `ENABLE_TENSORFLOW`

40
docs/Making-a-new-Unity-Environment.md


## Setting up the Unity Project
1. Open an existing Unity project, or create a new one and import the RL interface package:
* [ML-Agents package without TensorflowSharp](https://s3.amazonaws.com/unity-agents/ML-AgentsNoPlugin.unitypackage)
* [ML-Agents package with TensorflowSharp](https://s3.amazonaws.com/unity-agents/ML-AgentsWithPlugin.unitypackage)
1. Open an existing Unity project, or create a new one and import the RL interface package:
* [ML-Agents package without TensorflowSharp](https://s3.amazonaws.com/unity-agents/0.2/ML-AgentsNoPlugin.unitypackage)
* [ML-Agents package with TensorflowSharp](https://s3.amazonaws.com/unity-agents/0.2/ML-AgentsWithPlugin.unitypackage)
2. Rename `TemplateAcademy.cs` (and the contained class name) to the desired name of your new academy class. All Template files are in the folder `Assets -> Template -> Scripts`. Typical naming convention is `YourNameAcademy`.

6. If you will be using Tensorflow Sharp in Unity, you must:
1. Make sure you are using Unity 2017.1 or newer.
2. Make sure the TensorflowSharp plugin is in your Asset folder. It can be downloaded [here](https://s3.amazonaws.com/unity-agents/TFSharpPlugin.unitypackage).
2. Make sure the TensorflowSharp [plugin](https://s3.amazonaws.com/unity-agents/0.2/TFSharpPlugin.unitypackage) is in your Asset folder.
4. For each of the platforms you target (**`PC, Mac and Linux Standalone`**, **`iOS`** or **`Android`**):
2. Select `Scripting Runtime Version` to `Experimental (.NET 4.6 Equivalent)`
3. In `Scripting Defined Symbols`, add the flag `ENABLE_TENSORFLOW`
5. Note that some of these changes will require a Unity Restart

* `Target Frame Rate` Frequency of frame rendering. If environment utilizes observations, increase this during training, and set to `60` during inference. If no observations are used, this can be set to `1` during training.
* **`Default Reset Parameters`** You can set the default configuration to be passed at reset. This will be a mapping from strings to float values that you can call in the academy with `resetParameters["YourDefaultParameter"]`
3. Within **`InitializeAcademy()`**, you can define the initialization of the Academy. Note that this command is run only once at the beginning of the training session.
3. Within **`InitializeAcademy()`**, you can define the initialization of the Academy. Note that this command is run only once at the beginning of the training session. Do **not** use `Awake()`, `Start()` or `OnEnable()`
3. Within **`AcademyStep()`**, you can define the environment logic each step. Use this function to modify the environment for the agents that will live in it.
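As a rough sketch of the two methods above (assuming a Template-derived Academy class and a default reset parameter named `YourDefaultParameter`, both placeholders):
```csharp
using UnityEngine;

public class YourNameAcademy : Academy
{
    public override void InitializeAcademy()
    {
        // One-time setup for the training session; use this instead of Awake(), Start() or OnEnable().
    }

    public override void AcademyStep()
    {
        // Per-step environment logic, e.g. reading one of the Default Reset Parameters.
        float myParameter = resetParameters["YourDefaultParameter"];
    }
}
```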

For each Brain game object in your academy :
2. In the inspector tab, you can modify the characteristics of the brain in **`Brain Parameters`**
* `State Size` Number of variables within the state provided to the agent(s).
* `Action Size` The number of possible actions for each individual agent to take.
* `Memory Size` The number of floats the agents will remember each step.

* `Heuristic` : You can have your brain automatically react to the observations and states in a customizable way. You will need to drag a `Decision` script into `YourNameBrain`. To create a custom reaction, you must :
* Rename `TemplateDecision.cs` (and the contained class name) to the desired name of your new reaction. Typical naming convention is `YourNameDecision`.
* Implement `Decide`: Given the state, observation and memory of an agent, this function must return an array of floats corresponding to the actions taken by the agent. If the action space type is discrete, the array must be of size 1. (See the sketch after this list.)
* Optionally, implement `MakeMemory`: Given the state, observation and memory of an agent, this function must return an array of floats corresponding to the new memories of the agent.
* `Internal` : Note that you must have Tensorflow Sharp setup (see top of this page). Here are the fields that must be completed:
* `Graph Model` : This must be the `bytes` file corresponding to the pretrained Tensorflow graph. (You must first drag this file into your Resources folder and then from the Resources folder into the inspector)
* `Graph Scope` : If you set a scope while training your tensorflow model, all your placeholder names will have a prefix. You must specify that prefix here.

* `Name` : Corresponds to the name of the placeholder.
* `Value Type` : Either Integer or Floating Point.
* `Min Value` and `Max Value` : Specify the minimum and maximum values (inclusive) the placeholder can take. The value will be sampled from the uniform distribution at each step. If you want this value to be fixed, set both `Min Value` and `Max Value` to the same number.
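For the `Heuristic` case above, a minimal sketch of such a Decision script is shown below. The parameter list mirrors what the Template scripts pass in (state, observations, reward, done flag and memory) and is an assumption here; the important detail is that both methods return non-null arrays, and that a discrete action space expects an array of size 1.
```csharp
using System.Collections.Generic;
using UnityEngine;

// Sketch of a custom heuristic Decision (signature assumed to follow TemplateDecision.cs).
public class YourNameDecision : MonoBehaviour, Decision
{
    public float[] Decide(List<float> state, List<Camera> observation,
                          float reward, bool done, float[] memory)
    {
        // Discrete action space: return a single action index; 0 is the default "no action".
        return new float[1] { 0f };
    }

    public float[] MakeMemory(List<float> state, List<Camera> observation,
                              float reward, bool done, float[] memory)
    {
        // Return an empty (but non-null) array if the agent does not use memory.
        return new float[0];
    }
}
```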
## Implementing `YourNameAgent`
1. Rename `TemplateAgent.cs` (and the contained class name) to the desired name of your new agent. Typical naming convention is `YourNameAgent`.

5. If `Reset On Done` is checked, `Reset()` will be called when the agent is done. Else, `AgentOnDone()` will be called. Note that if `Reset On Done` is unchecked, the agent will remain "done" until the Academy resets. This means that it will not take actions in the environment.
6. Implement the following functions in `YourNameAgent.cs` :
* `InitializeAgent()` : Use this method to initialize your agent. This method is called then the agent is created.
* `InitializeAgent()` : Use this method to initialize your agent. This method is called when the agent is created. Do **not** use `Awake()`, `Start()` or `OnEnable()`.
* `AgentStep()` : This function will be called every frame; you must define what your agent will do given the input actions. You must also specify the rewards and whether or not the agent is done. To do so, modify the public fields of the agent `reward` and `done`.
* `AgentReset()` : This function is called at start, when the Academy resets and when the agent is done (if `Reset On Done` is checked).
* `AgentOnDone()` : If `Reset On Done` is not checked, this function will be called when the agent is done. `Reset()` will only be called when the Academy resets.

Small negative rewards are also typically used each step in scenarios where the optimal agent behavior is to complete an episode as quickly as possible.
Note that the reward is reset to 0 at every step, so you must add to the reward (`reward += rewardIncrement`). If you use `skipFrame` in the Academy and set your rewards instead of incrementing them, you might lose information since the reward is sent at every step, not at every frame.
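For example, a minimal `AgentStep()` using this incrementing pattern might look like the following (the goal check is a hypothetical helper for your own environment):
```csharp
public override void AgentStep(float[] act)
{
    // Small per-step penalty encourages the agent to finish quickly.
    reward += -0.01f;

    if (ReachedGoal())   // hypothetical helper specific to your environment
    {
        reward += 1.0f;  // always increment the reward, never overwrite it
        done = true;
    }
}
```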
## Agent Monitor
* You can add the script `AgentMonitor.cs` to any gameObject with a component `YourNameAgent.cs`. In the inspector of this component, you will see:
* `Fixed Position` : If this box is checked, the monitor will be in the left corner of the screen and will remain there. Note that you can only have one agent with a fixed monitor, or multiple monitors will overlap.
* `Vertical Offset`: If `Fixed Position` is unchecked, the monitor will follow the Agent on the screen. Use `Vertical Offset` to decide how far above the agent the monitor should be.
* `Display Brain Name` : If this box is checked, the name of the brain will appear in the monitor. (Can be useful if you have similar agents using different brains).
* `Display Brain Type` : If this box is checked, the type of the brain of the agent will be displayed.
* `Display FrameCount` : If this box is checked, the number of frames that elapsed since the agent was reset will be displayed.
* `Display Current Reward`: If this box is checked, the current reward of the agent will be displayed.
* `Display Max Reward` : If this box is checked, the maximum reward obtained during this training session will be displayed.
* `Display State` : If this box is checked, the current state of the agent will be displayed.
* `Display Action` : If this box is checked, the current action the agent performs will be displayed.
If you passed a `value` from an external brain, the value will be displayed as a bar (green if value is positive / red if value is negative) above the monitor. The bar's maximum value is set to 1 by default but if the value of the agent is above this number, it becomes the new maximum.

25
docs/Readme.md


# Unity ML Agents Documentation
## Basic
## About
* [Example Environments](Example-Environments.md)
## Tutorials
* [Example Environments](Example-Environments.md)
* [Making a new Unity Environment](Making-a-new-Unity-Environment.md)
* [How to use the Python API](Unity-Agents---Python-API.md)
## Advanced
* [How to make a new Unity Environment](Making-a-new-Unity-Environment.md)
* [Best practices when designing an Environment](best-practices.md)
* [Best practices when training using PPO](best-practices-ppo.md)
* [How to organize the Scene](Organizing-the-Scene.md)
* [How to use the Python API](Unity-Agents---Python-API.md)
* [How to use TensorflowSharp inside Unity [Experimental]](Using-TensorFlow-Sharp-in-Unity-(Experimental).md)
## Features
* [Scene Organization](Organizing-the-Scene.md)
* [Curriculum Learning](curriculum.md)
* [Broadcast](broadcast.md)
* [Monitor](monitor.md)
* [TensorflowSharp in Unity [Experimental]](Using-TensorFlow-Sharp-in-Unity-(Experimental).md)
## Best Practices
* [Best practices when creating an Environment](best-practices.md)
* [Best practices when training using PPO](best-practices-ppo.md)
## Help
* [Limitations & Common Issues](Limitations-&-Common-Issues.md)

14
docs/Using-TensorFlow-Sharp-in-Unity-(Experimental).md


## Requirements
* Unity 2017.1 or above
* Unity Tensorflow Plugin ([Download here](https://s3.amazonaws.com/unity-agents/TFSharpPlugin.unitypackage))
* Unity Tensorflow Plugin ([Download here](https://s3.amazonaws.com/unity-agents/0.2/TFSharpPlugin.unitypackage))
In order to bring a fully trained agent back into Unity, you will need to make sure the nodes of your graph have appropriate names. You can give names to nodes in Tensorflow :
```python
variable = tf.identity(variable, name="variable_name")
```

Go to `Edit` -> `Player Settings` and add `ENABLE_TENSORFLOW` to the `Scripting Define Symbols` for each type of device you want to use (**`PC, Mac and Linux Standalone`**, **`iOS`** or **`Android`**).
Set the Brain you used for training to `Internal`. Drag `your_name_graph.bytes` into Unity and then drag it into the `Graph Model` field in the Brain. If you used a scope when training your graph, specify it in the `Graph Scope` field. Specify the names of the nodes you used in your graph. If you followed these instructions, the agents in your environment that use this brain will use your fully trained network to make decisions.
* Once you build for iOS in the editor, Xcode will launch.
* In `General` -> `Linked Frameworks and Libraries`:
* Add a framework called `Framework.accelerate`
* Remove the library `libtensorflow-core.a`

* Drag the library `libtensorflow-core.a` from the `Project Navigator` on the left under `Libraries/ML-Agents/Plugins/iOS` into the flag list.
# Using TensorflowSharp without ML-Agents
Beyond controlling an in-game agent, you may desire to use TensorFlowSharp for more general computation. The below instructions describe how to generally embed Tensorflow models without using the ML-Agents framework.

Put the file `your_name_graph.bytes` into Resources.
In your C# script :
At the top, add the line
```csharp
using TensorFlow;
```

TensorFlowSharp.Android.NativeBinding.Init();
#endif
```
Put your graph as a text asset in the variable `graphModel`. You can do so in the inspector by making `graphModel` a public variable and dragging your asset into the inspector, or load it from the Resources folder :
```csharp
TextAsset graphModel = Resources.Load("your_name_graph") as TextAsset;
```
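From here, a rough sketch of importing and evaluating the graph with TensorFlowSharp might look like the following (it assumes the `using` directive added above). The node names (`state`, `action`) and the input shape are assumptions; they must match whatever names and shapes your trained graph actually uses.
```csharp
TFGraph graph = new TFGraph();
graph.Import(graphModel.bytes);

using (TFSession session = new TFSession(graph))
{
    float[,] state = new float[1, 8];            // illustrative: one agent, 8 state variables
    var runner = session.GetRunner();
    runner.AddInput(graph["state"][0], state);   // the array is converted to an input tensor
    runner.Fetch(graph["action"][0]);

    TFTensor[] results = runner.Run();
    float[,] action = (float[,])results[0].GetValue();
}
```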

45
docs/best-practices-ppo.md


### Batch Size
`batch_size` corresponds to how many experiences are used for each gradient descent update. This should always be a fraction
of the `buffer_size`. If you are using a continuous action space, this value should be large. If you are using a discrete action space, this value should be smaller.
of the `buffer_size`. If you are using a continuous action space, this value should be large (in 1000s). If you are using a discrete action space, this value should be smaller (in 10s).
Typical Range (Continuous): `512` - `5120`

### Beta
### Beta (Used only in Discrete Control)
`beta` corresponds to the strength of the entropy regularization. This ensures that discrete action space agents properly
explore during training. Increasing this will ensure more random actions are taken. This should be adjusted such that
the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly,
increase `beta`. If entropy drops too slowly, decrease `beta`.
`beta` corresponds to the strength of the entropy regularization, which makes the policy "more random." This ensures that discrete action space agents properly explore during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase `beta`. If entropy drops too slowly, decrease `beta`.
Typical Range: `1e-4` - `1e-2`

This should be a multiple of `batch_size`.
This should be a multiple of `batch_size`. Typically larger buffer sizes correspond to more stable training updates.
`epsilon` corresponds to the acceptable threshold between the old and new policies during gradient descent updating.
`epsilon` corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process.
Typical Range: `0.1` - `0.3`
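For reference, `epsilon` is the clipping parameter in PPO's standard clipped surrogate objective (this matches the `clip_by_value` call in `models.py` further down), where $r_t(\theta)$ is the ratio between the new and old policy probabilities and $\hat{A}_t$ is the advantage estimate:

$$L^{CLIP}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big]$$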

### Number of Epochs
`num_epoch` is the number of passes through the experience buffer during gradient descent. The larger the batch size, the
larger it is acceptable to make this.
larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning.
Typical Range: `3` - `10`

In cases where there are frequent rewards within an episode, or episodes are prohibitively large, this can be a smaller number. For most stable training however, this number should be large enough to capture all the important behavior within a sequence of an agent's actions.
### Max Steps
`max_steps` corresponds to how many steps of the simulation (multiplied by frame-skip) are run during the training process. This value should be increased for more complex problems.
Typical Range: `5e5 - 1e7`
### Normalize
`normalize` corresponds to whether normalization is applied to the state inputs. This normalization is based on the running average and variance of the states.
Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems.
### Number of Layers
`num_layers` corresponds to how many hidden layers are present after the state input, or after the CNN encoding of the observation. For simple problems,
fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems.
Typical range: `1` - `3`
## Training Statistics
To view training statistics, use Tensorboard. For information on launching and using Tensorboard, see [here](./Getting-Started-with-Balance-Ball.md#observing-training-progress).

The general trend in reward should consistently increase over time. Small ups and downs are to be expected.
The general trend in reward should consistently increase over time. Small ups and downs are to be expected. Depending on the complexity of the task, a significant increase in reward may not present itself until millions of steps into the training process.
This corresponds to how random the decisions of a brain are. This should consistently decrease during training. If it decreases too soon or not at all, `beta` should be adjusted (when using discrete action space).
### Learning Rate

### Value Estimate
These values should increase with the reward. They correspond to how much future reward the agent predicts itself receiving at any given point.
### Value Loss

11
docs/best-practices.md


## General
* It is often helpful to begin with the simplest version of the problem, to ensure the agent can learn it. From there increase
complexity over time.
complexity over time. This can either be done manually, or via Curriculum Learning, where a set of lessons which progressively increase in difficulty are presented to the agent ([learn more here](../docs/curriculum.md)).
* For locomotion tasks, a small positive reward (+0.1) for forward progress is typically used.
* If you want the agent to finish a task quickly, it is often helpful to provide a small penalty every step (-0.1).
* For locomotion tasks, a small positive reward (+0.1) for forward velocity is typically used.
* If you want the agent to finish a task quickly, it is often helpful to provide a small penalty every step (-0.05) that the agent does not complete the task. In this case completion of the task should also coincide with the end of the episode.
* Overly-large negative rewards can cause undesirable behavior where an agent learns to avoid any behavior which might produce the negative reward, even if it is also behavior which can eventually lead to a positive reward.
* The magnitude of each state variable should be normalized to around 1.0 (see the sketch after this list).
* Rotation information on GameObjects should be recorded as `state.Add(transform.rotation.eulerAngles.y/180.0f-1.0f);` rather than `state.Add(transform.rotation.y);`.
* Positional information of relevant GameObjects should be encoded in relative coordinates wherever possible. This is often relative to the agent position.
* Be sure to set the action-space-size to the number of used actions, and not greater, as doing the latter can interfere with the efficiency of the training process.
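A minimal sketch of the state-design tips above, assuming the 0.2-style agent API where `CollectState()` returns a `List<float>`; the `target` field and the `10.0f` scaling divisor are hypothetical placeholders:
```csharp
using System.Collections.Generic;
using UnityEngine;

public class YourNameAgent : Agent
{
    public Transform target;   // hypothetical reference to a relevant GameObject

    public override List<float> CollectState()
    {
        List<float> state = new List<float>();

        // Rotation normalized to roughly [-1, 1], as recommended above.
        state.Add(transform.rotation.eulerAngles.y / 180.0f - 1.0f);

        // Position of the target encoded relative to the agent, scaled toward magnitude ~1.
        Vector3 relativePosition = target.position - transform.position;
        state.Add(relativePosition.x / 10.0f);
        state.Add(relativePosition.z / 10.0f);

        return state;
    }
}
```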

56
python/PPO.ipynb


"summary_freq = 10000 # Frequency at which to save training statistics.\n",
"save_freq = 50000 # Frequency at which to save model.\n",
"env_name = \"environment\" # Name of the training environment file.\n",
"curriculum_file = None\n",
"\n",
"### Algorithm-specific parameters for tuning\n",
"gamma = 0.99 # Reward discount rate.\n",

"num_epoch = 5 # Number of gradient descent steps per batch of experiences.\n",
"num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.\n",
"batch_size = 64 # How many experiences per gradient descent update step."
"batch_size = 64 # How many experiences per gradient descent update step.\n",
"normalize = False\n",
"\n",
"### Logging dictionary for hyperparameters\n",
"hyperparameter_dict = {'max_steps':max_steps, 'run_path':run_path, 'env_name':env_name,\n",
" 'curriculum_file':curriculum_file, 'gamma':gamma, 'lambd':lambd, 'time_horizon':time_horizon,\n",
" 'beta':beta, 'num_epoch':num_epoch, 'epsilon':epsilon, 'buffe_size':buffer_size,\n",
" 'leaning_rate':learning_rate, 'hidden_units':hidden_units, 'batch_size':batch_size}"
]
},
{

{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"env = UnityEnvironment(file_name=env_name)\n",
"env = UnityEnvironment(file_name=env_name, curriculum=curriculum_file)\n",
"brain_name = env.brain_names[0]"
"brain_name = env.external_brain_names[0]"
]
},
{

"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],

"if curriculum_file == \"None\":\n",
" curriculum_file = None\n",
"\n",
"\n",
"def get_progress():\n",
" if curriculum_file is not None:\n",
" if env._curriculum.measure_type == \"progress\":\n",
" return steps / max_steps\n",
" elif env._curriculum.measure_type == \"reward\":\n",
" return last_reward\n",
" else:\n",
" return None\n",
" else:\n",
" return None\n",
"\n",
" beta=beta, max_step=max_steps)\n",
" beta=beta, max_step=max_steps, \n",
" normalize=normalize, num_layers=num_layers)\n",
"\n",
"is_continuous = (env.brains[brain_name].action_space_type == \"continuous\")\n",
"use_observations = (env.brains[brain_name].number_observations > 0)\n",

" saver.restore(sess, ckpt.model_checkpoint_path)\n",
" else:\n",
" sess.run(init)\n",
" steps = sess.run(ppo_model.global_step)\n",
" steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward]) \n",
" info = env.reset(train_mode=train_model)[brain_name]\n",
" trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states)\n",
" info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]\n",
" trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)\n",
" if train_model:\n",
" trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)\n",
" info = env.reset(train_mode=train_model)[brain_name]\n",
" info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]\n",
" new_info = trainer.take_action(info, env, brain_name)\n",
" new_info = trainer.take_action(info, env, brain_name, steps, normalize)\n",
" info = new_info\n",
" trainer.process_experiences(info, time_horizon, gamma, lambd)\n",
" if len(trainer.training_buffer['actions']) > buffer_size and train_model:\n",

" # Write training statistics to tensorboard.\n",
" trainer.write_summary(summary_writer, steps)\n",
" trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)\n",
" if len(trainer.stats['cumulative_reward']) > 0:\n",
" mean_reward = np.mean(trainer.stats['cumulative_reward'])\n",
" sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})\n",
" last_reward = sess.run(ppo_model.last_reward)\n",
" # Final save Tensorflow model\n",
" if steps != 0 and train_model:\n",
" save_model(sess, model_path=model_path, steps=steps, saver=saver)\n",

89
python/ppo.py


Options:
--help Show this message.
--max-steps=<n> Maximum number of steps to run environment [default: 1e6].
--batch-size=<n> How many experiences per gradient descent update step [default: 64].
--beta=<n> Strength of entropy regularization [default: 2.5e-3].
--buffer-size=<n> How large the experience buffer should be before gradient descent [default: 2048].
--curriculum=<file> Curriculum json file for environment [default: None].
--epsilon=<n> Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
--gamma=<n> Reward discount rate [default: 0.99].
--hidden-units=<n> Number of units in hidden layer [default: 64].
--keep-checkpoints=<n> How many model checkpoints to keep [default: 5].
--lambd=<n> Lambda parameter for GAE [default: 0.95].
--learning-rate=<rate> Model learning rate [default: 3e-4].
--load Whether to load the model or randomly initialize [default: False].
--max-steps=<n> Maximum number of steps to run environment [default: 1e6].
--normalize Whether to normalize the state input using running statistics [default: False].
--num-epoch=<n> Number of gradient descent steps per batch of experiences [default: 5].
--num-layers=<n> Number of hidden layers between state/observation and outputs [default: 2].
--load Whether to load the model or randomly initialize [default: False].
--train Whether to train model, or only run inference [default: True].
--save-freq=<n> Frequency at which to save model [default: 50000].
--save-freq=<n> Frequency at which to save model [default: 50000].
--gamma=<n> Reward discount rate [default: 0.99].
--lambd=<n> Lambda parameter for GAE [default: 0.95].
--beta=<n> Strength of entropy regularization [default: 1e-3].
--num-epoch=<n> Number of gradient descent steps per batch of experiences [default: 5].
--epsilon=<n> Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
--buffer-size=<n> How large the experience buffer should be before gradient descent [default: 2048].
--learning-rate=<rate> Model learning rate [default: 3e-4].
--hidden-units=<n> Number of units in hidden layer [default: 64].
--batch-size=<n> How many experiences per gradient descent update step [default: 64].
--keep-checkpoints=<n> How many model checkpoints to keep [default: 5].
--worker-id=<n> Number to add to communication port (5005). Used for asynchronous agent scenarios [default: 0].
--train Whether to train model, or only run inference [default: False].
--worker-id=<n> Number to add to communication port (5005). Used for multi-environment [default: 0].
'''
options = docopt(_USAGE)

env_name = options['<env>']
keep_checkpoints = int(options['--keep-checkpoints'])
worker_id = int(options['--worker-id'])
curriculum_file = str(options['--curriculum'])
if curriculum_file == "None":
curriculum_file = None
# Algorithm-specific parameters for tuning
gamma = float(options['--gamma'])

num_epoch = int(options['--num-epoch'])
num_layers = int(options['--num-layers'])
normalize = options['--normalize']
env = UnityEnvironment(file_name=env_name, worker_id=worker_id)
env = UnityEnvironment(file_name=env_name, worker_id=worker_id, curriculum=curriculum_file)
brain_name = env.brain_names[0]
brain_name = env.external_brain_names[0]
tf.reset_default_graph()

beta=beta, max_step=max_steps)
beta=beta, max_step=max_steps,
normalize=normalize, num_layers=num_layers)
is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)

init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=keep_checkpoints)
def get_progress():
if curriculum_file is not None:
if env._curriculum.measure_type == "progress":
return steps / max_steps
elif env._curriculum.measure_type == "reward":
return last_reward
else:
return None
else:
return None
if ckpt == None:
print('The model {0} could not be found. Make sure you specified the right '
'--run-path'.format(model_path))
steps = sess.run(ppo_model.global_step)
steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
info = env.reset(train_mode=train_model)[brain_name]
trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states)
info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
if train_model:
trainer.write_text(summary_writer, 'Hyperparameters', options, steps)
info = env.reset(train_mode=train_model)[brain_name]
info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
trainer.reset_buffers(info, total=True)
new_info = trainer.take_action(info, env, brain_name)
new_info = trainer.take_action(info, env, brain_name, steps, normalize)
info = new_info
trainer.process_experiences(info, time_horizon, gamma, lambd)
if len(trainer.training_buffer['actions']) > buffer_size and train_model:

# Write training statistics to tensorboard.
trainer.write_summary(summary_writer, steps)
trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
steps += 1
sess.run(ppo_model.increment_step)
if train_model:
steps += 1
sess.run(ppo_model.increment_step)
if len(trainer.stats['cumulative_reward']) > 0:
mean_reward = np.mean(trainer.stats['cumulative_reward'])
sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
last_reward = sess.run(ppo_model.last_reward)
export_graph(model_path, env_name)
graph_name = (env_name.strip()
.replace('.app', '').replace('.exe', '').replace('.x86_64', '').replace('.x86', ''))
graph_name = os.path.basename(os.path.normpath(graph_name))
export_graph(model_path, graph_name)

134
python/ppo/models.py


from unityagents import UnityEnvironmentException
def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.

:return: a sub-class of PPOAgent tailored to the environment.
:param max_step: Total number of training steps.
"""
if num_layers < 1: num_layers = 1
return ContinuousControlModel(lr, brain, h_size, epsilon, max_step)
return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step)
return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)
def save_model(sess, saver, model_path="./", steps=0):

print("Saved Model")
def export_graph(model_path, env_name="env", target_nodes="action"):
def export_graph(model_path, env_name="env", target_nodes="action,value_estimate,action_probs"):
"""
Exports latest saved model to .bytes format for Unity embedding.
:param model_path: path of model checkpoints.

class PPOModel(object):
def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
def __init__(self):
self.normalize = False
def create_global_steps(self):
"""Creates TF ops to track and increment global training step."""
self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
self.increment_step = tf.assign(self.global_step, self.global_step + 1)
def create_reward_encoder(self):
"""Creates TF ops to track and increment recent average cumulative reward."""
self.last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
self.update_reward = tf.assign(self.last_reward, self.new_reward)
def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers):
"""
Builds a set of visual (CNN) encoders.
:param o_size_h: Height observation size.

name='observation_0')
streams = []
for i in range(num_streams):
self.conv1 = tf.layers.conv2d(self.observation_in, 32, kernel_size=[3, 3], strides=[2, 2],
self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
self.conv2 = tf.layers.conv2d(self.conv1, 64, kernel_size=[3, 3], strides=[2, 2],
self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
hidden = tf.layers.dense(c_layers.flatten(self.conv2), h_size, use_bias=False, activation=activation)
hidden = c_layers.flatten(self.conv2)
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
"""
Builds a set of hidden state encoders.
:param s_size: state input size.

:return: List of hidden layer tensors.
"""
self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')
if self.normalize:
self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
initializer=tf.zeros_initializer())
self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
initializer=tf.ones_initializer())
self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
self.update_mean = tf.assign(self.running_mean, self.new_mean)
self.update_variance = tf.assign(self.running_variance, self.new_variance)
else:
self.normalized_state = self.state_in
hidden_1 = tf.layers.dense(self.state_in, h_size, use_bias=False, activation=activation)
hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
streams.append(hidden_2)
hidden = self.normalized_state
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
streams.append(hidden)
def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
"""
Builds a set of hidden state encoders from discrete state input.
:param s_size: state input size (discrete).

state_in = tf.reshape(self.state_in, [-1])
state_onehot = c_layers.one_hot_encoding(state_in, s_size)
streams = []
hidden = state_onehot
hidden = tf.layers.dense(state_onehot, h_size, use_bias=False, activation=activation)
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
streams.append(hidden)
return streams

:param lr: Learning rate
:param max_step: Total number of training steps.
"""
r_theta = probs / old_probs
decay_epsilon = tf.train.polynomial_decay(epsilon, self.global_step,
max_step, 1e-2,
power=1.0)
r_theta = probs / (old_probs + 1e-10)
p_opt_b = tf.clip_by_value(r_theta, 1 - epsilon, 1 + epsilon) * self.advantage
p_opt_b = tf.clip_by_value(r_theta, 1 - decay_epsilon, 1 + decay_epsilon) * self.advantage
self.loss = self.policy_loss + self.value_loss - beta * tf.reduce_mean(entropy)
decay_beta = tf.train.polynomial_decay(beta, self.global_step,
max_step, 1e-5,
power=1.0)
self.loss = self.policy_loss + self.value_loss - decay_beta * tf.reduce_mean(entropy)
self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
self.learning_rate = tf.train.polynomial_decay(lr, self.global_step,
max_step, 1e-10,
power=1.0)

self.increment_step = tf.assign(self.global_step, self.global_step + 1)
def __init__(self, lr, brain, h_size, epsilon, max_step):
def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
super(ContinuousControlModel, self).__init__()
self.normalize = normalize
self.create_global_steps()
self.create_reward_encoder()
h_size, w_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
hidden_visual = self.create_visual_encoder(h_size, w_size, bw, h_size, 2, tf.nn.tanh)
hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh)
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh)
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
if hidden_visual is None and hidden_state is None:
raise Exception("No valid network configuration possible. "

self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.mu = tf.layers.dense(hidden_policy, a_size, activation=None, use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.1))
self.log_sigma_sq = tf.Variable(tf.zeros([a_size]))
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
self.log_sigma_sq = tf.get_variable("log_sigma_squared", [a_size], dtype=tf.float32,
initializer=tf.zeros_initializer())
self.sigma_sq = tf.exp(self.log_sigma_sq)
self.epsilon = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='epsilon')

a = tf.exp(-1 * tf.pow(tf.stop_gradient(self.output) - self.mu, 2) / (2 * self.sigma_sq))
b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
self.probs = a * b
self.probs = tf.multiply(a, b, name="action_probs")
self.value = tf.identity(self.value, name="value_estimate")
self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='old_probabilities')

class DiscreteControlModel(PPOModel):
def __init__(self, lr, brain, h_size, epsilon, beta, max_step):
def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
super(DiscreteControlModel, self).__init__()
self.create_global_steps()
self.create_reward_encoder()
self.normalize = normalize
h_size, w_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
hidden_visual = self.create_visual_encoder(h_size, w_size, bw, h_size, 1, tf.nn.elu)[0]
hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
if hidden_visual is None and hidden_state is None:
raise Exception("No valid network configuration possible. "

self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.policy = tf.layers.dense(hidden, a_size, activation=None, use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.1))
self.probs = tf.nn.softmax(self.policy)
self.action = tf.multinomial(self.policy, 1)
self.output = tf.identity(self.action, name='action')
self.value = tf.layers.dense(hidden, 1, activation=None, use_bias=False)
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
self.probs = tf.nn.softmax(self.policy, name="action_probs")
self.output = tf.multinomial(self.policy, 1)
self.output = tf.identity(self.output, name="action")
self.value = tf.layers.dense(hidden, 1, activation=None, use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=1.0))
self.value = tf.identity(self.value, name="value_estimate")
self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs + 1e-10), axis=1)

self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
self.create_ppo_optimizer(self.responsible_probs, self.old_responsible_probs,
self.value, self.entropy, beta, epsilon, lr, max_step)
self.value, self.entropy, beta, epsilon, lr, max_step)

85
python/ppo/trainer.py


class Trainer(object):
def __init__(self, ppo_model, sess, info, is_continuous, use_observations, use_states):
def __init__(self, ppo_model, sess, info, is_continuous, use_observations, use_states, training):
Responsible for collecting experinces and training PPO model.
Responsible for collecting experiences and training PPO model.
:param ppo_model: Tensorflow graph defining model.
:param sess: Tensorflow session.
:param info: Environment BrainInfo object.

stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
self.stats = stats
self.is_training = training
self.reset_buffers(info, total=True)
self.history_dict = empty_all_history(info)
def take_action(self, info, env, brain_name):
def running_average(self, data, steps, running_mean, running_variance):
"""
Computes new running mean and variances.
:param data: New piece of data.
:param steps: Total number of data so far.
:param running_mean: TF op corresponding to stored running mean.
:param running_variance: TF op corresponding to stored running variance.
:return: New mean and variance values.
"""
mean, var = self.sess.run([running_mean, running_variance])
current_x = np.mean(data, axis=0)
new_mean = mean + (current_x - mean) / (steps + 1)
new_variance = var + (current_x - new_mean) * (current_x - mean)
return new_mean, new_variance
def take_action(self, info, env, brain_name, steps, normalize):
"""
Decides actions given state/observation information, and takes them in environment.
:param info: Current BrainInfo from environment.

"""
epsi = None
feed_dict = {self.model.batch_size: len(info.states)}
run_list = [self.model.output, self.model.probs, self.model.value, self.model.entropy,
self.model.learning_rate]
if self.is_continuous:
epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
feed_dict[self.model.epsilon] = epsi

feed_dict[self.model.state_in] = info.states
actions, a_dist, value, ent, learn_rate = self.sess.run([self.model.output, self.model.probs,
self.model.value, self.model.entropy,
self.model.learning_rate],
feed_dict=feed_dict)
if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
new_mean, new_variance = self.running_average(info.states, steps, self.model.running_mean,
self.model.running_variance)
feed_dict[self.model.new_mean] = new_mean
feed_dict[self.model.new_variance] = new_variance
run_list = run_list + [self.model.update_mean, self.model.update_variance]
actions, a_dist, value, ent, learn_rate, _, _ = self.sess.run(run_list, feed_dict=feed_dict)
else:
actions, a_dist, value, ent, learn_rate = self.sess.run(run_list, feed_dict=feed_dict)
self.stats['value_estimate'].append(value)
self.stats['entropy'].append(ent)
self.stats['learning_rate'].append(learn_rate)
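
When normalization is enabled, take_action appends the update_mean and update_variance ops to the same fetch list, so a single session run both samples actions and refreshes the statistics; the extra return values are simply discarded. A small pure-Python sketch of that fetch-list pattern (no TensorFlow involved, names illustrative):

def fake_session_run(fetches, feed_dict):
    # Stand-in for sess.run(): echo one result per requested fetch.
    return [name for name in fetches]

base_fetches = ['output', 'probs', 'value', 'entropy', 'learning_rate']
normalize = True
fetches = base_fetches + (['update_mean', 'update_variance'] if normalize else [])
results = fake_session_run(fetches, feed_dict={})
if normalize:
    actions, a_dist, value, ent, learn_rate, _, _ = results
else:
    actions, a_dist, value, ent, learn_rate = results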

history['cumulative_reward'] = 0
history['episode_steps'] = 0
def reset_buffers(self, brain_info=None, total=False):
"""
Resets either all training buffers or local training buffers
:param brain_info: The BrainInfo object containing agent ids.
:param total: Whether to completely clear buffer.
"""
if not total:
for key in self.history_dict:
self.history_dict[key] = empty_local_history(self.history_dict[key])
else:
self.history_dict = empty_all_history(agent_info=brain_info)
def update_model(self, batch_size, num_epoch):
"""
Uses training_buffer to update model.

self.stats['value_loss'].append(total_v)
self.stats['policy_loss'].append(total_p)
self.training_buffer = vectorize_history(empty_local_history({}))
for key in self.history_dict:
self.history_dict[key] = empty_local_history(self.history_dict[key])
def write_summary(self, summary_writer, steps):
def write_summary(self, summary_writer, steps, lesson_number):
print("Mean Reward: {0}".format(np.mean(self.stats['cumulative_reward'])))
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
print("Step: {0}. Mean Reward: {1}. Std of Reward: {2}."
.format(steps, mean_reward, np.std(self.stats['cumulative_reward'])))
summary = tf.Summary()
for key in self.stats:
if len(self.stats[key]) > 0:

summary.value.add(tag='Info/Lesson', simple_value=lesson_number)
def write_text(self, summary_writer, key, input_dict, steps):
"""
Saves text to Tensorboard.
Note: Only works on tensorflow r1.2 or above.
:param summary_writer: writer associated with Tensorflow session.
:param key: The name of the text.
:param input_dict: A dictionary that will be displayed in a table on Tensorboard.
:param steps: Number of environment steps in training process.
"""
try:
s_op = tf.summary.text(key,
tf.convert_to_tensor(([[str(x), str(input_dict[x])] for x in input_dict]))
)
s = self.sess.run(s_op)
summary_writer.add_summary(s, steps)
except:
print("Cannot write text summary for Tensorboard. Tensorflow version must be r1.2 or above.")
pass
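
A hedged usage note: write_text is the kind of helper a training script would call once to record its run configuration as a TensorBoard table. The call below is illustrative only; the dictionary keys and values are not taken from the repository.

# Illustrative call; `trainer` and `summary_writer` come from the surrounding
# training script, and these hyperparameter values are made up for the example.
run_config = {'batch_size': 64, 'beta': 1e-3, 'epsilon': 0.2, 'learning_rate': 3e-4}
trainer.write_text(summary_writer, 'Hyperparameters', run_config, steps=0)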

2
python/setup.py


required = f.read().splitlines()
setup(name='unityagents',
version='0.1.1',
version='0.2.0',
description='Unity Machine Learning Agents',
license='Apache License 2.0',
author='Unity Technologies',

200
python/test_unityagents.py


import pytest
import socket
import mock
import struct
import json
from unityagents import UnityEnvironment, UnityEnvironmentException, UnityActionException, BrainInfo, BrainParameters
from unityagents import UnityEnvironment, UnityEnvironmentException, UnityActionException, BrainInfo, BrainParameters, Curriculum
def append_length(input):
return struct.pack("I", len(input.encode())) + input.encode()
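
append_length frames each fake message with a 4-byte unsigned length prefix in native byte order (little-endian on common x86 hardware) followed by the UTF-8 payload, mirroring how the test feeds data to the mocked socket. For example, with an illustrative payload:

import struct

payload = '{"brain_name": "RealFakeBrain"}'
framed = struct.pack("I", len(payload.encode())) + payload.encode()
# On a little-endian machine: b'\x1f\x00\x00\x00{"brain_name": "RealFakeBrain"}'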
"externalBrainNames": ["RealFakeBrain"],
"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"actionSize": 2,

dummy_reset = [
'CONFIG_REQUEST'.encode(),
append_length(
'''
{
"brain_name": "RealFakeBrain",

"actions": null,
"actions": [1,2,3,4],
}'''.encode(),
}'''),
'''
append_length('''
"actions": null,
"actions": [1,2,3,4,5,6],
}'''.encode(),
}'''),
'''
append_length('''
"actions": null,
"actions": [1,2,3,4,5,6],
}'''.encode(),
}'''),
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
with pytest.raises(UnityActionException):
env.step([0])
assert env.brain_names[0] == 'RealFakeBrain'
env.close()
with mock.patch('glob.glob') as mock_glob:
mock_glob.return_value = ['FakeLaunchPath']
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
with pytest.raises(UnityActionException):
env.step([0])
assert env.brain_names[0] == 'RealFakeBrain'
env.close()
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
env.close()
assert not env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
with mock.patch('glob.glob') as mock_glob:
mock_glob.return_value = ['FakeLaunchPath']
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
env.close()
assert not env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
mock_socket.recv.side_effect = dummy_step
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
with pytest.raises(UnityActionException):
env.step([0])
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
with pytest.raises(UnityActionException):
env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
env.close()
assert env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
assert not brain_info['RealFakeBrain'].local_done[0]
assert brain_info['RealFakeBrain'].local_done[2]
with mock.patch('glob.glob') as mock_glob:
mock_glob.return_value = ['FakeLaunchPath']
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
mock_socket.recv.side_effect = dummy_step
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
with pytest.raises(UnityActionException):
env.step([0])
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
with pytest.raises(UnityActionException):
env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
env.close()
assert env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
assert not brain_info['RealFakeBrain'].local_done[0]
assert brain_info['RealFakeBrain'].local_done[2]

with mock.patch('socket.socket') as mock_socket:
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
assert env._loaded
env.close()
assert not env._loaded
mock_socket.close.assert_called_once()
with mock.patch('glob.glob') as mock_glob:
mock_glob.return_value = ['FakeLaunchPath']
mock_socket.return_value.accept.return_value = (mock_socket, 0)
mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
assert env._loaded
env.close()
assert not env._loaded
mock_socket.close.assert_called_once()
dummy_curriculum = json.loads('''{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : true,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20, 15],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}''')
bad_curriculum = json.loads('''{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : false,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}''')
def test_curriculum():
open_name = '%s.open' % __name__
with mock.patch('json.load') as mock_load:
with mock.patch(open_name, create=True) as mock_open:
mock_open.return_value = 0
mock_load.return_value = bad_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1,"param3":1})
mock_load.return_value = dummy_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1})
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1,"param3":1})
assert curriculum.get_lesson_number() == 0
curriculum.set_lesson_number(1)
assert curriculum.get_lesson_number() == 1
curriculum.get_lesson(10)
assert curriculum.get_lesson_number() == 1
curriculum.get_lesson(30)
curriculum.get_lesson(30)
assert curriculum.get_lesson_number() == 1
assert curriculum.lesson_length == 3
assert curriculum.get_lesson(30) == {'param1': 0.3, 'param2': 20, 'param3': 0.7}
assert curriculum.lesson_length == 0
assert curriculum.get_lesson_number() == 2
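
The assertions above pin down the advancement rule: with thresholds [10, 20, 50] and min_lesson_length 3, the measure must exceed the current lesson's threshold and the lesson must have been queried more than min_lesson_length times before get_lesson switches to the next lesson's parameters and resets the counter. A minimal sketch of that rule that reproduces the same trace; it is not the Curriculum class itself and ignores signal smoothing:

def maybe_advance(measure, lesson, lesson_length, thresholds, min_lesson_length):
    lesson_length += 1
    if (lesson < len(thresholds) and measure > thresholds[lesson]
            and lesson_length > min_lesson_length):
        return lesson + 1, 0          # advance and reset the per-lesson counter
    return lesson, lesson_length

lesson, length = 1, 0
for measure in [10, 30, 30, 30]:
    lesson, length = maybe_advance(measure, lesson, length, [10, 20, 50], 3)
# lesson == 2 only after the fourth call, matching the assertions above.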

1
python/unityagents/__init__.py


from .environment import *
from .brain import *
from .exception import *
from .curriculum import *

3
python/unityagents/brain.py


class BrainInfo:
def __init__(self, observation, state, memory=None, reward=None, agents=None, local_done=None):
def __init__(self, observation, state, memory=None, reward=None, agents=None, local_done=None, action =None):
"""
Describes experience at current step of all agents linked to a brain.
"""

self.rewards = reward
self.local_done = local_done
self.agents = agents
self.previous_actions = action
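
The new action argument stores the most recent actions taken by the agents attached to the brain and is exposed as previous_actions. A hedged construction example using the signature above; all values are illustrative:

import numpy as np
from unityagents import BrainInfo

# Illustrative two-agent example; shapes and values are made up.
info = BrainInfo(observation=[], state=np.zeros((2, 3)),
                 reward=[0.0, 0.0], agents=[0, 1],
                 local_done=[False, False],
                 action=np.array([[1], [0]]))   # most recent action of each agent
print(info.previous_actions)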
class BrainParameters:

247
python/unityagents/environment.py


import os
import socket
import subprocess
import struct
from .exception import UnityEnvironmentException, UnityActionException
from .exception import UnityEnvironmentException, UnityActionException, UnityTimeOutException
from .curriculum import Curriculum
logger = logging.getLogger(__name__)
logger = logging.getLogger("unityagents")
base_port=5005):
base_port=5005, curriculum=None):
"""
Starts a new unity environment and establishes a connection with the environment.
Notice: Currently communication between Unity and Python takes place over an open socket without authentication.
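
The constructor now accepts an optional curriculum argument alongside worker_id and base_port. A hedged usage sketch; the build name and curriculum path are placeholders, and a real call needs a built Unity environment on disk:

from unityagents import UnityEnvironment

# Placeholder build name and curriculum path, shown only to illustrate the new argument.
env = UnityEnvironment(file_name="3DBall", worker_id=0,
                       curriculum="curricula/3DBall.json")
brain_info = env.reset()
env.close()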

atexit.register(self.close)
self.port = base_port + worker_id
self._buffer_size = 120000
self._buffer_size = 12000
self._python_api = "API-2"
self._loaded = False
self._open_socket = False

"or use a different worker number.".format(str(worker_id)))
cwd = os.getcwd()
try:
true_filename = os.path.basename(os.path.normpath(file_name))
launch_string = ""
if platform == "linux" or platform == "linux2":
candidates = glob.glob(os.path.join(cwd, file_name) + '.x86_64')
if len(candidates) == 0:
candidates = glob.glob(os.path.join(cwd, file_name) + '.x86')
if len(candidates) > 0:
launch_string = candidates[0]
else:
raise UnityEnvironmentException("Couldn't launch new environment. Provided filename "
"does not match any environments in {}. ".format(cwd))
elif platform == 'darwin':
launch_string = os.path.join(cwd, file_name + '.app', 'Contents', 'MacOS', true_filename)
elif platform == 'win32':
launch_string = os.path.join(cwd, file_name + '.exe')
file_name = (file_name.strip()
.replace('.app', '').replace('.exe', '').replace('.x86_64', '').replace('.x86', ''))
true_filename = os.path.basename(os.path.normpath(file_name))
launch_string = None
if platform == "linux" or platform == "linux2":
candidates = glob.glob(os.path.join(cwd, file_name) + '.x86_64')
if len(candidates) == 0:
candidates = glob.glob(os.path.join(cwd, file_name) + '.x86')
if len(candidates) == 0:
candidates = glob.glob(file_name + '.x86_64')
if len(candidates) == 0:
candidates = glob.glob(file_name + '.x86')
if len(candidates) > 0:
launch_string = candidates[0]
elif platform == 'darwin':
candidates = glob.glob(os.path.join(cwd, file_name + '.app', 'Contents', 'MacOS', true_filename))
if len(candidates) == 0:
candidates = glob.glob(os.path.join(file_name + '.app', 'Contents', 'MacOS', true_filename))
if len(candidates) > 0:
launch_string = candidates[0]
elif platform == 'win32':
candidates = glob.glob(os.path.join(cwd, file_name + '.exe'))
if len(candidates) == 0:
candidates = glob.glob(file_name + '.exe')
if len(candidates) > 0:
launch_string = candidates[0]
if launch_string is None: