
Merge remote-tracking branch 'origin/develop' into enable-flake8

Chris Elion 5 年前
共有 161 个文件被更改,包括 22523 次插入476 次删除
  1. 5
  2. 1
  3. 10
  4. 10
  5. 2
  6. 19
  7. 19
  8. 6
  9. 17
  10. 2
  11. 25
  12. 8
  13. 18
  14. 2
  15. 1
  16. 40
  17. 2
  18. 19
  19. 38
  20. 125
  21. 70
  22. 99
  23. 23
  24. 6
  25. 4
  26. 13
  27. 12
  28. 2
  29. 3
  30. 4
  31. 14
  32. 32
  33. 17
  34. 22
  35. 6
  36. 39
  37. 7
  38. 37
  39. 39
  40. 30
  41. 49
  42. 13
  43. 6
  44. 8
  45. 76
  46. 154
  47. 134
  48. 27
  49. 187
  50. 14
  51. 6
  52. 92
  53. 80
  54. 38
  55. 10
  56. 187
  57. 110
  58. 96
  59. 181
  60. 158
  61. 1001
  62. 442
  63. 1001
  64. 1001
  65. 171
  66. 198
  67. 1001
  68. 1001
  69. 1001
  70. 1001
  71. 1001
  72. 1001
  73. 1001
  74. 1001
  75. 1001
  76. 1001
  77. 0
  78. 0
  79. 115
  80. 70
  81. 664
  82. 635
  83. 1001
  84. 1001
  85. 611
  86. 69
  87. 51
  88. 955
  89. 121
  90. 139
  91. 268
  92. 1001


# pytest cache
# pytest cache
# Ignore compiled protobuf files.

# Python virtual environment
# Code coverage report


we welcome any enhancements and improvements from the community.
* [Chinese](docs/localized/zh-CN/)
* [Korean](docs/localized/KR/)
## License


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.44824862, g: 0.49827534, b: 0.57558274, a: 1}
m_IndirectSpecularColor: {r: 0.44824898, g: 0.49827564, b: 0.5755826, a: 1}
--- !u!157 &3
m_ObjectHideFlags: 0

timeScale: 1
targetFrameRate: -1
resetParameters: []
- key: mass
value: 1
- key: gravity
value: 9.81
- key: scale
value: 1
--- !u!1 &1746325439
m_ObjectHideFlags: 0


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
m_IndirectSpecularColor: {r: 0.45096254, g: 0.5008292, b: 0.5744089, a: 1}
m_IndirectSpecularColor: {r: 0.45096314, g: 0.50082976, b: 0.57440954, a: 1}
--- !u!157 &3
m_ObjectHideFlags: 0

timeScale: 1
targetFrameRate: -1
resetParameters: []
- key: mass
value: 1
- key: gravity
value: 9.81
- key: scale
value: 1
--- !u!1001 &1591880668
m_ObjectHideFlags: 0


public override void AcademyReset()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
public override void AcademyStep()


[Header("Specific to Ball3D")]
public GameObject ball;
private Rigidbody ballRb;
private ResetParameters resetParams;
var academy = Object.FindObjectOfType<Academy>() as Academy;
resetParams = academy.resetParameters;
public override void CollectObservations()

public override void AgentAction(float[] vectorAction, string textAction)
if (brain.brainParameters.vectorActionSpaceType == SpaceType.continuous)
var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);

ballRb.velocity = new Vector3(0f, 0f, 0f);
ball.transform.position = new Vector3(Random.Range(-1.5f, 1.5f), 4f, Random.Range(-1.5f, 1.5f))
+ gameObject.transform.position;
//Reset the parameters when the Agent is reset.
public void SetBall()
//Set the attributes of the ball by fetching the information from the academy
ballRb.mass = resetParams["mass"];
var scale = resetParams["scale"];
ball.transform.localScale = new Vector3(scale, scale, scale);
public void SetResetParameters()


[Header("Specific to Ball3DHard")]
public GameObject ball;
private Rigidbody ballRb;
private ResetParameters resetParams;
var academy = Object.FindObjectOfType<Academy>() as Academy;
resetParams = academy.resetParameters;
public override void CollectObservations()

public override void AgentAction(float[] vectorAction, string textAction)
if (brain.brainParameters.vectorActionSpaceType == SpaceType.continuous)
var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);

public void SetBall()
//Set the attributes of the ball by fetching the information from the academy
ballRb.mass = resetParams["mass"];
var scale = resetParams["scale"];
ball.transform.localScale = new Vector3(scale, scale, scale);
public void SetResetParameters()


timeScale: 2
targetFrameRate: 60
resetParameters: []
- key: ball_scale
value: 7.5
- key: gravity
value: 9.81
brainStriker: {fileID: 11400000, guid: 29ed78b3e8fef4340b3a1f6954b88f18, type: 2}
brainGoalie: {fileID: 11400000, guid: 090fa5a8588f5433bb7f878e6f5ac954, type: 2}
redMaterial: {fileID: 2100000, guid: 776dd8b57653342839c3fb5f46ce664e, type: 2}


public enum Team
public Rigidbody agentRb;
SoccerAcademy academy;

var playerState = new PlayerState
agentRB = agentRb,
startingPos = transform.position,
agentRB = agentRb,
startingPos = transform.position,
agentScript = this,

transform.position = area.GetRandomSpawnPos(agentRole, team);
agentRb.velocity = Vector3.zero;
agentRb.angularVelocity = Vector3.zero;
public void SetResetParameters()


public override void AcademyReset()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
public override void AcademyStep()


public class PlayerState
public int playerIndex;
public Rigidbody agentRB;
public Vector3 startingPos;
public AgentSoccer agentScript;
public int playerIndex;
public Rigidbody agentRB;
public Vector3 startingPos;
public AgentSoccer agentScript;
public float ballPosReward;

public GameObject ball;
public Rigidbody ballRB;
public GameObject ground;
public GameObject ground;
public GameObject centerPitch;
SoccerBallController ballController;
public List<PlayerState> playerStates = new List<PlayerState>();

public IEnumerator GoalScoredSwapGroundMaterial(Material mat, float time)
groundRenderer.material = mat;
yield return new WaitForSeconds(time);
yield return new WaitForSeconds(time);
groundRenderer.material = groundMaterial;

academy = FindObjectOfType<SoccerAcademy>();
groundRenderer = centerPitch.GetComponent<Renderer>();
groundRenderer = centerPitch.GetComponent<Renderer>();
groundMaterial = groundRenderer.material;
canResetBall = true;
if (goalTextUI) { goalTextUI.SetActive(false); }

xOffset = xOffset * -1f;
var randomSpawnPos = ground.transform.position +
new Vector3(xOffset, 0f, 0f)
var randomSpawnPos = ground.transform.position +
new Vector3(xOffset, 0f, 0f)
+ (Random.insideUnitSphere * 2);
randomSpawnPos.y = ground.transform.position.y + 2;
return randomSpawnPos;

var randomSpawnPos = ground.transform.position +
new Vector3(0f, 0f, 0f)
var randomSpawnPos = ground.transform.position +
new Vector3(0f, 0f, 0f)
+ (Random.insideUnitSphere * 2);
randomSpawnPos.y = ground.transform.position.y + 2;
return randomSpawnPos;

ball.transform.position = GetBallSpawnPosition();
ballRB.velocity = Vector3.zero;
ballRB.angularVelocity = Vector3.zero;
var ballScale = academy.resetParameters["ball_scale"];
ballRB.transform.localScale = new Vector3(ballScale, ballScale, ballScale);


timeScale: 1
targetFrameRate: 60
resetParameters: []
- key: gravity
value: 9.81
- key: angle
value: 55
- key: scale
value: 1
--- !u!1001 &1065879750
m_ObjectHideFlags: 0


brain: {fileID: 11400000, guid: 6bf6a586a645b471bb9bd1194ae0e229, type: 2}
agentCameras: []
agentRenderTextures: []
maxStep: 5000
resetOnDone: 1
onDemandDecision: 0

score: 0
scoreText: {fileID: 2073469450}
opponent: {fileID: 1894084401}
angle: 0
scale: 0
--- !u!65 &348265184
m_ObjectHideFlags: 0

timeScale: 1
targetFrameRate: 60
resetParameters: []
- key: angle
value: 55
- key: scale
value: 1
- key: gravity
value: 9.81
--- !u!1 &1114726487
m_ObjectHideFlags: 0

brain: {fileID: 11400000, guid: 1674996276be448c2ad51fb139e21e05, type: 2}
agentCameras: []
agentRenderTextures: []
maxStep: 5000
resetOnDone: 1
onDemandDecision: 0

score: 0
scoreText: {fileID: 1871669621}
opponent: {fileID: 348265181}
angle: 0
scale: 0
--- !u!65 &1894084404
m_ObjectHideFlags: 0


lastAgentHit = collision.gameObject.name == "AgentA" ? 0 : 1;


public override void AcademyReset()
Physics.gravity = new Vector3(0, -resetParameters["gravity"], 0);
public override void AcademyStep()


public bool invertX;
public int score;
public GameObject myArea;
public float angle;
public float scale;
private ResetParameters resetParams;
// Looks for the scoreboard based on the name of the gameObjects.
// Do not modify the names of the Score GameObjects

ballRb = ball.GetComponent<Rigidbody>();
var canvas = GameObject.Find(CanvasName);
GameObject scoreBoard;
var academy = Object.FindObjectOfType<Academy>() as Academy;
resetParams = academy.resetParameters;
if (invertX)
scoreBoard = canvas.transform.Find(ScoreBoardBName).gameObject;

scoreBoard = canvas.transform.Find(ScoreBoardAName).gameObject;
textComponent = scoreBoard.GetComponent<Text>();
public override void CollectObservations()

var moveX = Mathf.Clamp(vectorAction[0], -1f, 1f) * invertMult;
var moveY = Mathf.Clamp(vectorAction[1], -1f, 1f);
if (moveY > 0.5 && transform.position.y - transform.parent.transform.position.y < -1.5f)
agentRb.velocity = new Vector3(agentRb.velocity.x, 7f, 0f);

if (invertX && transform.position.x - transform.parent.transform.position.x < -invertMult ||
if (invertX && transform.position.x - transform.parent.transform.position.x < -invertMult ||
transform.position = new Vector3(-invertMult + transform.parent.transform.position.x,
transform.position = new Vector3(-invertMult + transform.parent.transform.position.x,
textComponent.text = score.ToString();

transform.position = new Vector3(-invertMult * Random.Range(6f, 8f), -1.5f, 0f) + transform.parent.transform.position;
agentRb.velocity = new Vector3(0f, 0f, 0f);
public void SetRacket()
angle = resetParams["angle"];
gameObject.transform.eulerAngles = new Vector3(
invertMult * angle
public void SetBall()
scale = resetParams["scale"];
ball.transform.localScale = new Vector3(scale, scale, scale);
public void SetResetParameters()


[Learning a policy](https://blogs.unity3d.com/2017/08/22/unity-ai-reinforcement-learning-with-q-learning/)
usually requires many trials and iterative policy updates. More specifically,
the robot is placed in several fire situations and over time learns an optimal
policy which allows it to put our fires more effectively. Obviously, we cannot
policy which allows it to put out fires more effectively. Obviously, we cannot
expect to train a robot repeatedly in the real world, particularly when fires
are involved. This is precisely why the use of
[Unity as a simulator](https://blogs.unity3d.com/2018/01/23/designing-safer-cities-through-simulations/)


### Install Python and mlagents Package
In order to use ML-Agents toolkit, you need Python 3.6 along with the
dependencies listed in the [setup.py file](../ml-agents/setup.py).
Some of the primary dependencies include:
- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support)
- [Jupyter](Background-Jupyter.md)
[Download](https://www.python.org/downloads/) and install Python 3.6 if you do not
already have it.
In order to use ML-Agents toolkit, you need Python 3.6.
[Download](https://www.python.org/downloads/) and install Python 3.6 if you do not already have it.
To install the dependencies and `mlagents` Python package, run from the command line:
To install the `mlagents` Python package, run from the command line:
pip3 install mlagents

If you installed this correctly, you should be able to run
`mlagents-learn --help`, after which you will see the Unity logo and the command line
parameters you can use with `mlagents-learn`.
By installing the `mlagents` package, its dependencies listed in the [setup.py file](../ml-agents/setup.py) are also installed.
Some of the primary dependencies include:
- [TensorFlow](Background-TensorFlow.md) (Requires a CPU w/ AVX support)
- [Jupyter](Background-Jupyter.md)


* Vector Action space: (Continuous) Size of 2, with one value corresponding to
X-rotation, and the other to Z-rotation.
* Visual Observations: None.
* Reset Parameters: None
* Reset Parameters: Three, corresponding to the following:
* scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
* Default: 1
* Recommended Minimum: 0.2
* Recommended Maximum: 5
* gravity: Magnitude of gravity
* Default: 9.81
* Recommended Minimum: 4
* Recommended Maximum: 105
* mass: Specifies mass of the ball
* Default: 1
* Recommended Minimum: 0.1
* Recommended Maximum: 20
* Benchmark Mean Reward: 100
## [GridWorld](https://youtu.be/gu8HE9WKEVI)

* Vector Action space: (Continuous) Size of 2, corresponding to movement
toward net or away from net, and jumping.
* Visual Observations: None.
* Reset Parameters: One, corresponding to size of ball.
* Reset Parameters: Three, corresponding to the following:
* angle: Angle of the racket from the vertical (Y) axis.
* Default: 55
* Recommended Minimum: 35
* Recommended Maximum: 65
* gravity: Magnitude of gravity
* Default: 9.81
* Recommended Minimum: 6
* Recommended Maximum: 20
* scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
* Default: 1
* Recommended Minimum: 0.2
* Recommended Maximum: 5
* Benchmark Mean Reward: 2.5
* Optional Imitation Learning scene: `TennisIL`.

as well as rotation.
* Goalie: 4 actions corresponding to forward, backward, sideways movement.
* Visual Observations: None.
* Reset Parameters: None
* Reset Parameters: Two, corresponding to the following:
* ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
* Default: 7.5
* Recommended minimum: 4
* Recommended maximum: 10
* gravity: Magnitude of the gravity
* Default: 9.81
* Recommended minimum: 6
* Recommended maximum: 20
* Benchmark Mean Reward (Striker & Goalie Brain): 0 (the means will be inverse
of each other and criss crosses during training) __Note that our trainer is currently unable to consistently train this environment__


Imitation Learning uses pairs of observations and actions
from a demonstration to learn a policy. [Video Link](https://youtu.be/kpb8ZkMBFYs).
Imitation learning can also be used to help reinforcement learning. Especially in
environments with sparse (i.e., infrequent or rare) rewards, the agent may never see
the reward and thus not learn from it. Curiosity helps the agent explore, but in some cases
it is easier to just show the agent how to achieve the reward. In these cases,
imitation learning can dramatically reduce the time it takes to solve the environment.
For instance, on the [Pyramids environment](Learning-Environment-Examples.md#pyramids),
just 6 episodes of demonstrations can reduce training steps by more than 4 times.
<p align="center">
<img src="images/mlagents-ImitationAndRL.png"
alt="Using Demonstrations with Reinforcement Learning"
width="350" border="0" />
ML-Agents provides several ways to learn from demonstrations. For most situations,
[GAIL](Training-RewardSignals.md#the-gail-reward-signal) is the preferred approach.
* To train using GAIL (Generative Adversarial Imitation Learning) you can add the
[GAIL reward signal](Training-RewardSignals.md#the-gail-reward-signal). GAIL can be
used with or without environment rewards, and works well when there are a limited
number of demonstrations.
* To help bootstrap reinforcement learning, you can enable
[pretraining](Training-PPO.md#optional-pretraining-using-demonstrations)
on the PPO trainer, in addition to using a small GAIL reward signal.
* To train an agent to exactly mimic demonstrations, you can use the
[Behavioral Cloning](Training-BehavioralCloning.md) trainer. Behavioral Cloning can be
used offline and online (in-editor), and learns very quickly. However, it usually is ineffective
on more complex environments without a large number of demonstrations.
## Recording Demonstrations
It is possible to record demonstrations of agent behavior from the Unity Editor,

alt="BC Teacher Helper"
width="375" border="10" />
## Training with Behavioral Cloning
There are a variety of possible imitation learning algorithms which can
be used, the simplest one of them is Behavioral Cloning. It works by collecting
demonstrations from a teacher, and then simply uses them to directly learn a
policy, in the same way that supervised learning for image classification
or other traditional Machine Learning tasks works.
### Offline Training
With offline behavioral cloning, we can use demonstrations (`.demo` files)
generated using the `Demonstration Recorder` as the dataset used to train a behavior.
1. Choose an agent you would like to learn to imitate some set of demonstrations.
2. Record a set of demonstrations using the `Demonstration Recorder` (see above).
For illustrative purposes we will refer to this file as `AgentRecording.demo`.
3. Build the scene, assigning the agent a Learning Brain, and set the Brain to
Control in the Broadcast Hub. For more information on Brains, see
4. Open the `config/offline_bc_config.yaml` file.
5. Modify the `demo_path` parameter in the file to reference the path to the
demonstration file recorded in step 2. In our case this is:
6. Launch `mlagents-learn`, providing `./config/offline_bc_config.yaml`
as the config parameter, and include the `--run-id` and `--train` as usual.
Provide your environment as the `--env` parameter if it has been compiled
as standalone, or omit to train in the editor.
7. (Optional) Observe training performance using TensorBoard.
This will use the demonstration file to train a neural network driven agent
to directly imitate the actions provided in the demonstration. The environment
will launch and be used for evaluating the agent's performance during training.
### Online Training
It is also possible to provide demonstrations in realtime during training,
without pre-recording a demonstration file. The steps to do this are as follows:
1. First create two Brains, one which will be the "Teacher," and the other which
will be the "Student." We will assume that the names of the Brain
Assets are "Teacher" and "Student" respectively.
2. The "Teacher" Brain must be a **Player Brain**. You must properly
configure the inputs to map to the corresponding actions.
3. The "Student" Brain must be a **Learning Brain**.
4. The Brain Parameters of both the "Teacher" and "Student" Brains must be
compatible with the agent.
5. Drag both the "Teacher" and "Student" Brain into the Academy's `Broadcast Hub`
and check the `Control` checkbox on the "Student" Brain.
6. Link the Brains to the desired Agents (one Agent as the teacher and at least
one Agent as a student).
7. In `config/online_bc_config.yaml`, add an entry for the "Student" Brain. Set
the `trainer` parameter of this entry to `online_bc`, and the
`brain_to_imitate` parameter to the name of the teacher Brain: "Teacher".
Additionally, set `batches_per_epoch`, which controls how much training to do
each moment. Increase the `max_steps` option if you'd like to keep training
the Agents for a longer period of time.
8. Launch the training process with `mlagents-learn config/online_bc_config.yaml
--train --slow`, and press the :arrow_forward: button in Unity when the
message _"Start training by pressing the Play button in the Unity Editor"_ is
displayed on the screen
9. From the Unity window, control the Agent with the Teacher Brain by providing
"teacher demonstrations" of the behavior you would like to see.
10. Watch as the Agent(s) with the student Brain attached begin to behave
similarly to the demonstrations.
11. Once the Student Agents are exhibiting the desired behavior, end the training
process with `CTRL+C` from the command line.
12. Move the resulting `*.nn` file into the `TFModels` subdirectory of the
Assets folder (or a subdirectory within Assets of your choosing), and use
it with a `Learning` Brain.
**BC Teacher Helper**
We provide a convenience utility, `BC Teacher Helper` component that you can add
to the Teacher Agent.
<p align="center">
<img src="images/bc_teacher_helper.png"
alt="BC Teacher Helper"
width="375" border="10" />
This utility enables you to use keyboard shortcuts to do the following:
1. To start and stop recording experiences. This is useful in case you'd like to
interact with the game _but not have the agents learn from these
interactions_. The default command to toggle this is to press `R` on the
keyboard.
2. Reset the training buffer. This enables you to instruct the agents to forget
their buffer of recent experiences. This is useful if you'd like to get them
to quickly learn a new behavior. The default command to reset the buffer is
to press `C` on the keyboard.


presented to an agent, see [Training with Curriculum
For information about imitation learning, which uses a different training
algorithm, see
For information about imitation learning from demonstrations, see
[Training with Imitation Learning](Training-Imitation-Learning.md).
## Best Practices when training with PPO

the agent will need to remember in order to successfully complete the task.
Typical Range: `64` - `512`
## (Optional) Pretraining Using Demonstrations
In some cases, you might want to bootstrap the agent's policy using behavior recorded
from a player. This can help guide the agent towards the reward. Pretraining adds
training operations that mimic a demonstration rather than attempting to maximize reward.
It is essentially equivalent to running [behavioral cloning](./Training-BehavioralCloning.md)
in-line with PPO.
To use pretraining, add a `pretraining` section to the trainer_config. For instance:
demo_path: ./demos/ExpertPyramid.demo
strength: 0.5
steps: 10000
Below are the available hyperparameters for pretraining.
### Strength
`strength` corresponds to the learning rate of the imitation relative to the learning
rate of PPO, and roughly corresponds to how strongly we allow the behavioral cloning
to influence the policy.
Typical Range: `0.1` - `0.5`
### Demo Path
`demo_path` is the path to your `.demo` file or directory of `.demo` files.
See the [imitation learning guide](Training-Imitation-Learning.md) for more on `.demo` files.
### Steps
During pretraining, it is often desirable to stop using demonstrations after the agent has
"seen" rewards, and allow it to optimize past the available demonstrations and/or generalize
outside of the provided demonstrations. `steps` corresponds to the training steps over which
pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set
the steps to 0 for constant imitation over the entire training run.
### (Optional) Batch Size
`batch_size` is the number of demonstration experiences used for one iteration of a gradient
descent update. If not specified, it will default to the `batch_size` defined for PPO.
Typical Range (Continuous): `512` - `5120`
Typical Range (Discrete): `32` - `512`
### (Optional) Number of Epochs
`num_epoch` is the number of passes through the experience buffer during
gradient descent. If not specified, it will default to the number of epochs set for PPO.
Typical Range: `3` - `10`
### (Optional) Samples Per Update
`samples_per_update` is the maximum number of samples
to use during each imitation update. You may want to lower this if your demonstration
dataset is very large to avoid overfitting the policy on demonstrations. Set to 0
to train over all of the demonstrations at each update step.
Default Value: `0` (all)
Typical Range: Approximately equal to PPO's `buffer_size`
## Training Statistics


observation, but also not too small to prevent it from learning to differentiate between
demonstrated and actual behavior.
Default Value: 64
Default Value: `64`
Typical Range: `64` - `256`
#### Learning Rate

Default Value: `3e-4`
### The GAIL Reward Signal
GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an
imitation learning algorithm that uses an adversarial approach, in a similar vein to GANs
(Generative Adversarial Networks). In this framework, a second neural network, the
discriminator, is taught to distinguish whether an observation/action is from a demonstration, or
produced by the agent. This discriminator can then examine a new observation/action and provide it a
reward based on how close it believes this new observation/action is to the provided demonstrations.
At each training step, the agent tries to learn how to maximize this reward. Then, the
discriminator is trained to better distinguish between demonstrations and agent state/actions.
In this way, while the agent gets better and better at mimicking the demonstrations, the
discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it.
This approach, when compared to [Behavioral Cloning](Training-BehavioralCloning.md), requires
far fewer demonstrations to be provided. After all, we are still learning a policy that happens
to be similar to the demonstration, not directly copying the behavior of the demonstrations. It
is also especially effective when combined with an Extrinsic signal, but can also be used
independently to purely learn from demonstration.
Using GAIL requires recorded demonstrations from your Unity environment. See the
[imitation learning guide](Training-Imitation-Learning.md) to learn more about recording demonstrations.
#### Strength
`strength` is the factor by which to multiply the raw reward. Note that when using GAIL
with an Extrinsic Signal, this value should be set lower if your demonstrations are
suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic
rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases.
Typical Range: `0.01` - `1.0`
#### Gamma
`gamma` corresponds to the discount factor for future rewards.
Typical Range: `0.8` - `0.9`
#### Demo Path
`demo_path` is the path to your `.demo` file or directory of `.demo` files. See the [imitation learning guide](Training-Imitation-Learning.md) for more on `.demo` files.
#### Encoding Size
`encoding_size` corresponds to the size of the hidden layer used by the discriminator.
This value should be small enough to encourage the discriminator to compress the original
observation, but also not too small to prevent it from learning to differentiate between
demonstrated and actual behavior. Dramatically increasing this size will also negatively affect
training times.
Default Value: `64`
Typical Range: `64` - `256`
#### Learning Rate
`learning_rate` is the learning rate used to update the discriminator.
This should typically be decreased if training is unstable, and the GAIL loss is unstable.
Default Value: `3e-4`
Typical Range: `1e-5` - `1e-3`
#### Use Actions
`use_actions` determines whether the discriminator should discriminate based on both
observations and actions, or just observations. Set to `True` if you want the agent to
mimic the actions from the demonstrations, and `False` if you'd rather have the agent
visit the same states as in the demonstrations but with possibly different actions.
Setting to `False` is more likely to be stable, especially with imperfect demonstrations,
but may learn slower.
Default Value: `false`
#### (Optional) Samples Per Update
`samples_per_update` is the maximum number of samples to use during each discriminator update. You may
want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data.
If set to 0, we will use the minimum of buffer size and the number of demonstration samples.
Default Value: `0`
Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md)
#### (Optional) Variational Discriminator Bottleneck
`use_vail` enables a [variational bottleneck](https://arxiv.org/abs/1810.00821) within the
GAIL discriminator. This forces the discriminator to learn a more general representation
and reduces its tendency to be "too good" at discriminating, making learning more stable.
However, it does increase training time. Enable this if you notice your imitation learning is
unstable, or unable to learn the task at hand.
Default Value: `false`


the repository:
docker run --name <container-name> \
docker run -it --name <container-name> \
-p 6006:6006 \
<image-name>:latest \
--docker-target-name=unity-volume \
<trainer-config-file> \

To train with a `3DBall` environment executable, the command would be:
docker run --name 3DBallContainer.first.trial \
docker run -it --name 3DBallContainer.first.trial \
-p 6006:6006 \
--env=3DBall \
--train \

**NOTE** If you are training using docker for environments that use visual observations, you may need to increase the default memory that Docker allocates for the container. For example, see [here](https://docs.docker.com/docker-for-mac/#advanced) for instructions for Docker for Mac.
### Running Tensorboard
You can run Tensorboard to monitor your training instance on http://localhost:6006:
docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=
With our previous 3DBall example, this command would look like this:
docker exec -it 3DBallContainer.first.trial tensorboard --logdir=/unity-volume/summaries --host=
For more details on Tensorboard, check out the documentation about [Using Tensorboard](Using-Tensorboard.md).
### Stopping Container and Saving State


def _single_step(self, info):
if self.use_visual:
visual_obs = info.visual_observations
if isinstance(visual_obs, list):
visual_obs = np.array(visual_obs)
visual_obs_list.append(self._preprocess_single(obs[0, :, :, :]))
self.visual_obs = self._preprocess_single(visual_obs[0][0, :, :, :])
self.visual_obs = self._preprocess_single(visual_obs[0][0])
default_observation = self.visual_obs


from .brain import *
from .brain import AllBrainInfo, BrainInfo, BrainParameters
from .action_info import ActionInfo, ActionInfoOutputs
from .policy import Policy
from .environment import *
from .exception import *


return np.append(m1, m2, axis=0)
def process_pixels(image_bytes, gray_scale):
def process_pixels(image_bytes: bytes, gray_scale: bool) -> np.ndarray:
Converts byte array observation image into numpy array, re-sizes it,
and optionally converts it to grey scale

s = bytearray(image_bytes)
image = Image.open(io.BytesIO(s))
image_bytearray = bytearray(image_bytes)
image = Image.open(io.BytesIO(image_bytearray))
s = np.array(image) / 255.0
if gray_scale:
s = np.mean(s, axis=2)

def from_agent_proto(agent_info_list, brain_params):
def from_agent_proto(worker_id: int, agent_info_list, brain_params):
vis_obs = []
vis_obs: List[np.ndarray] = []
for i in range(brain_params.number_visual_observations):
obs = [

vector_obs = np.nan_to_num(
np.array([x.stacked_vector_observation for x in agent_info_list])
agents = [f"${worker_id}-{x.id}" for x in agent_info_list]
brain_info = BrainInfo(

agents=[x.id for x in agent_info_list],
local_done=[x.done for x in agent_info_list],
vector_action=np.array([x.stored_vector_actions for x in agent_info_list]),
text_action=[list(x.stored_text_actions) for x in agent_info_list],


docker_training: bool = False,
no_graphics: bool = False,
timeout_wait: int = 30,
args: list = [],
Starts a new unity environment and establishes a connection with the environment.

:bool no_graphics: Whether to run the Unity simulator in no-graphics mode
:int timeout_wait: Time (in seconds) to wait for connection from environment.
:bool train_mode: Whether to run in training mode, speeding up the simulation, by default.
:list args: Additional Unity command line arguments

) # The process that is started. If None, no process was started
self.communicator = self.get_communicator(worker_id, base_port, timeout_wait)
self.worker_id = worker_id
# If the environment name is None, a new environment will not be launched
# and the communicator will directly try to connect to an existing unity environment.

"the worker-id must be 0 in order to connect with the Editor."
if file_name is not None:
self.executable_launcher(file_name, docker_training, no_graphics)
self.executable_launcher(file_name, docker_training, no_graphics, args)
"Start training by pressing the Play button in the Unity Editor."

def reset_parameters(self):
return self._resetParameters
def executable_launcher(self, file_name, docker_training, no_graphics):
def executable_launcher(self, file_name, docker_training, no_graphics, args):
cwd = os.getcwd()
file_name = (

+ args
[launch_string, "--port", str(self.port)]
[launch_string, "--port", str(self.port)] + args

for brain_name in output.agentInfos:
agent_info_list = output.agentInfos[brain_name].value
_data[brain_name] = BrainInfo.from_agent_proto(
agent_info_list, self.brains[brain_name]
self.worker_id, agent_info_list, self.brains[brain_name]
return _data, global_done




from .action_info import *
from .buffer import *
from .curriculum import *
from .meta_curriculum import *

from .policy import *
from .tf_policy import *
from .trainer_controller import *
from .bc.models import *
from .bc.offline_trainer import *


import numpy as np
from mlagents.trainers.bc.models import BehavioralCloningModel
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
class BCPolicy(Policy):
class BCPolicy(TFPolicy):
def __init__(self, seed, brain, trainer_parameters, load):
:param seed: Random seed.


from typing import List, Tuple
import tensorflow as tf
from mlagents.trainers.models import LearningModel

self.encoding_size = encoding_size
self.policy_model = policy_model
self.next_visual_in: List[tf.Tensor] = []
def create_curiosity_encoders(self):
def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
Creates state encoders for current and future observations.
Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction

encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
return encoded_state, encoded_next_state
def create_inverse_model(self, encoded_state, encoded_next_state):
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.

tf.dynamic_partition(cross_entropy, self.policy_model.mask, 2)[1]
def create_forward_model(self, encoded_state, encoded_next_state):
def create_forward_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
Creates forward model TensorFlow ops for Curiosity module.
Predicts encoded future state based on encoded current state and given action.

tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
def create_loss(self, learning_rate):
def create_loss(self, learning_rate: float) -> None:
Creates the loss node of the model as well as the update_batch optimizer to update the model.
:param learning_rate: The learning rate for the optimizer.


from typing import Any, Dict, List
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
policy: Policy,
policy: TFPolicy,
strength: float,
gamma: float,
encoding_size: int = 128,

Creates the Curiosity reward generator
:param policy: The Learning Policy
:param encoding_size: The size of the Curiosity encoding
:param signal_strength: The scaling parameter for the reward. The scaled reward will be the unscaled
:param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
:param gamma: The time discounting factor used for this reward.
:param encoding_size: The size of the hidden encoding layer for the ICM
:param learning_rate: The learning rate for the ICM.
:param num_epoch: The number of epochs to train over the training buffer for the ICM.
super().__init__(policy, strength, gamma)
self.model = CuriosityModel(

self.has_updated = False
def evaluate(self, current_info, next_info):
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.

return RewardSignalResult(scaled_reward, unscaled_reward)
def check_config(cls, config_dict):
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
Checks the config and throw an exception if a hyperparameter is missing. Curiosity requires strength,
gamma, and encoding size at minimum.

def update(self, update_buffer, num_sequences):
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs
gradient descent.

forward_total, inverse_total = [], []
forward_total: List[float] = []
inverse_total: List[float] = []
for _ in range(self.num_epoch):
buffer = update_buffer

return update_stats
def _update_batch(self, mini_batch, num_sequences):
def _update_batch(
self, mini_batch: Dict[str, np.ndarray], num_sequences: int
) -> Dict[str, float]:
Updates model using buffer.
:param num_sequences: Number of trajectories in batch.


from typing import Any, Dict, List
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
def __init__(self, policy: Policy, strength: float, gamma: float):
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
The extrinsic reward generator. Returns the reward received by the environment
:param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.

super().__init__(policy, strength, gamma)
def check_config(cls, config_dict):
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
Checks the config and throw an exception if a hyperparameter is missing. Extrinsic requires strength and gamma
at minimum.

def evaluate(self, current_info, next_info):
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.

scaled_reward = self.strength * unscaled_reward
return RewardSignalResult(scaled_reward, unscaled_reward)
def update(self, update_buffer, num_sequences):
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
This method does nothing, as there is nothing to update.


import logging
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.policy import Policy
from typing import Any, Dict, List
from collections import namedtuple
import numpy as np
import abc

from mlagents.envs.brain import BrainInfo
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.buffer import Buffer
logger = logging.getLogger("mlagents.trainers")
RewardSignalResult = namedtuple(

class RewardSignal(abc.ABC):
def __init__(self, policy: Policy, strength: float, gamma: float):
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor.)

self.policy = policy
self.strength = strength
def evaluate(self, current_info, next_info):
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.

return (
return RewardSignalResult(
def update(self, update_buffer, n_sequences):
def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
If the reward signal has an internal model (e.g. GAIL or Curiosity), update that model.
:param update_buffer: An AgentBuffer that contains the live data from which to update.

return {}
def check_config(cls, config_dict, param_keys=None):
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
Check the config dict, and throw an error if there are missing hyperparameters.


from mlagents.trainers.components.reward_signals.extrinsic.signal import (
from mlagents.trainers.components.reward_signals.gail.signal import GAILRewardSignal
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
logger = logging.getLogger("mlagents.trainers")

"curiosity": CuriosityRewardSignal,
"gail": GAILRewardSignal,
policy: Policy, name: str, config_entry: Dict[str, Any]
policy: TFPolicy, name: str, config_entry: Dict[str, Any]
) -> RewardSignal:
Creates a reward signal class based on the name and config entry provided as a dict.


import pathlib
import logging
import os
from typing import List, Tuple
from mlagents.envs.communicator_objects import *
from mlagents.envs.communicator_objects import (