Merge branch 'master' of github.com:Unity-Technologies/ml-agents into develop-sac-apex

5 年前 · 0bc4cc1e
--- a/Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBall.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/3DBall/Demos/Expert3DBallHard.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Basic/Demos/ExpertBasic.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Bouncer/Demos/ExpertBouncer.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerDyn.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Reacher/Demos/ExpertReacher.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Tennis/Demos/ExpertTennis.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo.meta
  fileIDToRecycleName:
    11400000: Assets/ML-Agents/Examples/Walker/Demos/ExpertWalker.demo
  externalObjects: {}
-  userData: ' (MLAgents.Demonstrations.Demonstration)'
+  userData: ' (MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
  assetBundleVariant: 
  script: {fileID: 11500000, guid: 7bd65ce151aaa4a41a45312543c56be1, type: 3}
--- a/Project/ProjectSettings/ProjectVersion.txt
+++ b/Project/ProjectSettings/ProjectVersion.txt
-m_EditorVersion: 2018.4.18f1
+m_EditorVersion: 2018.4.17f1
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
 # Changelog
+
-and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-
+and this project adheres to
+[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+
- - The `--load` and `--train` command-line flags have been deprecated. Training now happens by default, and
- use `--resume` to resume training instead. (#3705)
- - The Jupyter notebooks have been removed from the repository.
- - Introduced the `SideChannelUtils` to register, unregister and access side channels.
- - `Academy.FloatProperties` was removed, please use `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
- - Removed the multi-agent gym option from the gym wrapper. For multi-agent scenarios, use the [Low Level Python API](Python-API.md).
- - The low level Python API has changed. You can look at the document [Low Level Python API documentation](Python-API.md) for more information. If you use `mlagents-learn` for training, this should be a transparent change.
- - Added ability to start training (initialize model weights) from a previous run ID. (#3710)
- - The internal event `Academy.AgentSetStatus` was renamed to `Academy.AgentPreStep` and made public.
- - The offset logic was removed from DecisionRequester.
- - The signature of `Agent.Heuristic()` was changed to take a `float[]` as a parameter, instead of returning the array. This was done to prevent a common source of error where users would return arrays of the wrong size.
- - The communication API version has been bumped up to 1.0.0 and will use [Semantic Versioning](https://semver.org/) to do compatibility checks for communication between Unity and the Python process.
- - The obsolete `Agent` methods `GiveModel`, `Done`, `InitializeAgent`, `AgentAction` and `AgentReset` have been removed.
+
+- The `--load` and `--train` command-line flags have been deprecated. Training
+  now happens by default; use `--resume` to resume training instead. (#3705)
+- The Jupyter notebooks have been removed from the repository.
+- Introduced the `SideChannelUtils` to register, unregister and access side
+  channels.
+- `Academy.FloatProperties` was removed, please use
+  `SideChannelUtils.GetSideChannel<FloatPropertiesChannel>()` instead.
+- Removed the multi-agent gym option from the gym wrapper. For multi-agent
+  scenarios, use the [Low Level Python API](../docs/Python-API.md).
+- The low level Python API has changed. You can look at the document
+  [Low Level Python API documentation](../docs/Python-API.md) for more
+  information. If you use `mlagents-learn` for training, this should be a
+  transparent change.
+- Added ability to start training (initialize model weights) from a previous run
+  ID. (#3710)
+- The internal event `Academy.AgentSetStatus` was renamed to
+  `Academy.AgentPreStep` and made public.
+- The offset logic was removed from DecisionRequester.
+- The signature of `Agent.Heuristic()` was changed to take a `float[]` as a
+  parameter, instead of returning the array. This was done to prevent a common
+  source of error where users would return arrays of the wrong size.
+- The communication API version has been bumped up to 1.0.0 and will use
+  [Semantic Versioning](https://semver.org/) to do compatibility checks for
+  communication between Unity and the Python process.
+- The obsolete `Agent` methods `GiveModel`, `Done`, `InitializeAgent`,
+  `AgentAction` and `AgentReset` have been removed.
- - Format of console output has changed slightly and now matches the name of the model/summary directory. (#3630, #3616)
- - Added a feature to allow sending stats from C# environments to TensorBoard (and other python StatsWriters). To do this from your code, use `SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)` (#3660)
- - Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
- - Timer files now contain a dictionary of metadata, including things like the package version numbers.
- - SideChannel IncomingMessages methods now take an optional default argument, which is used when trying to read more data than the message contains.
- - The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
- - Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
- - Running `mlagents-learn` with the same `--run-id` twice will no longer overwrite the existing files. (#3705)
- - `StackingSensor` was changed from `internal` visibility to `public`
- - Updated Barracuda to 0.6.3-preview.
- - Model updates now happen asynchronously with environment steps. (#3690)
+
+- Format of console output has changed slightly and now matches the name of the
+  model/summary directory. (#3630, #3616)
+- Added a feature to allow sending stats from C# environments to TensorBoard
+  (and other python StatsWriters). To do this from your code, use
+  `SideChannelUtils.GetSideChannel<StatsSideChannel>().AddStat(key, value)`
+  (#3660)
+- Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
+- Timer files now contain a dictionary of metadata, including things like the
+  package version numbers.
+- SideChannel IncomingMessages methods now take an optional default argument,
+  which is used when trying to read more data than the message contains.
+- The way that UnityEnvironment decides the port was changed. If no port is
+  specified, the behavior will depend on the `file_name` parameter. If it is
+  `None`, 5004 (the editor port) will be used; otherwise 5005 (the base
+  environment port) will be used.
+- Fixed an issue where exceptions from environments provided a returncode of 0.
+  (#3680)
+- Running `mlagents-learn` with the same `--run-id` twice will no longer
+  overwrite the existing files. (#3705)
+- `StackingSensor` was changed from `internal` visibility to `public`.
+- Updated Barracuda to 0.6.3-preview.
+- Model updates can now happen asynchronously with environment steps for better
+  performance. (#3690)
+### Bug Fixes
+
+- Fixed a display bug when viewing Demonstration files in the inspector. The
+  shapes of the observations in the file now display correctly. (#3771)
+
+
- - Raise the wall in CrawlerStatic scene to prevent Agent from falling off. (#3650)
- - Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
- - Fixed the reported entropy values for continuous actions (#3684)
- - Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
- - Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
- - Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
- - Fixed an issue where logging output was not visible; logging levels are now set consistently. (#3703)
+- Raise the wall in CrawlerStatic scene to prevent Agent from falling off.
+  (#3650)
+- Fixed an issue where specifying `vis_encode_type` was required only for SAC.
+  (#3677)
+- Fixed the reported entropy values for continuous actions (#3684)
+- Fixed an issue where switching models using `SetModel()` during training would
+  use an excessive amount of memory. (#3664)
+- Environment subprocesses now close immediately on timeout or wrong API
+  version. (#3679)
+- Fixed an issue in the gym wrapper that would raise an exception if an Agent
+  called EndEpisode multiple times in the same step. (#3700)
+- Fixed an issue where logging output was not visible; logging levels are now
+  set consistently. (#3703)
+
- - `Agent.CollectObservations` now takes a VectorSensor argument. (#3352, #3389)
- - Added `Agent.CollectDiscreteActionMasks` virtual method with a `DiscreteActionMasker` argument to specify which discrete actions are unavailable to the Agent. (#3525)
- - Beta support for ONNX export was added. If the `tf2onnx` python package is installed, models will be saved to `.onnx` as well as `.nn` format.
- Note that Barracuda 0.6.0 or later is required to import the `.onnx` files properly
- - Multi-GPU training and the `--multi-gpu` option has been removed temporarily. (#3345)
- - All Sensor related code has been moved to the namespace `MLAgents.Sensors`.
- - All SideChannel related code has been moved to the namespace `MLAgents.SideChannels`.
- - `BrainParameters` and `SpaceType` have been removed from the public API
- - `BehaviorParameters` have been removed from the public API.
- - The following methods in the `Agent` class have been deprecated and will be removed in a later release:
-   - `InitializeAgent()` was renamed to `Initialize()`
-   - `AgentAction()` was renamed to `OnActionReceived()`
-   - `AgentReset()` was renamed to `OnEpisodeBegin()`
-   - `Done()` was renamed to `EndEpisode()`
-   - `GiveModel()` was renamed to `SetModel()`
+
+- `Agent.CollectObservations` now takes a VectorSensor argument. (#3352, #3389)
+- Added `Agent.CollectDiscreteActionMasks` virtual method with a
+  `DiscreteActionMasker` argument to specify which discrete actions are
+  unavailable to the Agent. (#3525)
+- Beta support for ONNX export was added. If the `tf2onnx` python package is
+  installed, models will be saved to `.onnx` as well as `.nn` format. Note that
+  Barracuda 0.6.0 or later is required to import the `.onnx` files properly
+- Multi-GPU training and the `--multi-gpu` option have been removed temporarily.
+  (#3345)
+- All Sensor related code has been moved to the namespace `MLAgents.Sensors`.
+- All SideChannel related code has been moved to the namespace
+  `MLAgents.SideChannels`.
+- `BrainParameters` and `SpaceType` have been removed from the public API.
+- `BehaviorParameters` has been removed from the public API.
+- The following methods in the `Agent` class have been deprecated and will be
+  removed in a later release:
+  - `InitializeAgent()` was renamed to `Initialize()`
+  - `AgentAction()` was renamed to `OnActionReceived()`
+  - `AgentReset()` was renamed to `OnEpisodeBegin()`
+  - `Done()` was renamed to `EndEpisode()`
+  - `GiveModel()` was renamed to `SetModel()`
- - Monitor.cs was moved to Examples. (#3372)
- - Automatic stepping for Academy is now controlled from the AutomaticSteppingEnabled property. (#3376)
- - The GetEpisodeCount, GetStepCount, GetTotalStepCount and methods of Academy were changed to EpisodeCount, StepCount, TotalStepCount properties respectively. (#3376)
- - Several classes were changed from public to internal visibility. (#3390)
- - Academy.RegisterSideChannel and UnregisterSideChannel methods were added. (#3391)
- - A tutorial on adding custom SideChannels was added (#3391)
- - The stepping logic for the Agent and the Academy has been simplified (#3448)
- - Update Barracuda to 0.6.1-preview
- * The interface for `RayPerceptionSensor.PerceiveStatic()` was changed to take an input class and write to an output class, and the method was renamed to `Perceive()`.
- - The checkpoint file suffix was changed from `.cptk` to `.ckpt` (#3470)
- - The command-line argument used to determine the port that an environment will listen on was changed from `--port` to `--mlagents-port`.
- - `DemonstrationRecorder` can now record observations outside of the editor.
- - `DemonstrationRecorder` now has an optional path for the demonstrations. This will default to `Application.dataPath` if not set.
- - `DemonstrationStore` was changed to accept a `Stream` for its constructor, and was renamed to `DemonstrationWriter`
- - The method `GetStepCount()` on the Agent class has been replaced with the property getter `StepCount`
- - `RayPerceptionSensorComponent` and related classes now display the debug gizmos whenever the Agent is selected (not just Play mode).
- - Most fields on `RayPerceptionSensorComponent` can now be changed while the editor is in Play mode. The exceptions to this are fields that affect the number of observations.
- - Most fields on `CameraSensorComponent` and `RenderTextureSensorComponent` were changed to private and replaced by properties with the same name.
- - Unused static methods from the `Utilities` class (ShiftLeft, ReplaceRange, AddRangeNoAlloc, and GetSensorFloatObservationSize) were removed.
- - The `Agent` class is no longer abstract.
- - SensorBase was moved out of the package and into the Examples directory.
- - `AgentInfo.actionMasks` has been renamed to `AgentInfo.discreteActionMasks`.
- - `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- - The `IFloatProperties` interface has been removed.
- - Fix #3579.
- - Improved inference performance for models with multiple action branches. (#3598)
- - Fixed an issue when using GAIL with less than `batch_size` number of demonstrations. (#3591)
- - The interfaces to the `SideChannel` classes (on C# and python) have changed to use new  `IncomingMessage` and `OutgoingMessage` classes. These should make reading and writing data to the channel easier. (#3596)
- - Updated the ExpertPyramid.demo example demonstration file (#3613)
- - Updated project version for example environments to 2018.4.18f1. (#3618)
- - Changed the Product Name in the example environments to remove spaces, so that the default build executable file doesn't contain spaces. (#3612)
+
+- Monitor.cs was moved to Examples. (#3372)
+- Automatic stepping for Academy is now controlled from the
+  AutomaticSteppingEnabled property. (#3376)
+- The GetEpisodeCount, GetStepCount and GetTotalStepCount methods of Academy
+  were changed to the EpisodeCount, StepCount and TotalStepCount properties
+  respectively. (#3376)
+- Several classes were changed from public to internal visibility. (#3390)
+- Academy.RegisterSideChannel and UnregisterSideChannel methods were added.
+  (#3391)
+- A tutorial on adding custom SideChannels was added (#3391)
+- The stepping logic for the Agent and the Academy has been simplified (#3448)
+- Update Barracuda to 0.6.1-preview
+
+- The interface for `RayPerceptionSensor.PerceiveStatic()` was changed to take
+  an input class and write to an output class, and the method was renamed to
+  `Perceive()`.
+
+- The checkpoint file suffix was changed from `.cptk` to `.ckpt` (#3470)
+- The command-line argument used to determine the port that an environment will
+  listen on was changed from `--port` to `--mlagents-port`.
+- `DemonstrationRecorder` can now record observations outside of the editor.
+- `DemonstrationRecorder` now has an optional path for the demonstrations. This
+  will default to `Application.dataPath` if not set.
+- `DemonstrationStore` was changed to accept a `Stream` for its constructor, and
+  was renamed to `DemonstrationWriter`
+- The method `GetStepCount()` on the Agent class has been replaced with the
+  property getter `StepCount`
+- `RayPerceptionSensorComponent` and related classes now display the debug
+  gizmos whenever the Agent is selected (not just Play mode).
+- Most fields on `RayPerceptionSensorComponent` can now be changed while the
+  editor is in Play mode. The exceptions to this are fields that affect the
+  number of observations.
+- Most fields on `CameraSensorComponent` and `RenderTextureSensorComponent` were
+  changed to private and replaced by properties with the same name.
+- Unused static methods from the `Utilities` class (ShiftLeft, ReplaceRange,
+  AddRangeNoAlloc, and GetSensorFloatObservationSize) were removed.
+- The `Agent` class is no longer abstract.
+- SensorBase was moved out of the package and into the Examples directory.
+- `AgentInfo.actionMasks` has been renamed to `AgentInfo.discreteActionMasks`.
+- `DecisionRequester` has been made internal (you can still use the
+  DecisionRequesterComponent from the inspector). `RepeatAction` was renamed
+  `TakeActionsBetweenDecisions` for clarity. (#3555)
+- The `IFloatProperties` interface has been removed.
+- Fix #3579.
+- Improved inference performance for models with multiple action branches.
+  (#3598)
+- Fixed an issue when using GAIL with fewer than `batch_size` demonstrations.
+  (#3591)
+- The interfaces to the `SideChannel` classes (on C# and python) have changed to
+  use new `IncomingMessage` and `OutgoingMessage` classes. These should make
+  reading and writing data to the channel easier. (#3596)
+- Updated the ExpertPyramid.demo example demonstration file (#3613)
+- Updated project version for example environments to 2018.4.18f1. (#3618)
+- Changed the Product Name in the example environments to remove spaces, so that
+  the default build executable file doesn't contain spaces. (#3612)
- Fixed an issue which caused self-play training sessions to consume a lot of memory. (#3451)
- Fixed an IndexError when using GAIL or behavioral cloning with demonstrations recorded with 0.14.0 or later (#3464)
+
+- Fixed an issue which caused self-play training sessions to consume a lot of
+  memory. (#3451)
+- Fixed an IndexError when using GAIL or behavioral cloning with demonstrations
+  recorded with 0.14.0 or later (#3464)
- Fixed a bug with the rewards of multiple Agents in the gym interface (#3471, #3496)
-
+- Fixed a bug with the rewards of multiple Agents in the gym interface (#3471,
+  #3496)
- A new self-play mechanism for training agents in adversarial scenarios was added (#3194)
- Tennis and Soccer environments were refactored to enable training with self-play (#3194, #3331)
- UnitySDK folder was split into a Unity Package (com.unity.ml-agents) and our examples were moved to the Project folder (#3267)
+
+- A new self-play mechanism for training agents in adversarial scenarios was
+  added (#3194)
+- Tennis and Soccer environments were refactored to enable training with
+  self-play (#3194, #3331)
+- UnitySDK folder was split into a Unity Package (com.unity.ml-agents) and our
+  examples were moved to the Project folder (#3267)
- In order to reduce the size of the API, several classes and methods were marked as internal or private. Some public fields on the Agent were trimmed (#3342, #3353, #3269)
- Decision Period and on-demand decision checkboxes were removed from the Agent. on-demand decision is now the default (#3243)
- Calling Done() on the Agent will reset it immediately and call the AgentReset virtual method (#3291, #3242)
- The "Reset on Done" setting in AgentParameters was removed; this is now always true. AgentOnDone virtual method on the Agent was removed (#3311, #3222)
- Trainer steps are now counted per-Agent, not per-environment as in previous versions. For instance, if you have 10 Agents in the scene, 20 environment steps now correspond to 200 steps as printed in the terminal and in Tensorboard (#3113)
+- In order to reduce the size of the API, several classes and methods were
+  marked as internal or private. Some public fields on the Agent were trimmed
+  (#3342, #3353, #3269)
+- Decision Period and on-demand decision checkboxes were removed from the Agent.
+  On-demand decision is now the default (#3243)
+- Calling Done() on the Agent will reset it immediately and call the AgentReset
+  virtual method (#3291, #3242)
+- The "Reset on Done" setting in AgentParameters was removed; this is now always
+  true. AgentOnDone virtual method on the Agent was removed (#3311, #3222)
+- Trainer steps are now counted per-Agent, not per-environment as in previous
+  versions. For instance, if you have 10 Agents in the scene, 20 environment
+  steps now correspond to 200 steps as printed in the terminal and in
+  Tensorboard (#3113)
+
- Curriculum config files are now YAML formatted and all curricula for a training run are combined into a single file (#3186)
- ML-Agents components, such as BehaviorParameters and various Sensor implementations, now appear in the Components menu (#3231)
- Exceptions are now raised in Unity (in debug mode only) if NaN observations or rewards are passed (#3221)
- RayPerception MonoBehavior, which was previously deprecated, was removed (#3304)
- Uncompressed visual (i.e. 3d float arrays) observations are now supported. CameraSensorComponent and RenderTextureSensor now have an option to write uncompressed observations (#3148)
- Agent’s handling of observations during training was improved so that an extra copy of the observations is no longer maintained (#3229)
- Error message for missing trainer config files was improved to include the absolute path (#3230)
+- Curriculum config files are now YAML formatted and all curricula for a
+  training run are combined into a single file (#3186)
+- ML-Agents components, such as BehaviorParameters and various Sensor
+  implementations, now appear in the Components menu (#3231)
+- Exceptions are now raised in Unity (in debug mode only) if NaN observations or
+  rewards are passed (#3221)
+- RayPerception MonoBehavior, which was previously deprecated, was removed
+  (#3304)
+- Uncompressed visual (i.e. 3d float arrays) observations are now supported.
+  CameraSensorComponent and RenderTextureSensor now have an option to write
+  uncompressed observations (#3148)
+- Agent’s handling of observations during training was improved so that an extra
+  copy of the observations is no longer maintained (#3229)
+- Error message for missing trainer config files was improved to include the
+  absolute path (#3230)
+
- A bug that caused RayPerceptionSensor to behave inconsistently with transforms that have non-1 scale was fixed (#3321)
- Some small bugfixes to tensorflow_to_barracuda.py were backported from the barracuda release (#3341)
- Base port in the jupyter notebook example was updated to use the same port that the editor uses (#3283)
-
+- A bug that caused RayPerceptionSensor to behave inconsistently with transforms
+  that have non-1 scale was fixed (#3321)
+- Some small bugfixes to tensorflow_to_barracuda.py were backported from the
+  barracuda release (#3341)
+- Base port in the jupyter notebook example was updated to use the same port
+  that the editor uses (#3283)
-### This is the first release of *Unity Package ML-Agents*.
+### This is the first release of _Unity Package ML-Agents_.
-*Short description of this release*
+_Short description of this release_
--- a/com.unity.ml-agents/Editor/DemonstrationDrawer.cs
+++ b/com.unity.ml-agents/Editor/DemonstrationDrawer.cs
+using System.Collections.Generic;
 using System.Text;
 using UnityEditor;
 using MLAgents.Demonstrations;
 namespace MLAgents.Editor
 {
    /// <summary>
-    /// Renders a custom UI for Demonstration Scriptable Object.
+    /// Renders a custom UI for DemonstrationSummary ScriptableObject.
-    [CustomEditor(typeof(Demonstration))]
+    [CustomEditor(typeof(DemonstrationSummary))]
+        SerializedProperty m_ObservationShapes;
+            m_ObservationShapes = serializedObject.FindProperty("observationSummaries");
        }

        /// <summary>
        {
            var nameProp = property.FindPropertyRelative("demonstrationName");
-            var expProp = property.FindPropertyRelative("numberExperiences");
-            var epiProp = property.FindPropertyRelative("numberEpisodes");
-            var rewProp = property.FindPropertyRelative("meanReward");
+            var experiencesProp = property.FindPropertyRelative("numberSteps");
+            var episodesProp = property.FindPropertyRelative("numberEpisodes");
+            var rewardsProp = property.FindPropertyRelative("meanReward");
-            var expLabel = expProp.displayName + ": " + expProp.intValue;
-            var epiLabel = epiProp.displayName + ": " + epiProp.intValue;
-            var rewLabel = rewProp.displayName + ": " + rewProp.floatValue;
+            var experiencesLabel = experiencesProp.displayName + ": " + experiencesProp.intValue;
+            var episodesLabel = episodesProp.displayName + ": " + episodesProp.intValue;
+            var rewardsLabel = rewardsProp.displayName + ": " + rewardsProp.floatValue;
-            EditorGUILayout.LabelField(expLabel);
-            EditorGUILayout.LabelField(epiLabel);
-            EditorGUILayout.LabelField(rewLabel);
+            EditorGUILayout.LabelField(experiencesLabel);
+            EditorGUILayout.LabelField(episodesLabel);
+            EditorGUILayout.LabelField(rewardsLabel);
-        /// Constructs label for action size array.
+        /// Constructs label for a serialized integer array.
-        static string BuildActionArrayLabel(SerializedProperty actionSizeProperty)
+        static string BuildIntArrayLabel(SerializedProperty actionSizeProperty)
        {
            var actionSize = actionSizeProperty.arraySize;
            var actionLabel = new StringBuilder("[ ");
        }

        /// <summary>
-        /// Renders Inspector UI for Brain Parameters of Demonstration.
+        /// Renders Inspector UI for BrainParameters of a DemonstrationSummary.
+        /// Only the Action size and type are used from the BrainParameters.
-        void MakeBrainParametersProperty(SerializedProperty property)
+        void MakeActionsProperty(SerializedProperty property)
-            var vecObsSizeProp = property.FindPropertyRelative("vectorObservationSize");
-            var numStackedProp = property.FindPropertyRelative("numStackedVectorObservations");
-            var vecObsSizeLabel = vecObsSizeProp.displayName + ": " + vecObsSizeProp.intValue;
-            var numStackedLabel = numStackedProp.displayName + ": " + numStackedProp.intValue;
-                actSizeProperty.displayName + ": " + BuildActionArrayLabel(actSizeProperty);
+                actSizeProperty.displayName + ": " + BuildIntArrayLabel(actSizeProperty);
-            EditorGUILayout.LabelField(vecObsSizeLabel);
-            EditorGUILayout.LabelField(numStackedLabel);
+        /// <summary>
+        /// Draws a single Inspector label listing the shape of every observation
+        /// stored in a DemonstrationSummary.
+        /// </summary>
+        /// <param name="obsSummariesProperty">
+        /// Serialized array of observation summaries; each element is read through its "shape" property.
+        /// </param>
+        void MakeObservationsProperty(SerializedProperty obsSummariesProperty)
+        {
+            var observationCount = obsSummariesProperty.arraySize;
+            var perObservationLabels = new List<string>();
+            for (var index = 0; index < observationCount; index++)
+            {
+                var shape = obsSummariesProperty.GetArrayElementAtIndex(index).FindPropertyRelative("shape");
+                perObservationLabels.Add(BuildIntArrayLabel(shape));
+            }
+
+            // Individual shape labels are separated by a comma followed by two spaces.
+            var joinedShapes = string.Join(",  ", perObservationLabels);
+            EditorGUILayout.LabelField("Shapes: " + joinedShapes);
+        }
+
+
+            EditorGUI.indentLevel++;
-            EditorGUILayout.LabelField("Brain Parameters", EditorStyles.boldLabel);
-            MakeBrainParametersProperty(m_BrainParameters);
+            EditorGUI.indentLevel--;
+
+            EditorGUILayout.LabelField("Observations", EditorStyles.boldLabel);
+            EditorGUI.indentLevel++;
+            MakeObservationsProperty(m_ObservationShapes);
+            EditorGUI.indentLevel--;
+
+            EditorGUILayout.LabelField("Actions", EditorStyles.boldLabel);
+            EditorGUI.indentLevel++;
+            MakeActionsProperty(m_BrainParameters);
+            EditorGUI.indentLevel--;
+
            serializedObject.ApplyModifiedProperties();
        }
    }
--- a/com.unity.ml-agents/Editor/DemonstrationImporter.cs
+++ b/com.unity.ml-agents/Editor/DemonstrationImporter.cs
 using System;
+using System.Collections.Generic;
 using System.IO;
 using MLAgents.CommunicatorObjects;
 using UnityEditor;

            try
            {
-                // Read first two proto objects containing metadata and brain parameters.
+                // Read first three proto objects containing metadata, brain parameters, and observations.
                Stream reader = File.OpenRead(ctx.assetPath);

                var metaDataProto = DemonstrationMetaProto.Parser.ParseDelimitedFrom(reader);
                var brainParamsProto = BrainParametersProto.Parser.ParseDelimitedFrom(reader);
                var brainParameters = brainParamsProto.ToBrainParameters();

+                // Read the first AgentInfoActionPair so that we can get the observation sizes.
+                List<ObservationSummary> observationSummaries;
+                try
+                {
+                    var agentInfoActionPairProto = AgentInfoActionPairProto.Parser.ParseDelimitedFrom(reader);
+                    observationSummaries = agentInfoActionPairProto.GetObservationSummaries();
+                }
+                catch
+                {
+                    // Just in case there weren't any AgentInfoActionPair or they couldn't be read.
+                    observationSummaries = new List<ObservationSummary>();
+                }
+
-                var demonstration = ScriptableObject.CreateInstance<Demonstration>();
-                demonstration.Initialize(brainParameters, metaData);
-                userData = demonstration.ToString();
+                var demonstrationSummary = ScriptableObject.CreateInstance<DemonstrationSummary>();
+                demonstrationSummary.Initialize(brainParameters, metaData, observationSummaries);
+                userData = demonstrationSummary.ToString();
-                ctx.AddObjectToAsset(ctx.assetPath, demonstration, texture);
-                ctx.SetMainObject(demonstration);
+                ctx.AddObjectToAsset(ctx.assetPath, demonstrationSummary, texture);
+                ctx.SetMainObject(demonstrationSummary);
            }
            catch
            {
--- a/com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs
+++ b/com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs
 {
    internal static class GrpcExtensions
    {
+        #region AgentInfo
        /// <summary>
         /// Converts an AgentInfo to a protobuf-generated AgentInfoActionPairProto
        /// </summary>
        }

        /// <summary>
+        /// Get summaries for the observations in the AgentInfo part of the AgentInfoActionPairProto.
+        /// </summary>
+        /// <param name="infoActionPair"></param>
+        /// <returns></returns>
+        public static List<ObservationSummary> GetObservationSummaries(this AgentInfoActionPairProto infoActionPair)
+        {
+            List<ObservationSummary> summariesOut = new List<ObservationSummary>();
+            var agentInfo = infoActionPair.AgentInfo;
+            foreach (var obs in agentInfo.Observations)
+            {
+                var summary = new ObservationSummary();
+                summary.shape = obs.Shape.ToArray();
+                summariesOut.Add(summary);
+            }
+
+            return summariesOut;
+        }
+
+
+        #endregion
+
+        #region BrainParameters
+        /// <summary>
         /// Converts a Brain into a Protobuf BrainInfoProto so it can be sent
        /// </summary>
        /// <returns>The BrainInfoProto generated.</returns>
        }

        /// <summary>
+        /// Convert a BrainParametersProto to a BrainParameters struct.
+        /// </summary>
+        /// <param name="bpp">An instance of a brain parameters protobuf object.</param>
+        /// <returns>A BrainParameters struct.</returns>
+        public static BrainParameters ToBrainParameters(this BrainParametersProto bpp)
+        {
+            var bp = new BrainParameters
+            {
+                vectorActionSize = bpp.VectorActionSize.ToArray(),
+                vectorActionDescriptions = bpp.VectorActionDescriptions.ToArray(),
+                vectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceType
+            };
+            return bp;
+        }
+
+        #endregion
+
+        #region DemonstrationMetaData
+        /// <summary>
        /// Convert metadata object to proto object.
        /// </summary>
        public static DemonstrationMetaProto ToProto(this DemonstrationMetaData dm)
                ApiVersion = DemonstrationMetaData.ApiVersion,
                MeanReward = dm.meanReward,
-                NumberSteps = dm.numberExperiences,
+                NumberSteps = dm.numberSteps,
                NumberEpisodes = dm.numberEpisodes,
                DemonstrationName = dm.demonstrationName
            };
            var dm = new DemonstrationMetaData
            {
                numberEpisodes = demoProto.NumberEpisodes,
-                numberExperiences = demoProto.NumberSteps,
+                numberSteps = demoProto.NumberSteps,
                meanReward = demoProto.MeanReward,
                demonstrationName = demoProto.DemonstrationName
            };
            }
            return dm;
        }
-
-        /// <summary>
-        /// Convert a BrainParametersProto to a BrainParameters struct.
-        /// </summary>
-        /// <param name="bpp">An instance of a brain parameters protobuf object.</param>
-        /// <returns>A BrainParameters struct.</returns>
-        public static BrainParameters ToBrainParameters(this BrainParametersProto bpp)
-        {
-            var bp = new BrainParameters
-            {
-                vectorActionSize = bpp.VectorActionSize.ToArray(),
-                vectorActionDescriptions = bpp.VectorActionDescriptions.ToArray(),
-                vectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceType
-            };
-            return bp;
-        }
+        #endregion

        public static UnityRLInitParameters ToUnityRLInitParameters(this UnityRLInitializationInputProto inputProto)
        {
            };
        }

+        #region AgentAction
        public static AgentAction ToAgentAction(this AgentActionProto aap)
        {
            return new AgentAction
            }
            return agentActions;
        }
+        #endregion
+        #region Observations
        public static ObservationProto ToProto(this Observation obs)
        {
            ObservationProto obsProto = null;
            observationProto.Shape.AddRange(shape);
            return observationProto;
        }
+        #endregion
    }
 }
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationWriter.cs
            }

            // Increment meta-data counters.
-            m_MetaData.numberExperiences++;
+            m_MetaData.numberSteps++;
            m_CumulativeReward += info.reward;
            if (info.done)
            {
--- a/com.unity.ml-agents/Runtime/Timer.cs
+++ b/com.unity.ml-agents/Runtime/Timer.cs
    }

    /// <summary>
-    /// Tracks the most recent value of a metric. This is analogous to gauges in statsd.
+    /// Tracks the most recent value of a metric. This is analogous to gauges in statsd and Prometheus.
    /// </summary>
    [DataContract]
    internal class GaugeNode
+        /// <summary>
+        /// The most recent value that the gauge was set to.
+        /// </summary>
+
+        /// <summary>
+        /// The smallest value that has been seen for the gauge since it was created.
+        /// </summary>
+
+        /// <summary>
+        /// The largest value that has been seen for the gauge since it was created.
+        /// </summary>
+
+        /// <summary>
+        /// The exponential moving average of the gauge value. This will take all values into account,
+        /// but weights older values less as more values are added.
+        /// </summary>
+
+        /// <summary>
+        /// The running average of all gauge values.
+        /// </summary>
+        [DataMember]
+        public float runningAverage;
+
+        /// <summary>
+        /// The number of times the gauge has been updated.
+        /// </summary>
+
+            runningAverage = value;
            minValue = value;
            maxValue = value;
            count = 1;
        {
+            ++count;
-            ++count;
+
+            // Update running average - see https://www.johndcook.com/blog/standard_deviation/ for formula.
+            runningAverage = runningAverage + (newValue - runningAverage) / count;
        }
    }

--- a/com.unity.ml-agents/Tests/Editor/TimerTest.cs
+++ b/com.unity.ml-agents/Tests/Editor/TimerTest.cs
            myTimer.Reset();
            Assert.AreEqual(myTimer.RootNode.Children, null);
        }
+
+        /// <summary>
+        /// Checks the weighted and running averages that gauges report after a series of
+        /// SetGauge calls: a constant series, an increasing series and a decreasing series.
+        /// </summary>
+        [Test]
+        public void TestGauges()
+        {
+            TimerStack myTimer = TimerStack.Instance;
+            myTimer.Reset();
+
+            // NUnit's Assert.AreEqual takes (expected, actual); keeping that order makes
+            // failure messages report the expected and observed values correctly.
+
+            // Simple test - adding 1's should keep that for the weighted and running averages.
+            myTimer.SetGauge("one", 1.0f);
+            var oneNode = myTimer.RootNode.Gauges["one"];
+            Assert.AreEqual(1.0f, oneNode.weightedAverage);
+            Assert.AreEqual(1.0f, oneNode.runningAverage);
+
+            for (int i = 0; i < 10; i++)
+            {
+                myTimer.SetGauge("one", 1.0f);
+            }
+
+            Assert.AreEqual(1.0f, oneNode.weightedAverage);
+            Assert.AreEqual(1.0f, oneNode.runningAverage);
+
+            // Try some more interesting values
+            myTimer.SetGauge("increasing", 1.0f);
+            myTimer.SetGauge("increasing", 2.0f);
+            myTimer.SetGauge("increasing", 3.0f);
+
+            myTimer.SetGauge("decreasing", 3.0f);
+            myTimer.SetGauge("decreasing", 2.0f);
+            myTimer.SetGauge("decreasing", 1.0f);
+            var increasingNode = myTimer.RootNode.Gauges["increasing"];
+            var decreasingNode = myTimer.RootNode.Gauges["decreasing"];
+
+            // Expect the running average to be (roughly) the same,
+            // but weighted averages will be biased differently.
+            Assert.AreEqual(2.0f, increasingNode.runningAverage);
+            Assert.AreEqual(2.0f, decreasingNode.runningAverage);
+
+            // The older values are actually weighted more heavily, so we expect the
+            // increasing series to have a lower moving average.
+            Assert.Less(increasingNode.weightedAverage, decreasingNode.weightedAverage);
+        }
    }
 }
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
 ## Installation problems

 ### Tensorflow dependency
-ML Agents requires TensorFlow; if you don't already have it installed, `pip` will try to install it when you install
-the ml-agents package.
+
+ML Agents requires TensorFlow; if you don't already have it installed, `pip`
+will try to install it when you install the ml-agents package.
+
-it means that there is no version of TensorFlow for your python environment. Some known potential causes are:
- * You're using 32-bit python instead of 64-bit. See the answer [here](https://stackoverflow.com/a/1405971/224264)
-  for how to tell which you have installed.
- * You're using python 3.8. Tensorflow plans to release packages for this as soon as possible; see
-  [this issue](https://github.com/tensorflow/tensorflow/issues/33374) for more details.
- * You have the `tensorflow-gpu` package installed. This is equivalent to `tensorflow`, however `pip` doesn't recognize
-  this. The best way to resolve this is to update to `tensorflow==1.15.0` which provides GPU support in the same package
-  (see the [release notes](https://github.com/tensorflow/tensorflow/issues/33374) for more details.)
- * You're on another architecture (e.g. ARM) which requires vendor provided packages.
-In all of these cases, the issue is a pip/python environment setup issue.  Please search the tensorflow github issues
-for similar problems and solutions before creating a new issue.
-
-## Scripting Runtime Environment not setup correctly
-
-If you haven't switched your scripting runtime version from .NET 3.5 to .NET 4.6
-or .NET 4.x, you will see such error message:
+it means that there is no version of TensorFlow for your python environment.
+Some known potential causes are:
-```console
-error CS1061: Type `System.Text.StringBuilder' does not contain a definition for `Clear' and no extension method `Clear' of type `System.Text.StringBuilder' could be found. Are you missing an assembly reference?
-```
+- You're using 32-bit python instead of 64-bit. See the answer
+  [here](https://stackoverflow.com/a/1405971/224264) for how to tell which you
+  have installed.
+- You're using python 3.8. Tensorflow plans to release packages for this as soon
+  as possible; see
+  [this issue](https://github.com/tensorflow/tensorflow/issues/33374) for more
+  details.
+- You have the `tensorflow-gpu` package installed. This is equivalent to
+  `tensorflow`, however `pip` doesn't recognize this. The best way to resolve
+  this is to update to `tensorflow==1.15.0` which provides GPU support in the
+  same package (see the
+  [release notes](https://github.com/tensorflow/tensorflow/issues/33374) for
+  more details.)
+- You're on another architecture (e.g. ARM) which requires vendor provided
+  packages.
-This is because .NET 3.5 doesn't support method Clear() for StringBuilder, refer
-to [Setting Up The ML-Agents Toolkit Within
-Unity](Installation.md#setting-up-ml-agent-within-unity) for solution.
+In all of these cases, the issue is a pip/python environment setup issue. Please
+search the tensorflow github issues for similar problems and solutions before
+creating a new issue.
-If you directly import your Unity environment without building it in the
-editor, you might need to give it additional permissions to execute it.
+If you directly import your Unity environment without building it in the editor,
+you might need to give it additional permissions to execute it.

 If you receive such a permission error on macOS, run:

 ```

 On Windows, you can find
-[instructions](https://technet.microsoft.com/en-us/library/cc754344(v=ws.11).aspx).
+[instructions](<https://technet.microsoft.com/en-us/library/cc754344(v=ws.11).aspx>).

 ## Environment Connection Timeout


 There may be a number of possible causes:

-* _Cause_: There may be no agent in the scene
-* _Cause_: On OSX, the firewall may be preventing communication with the
+- _Cause_: There may be no agent in the scene
+- _Cause_: On OSX, the firewall may be preventing communication with the
-* _Cause_: An error happened in the Unity Environment preventing communication.
-  _Solution_: Look into the [log
-  files](https://docs.unity3d.com/Manual/LogFiles.html) generated by the Unity
-  Environment to figure what error happened.
-* _Cause_: You have assigned HTTP_PROXY and HTTPS_PROXY values in your
+- _Cause_: An error happened in the Unity Environment preventing communication.
+  _Solution_: Look into the
+  [log files](https://docs.unity3d.com/Manual/LogFiles.html) generated by the
+  Unity Environment to figure out what error happened.
+- _Cause_: You have assigned `HTTP_PROXY` and `HTTPS_PROXY` values in your
-If you receive an exception `"Couldn't launch new environment because
-communication port {} is still in use. "`, you can change the worker number in
-the Python script when calling
+If you receive an exception
+`"Couldn't launch new environment because communication port {} is still in use. "`,
+you can change the worker number in the Python script when calling

 ```python
 UnityEnvironment(file_name=filename, worker_id=X)

 If you receive a message `Mean reward : nan` when attempting to train a model
 using PPO, this is due to the episodes of the Learning Environment not
-terminating. In order to address this, set `Max Steps` for the
-Agents within the Scene Inspector to a value greater than 0. Alternatively, it
-is possible to manually set `done` conditions for episodes from within scripts
-for custom episode-terminating events.
-
-## Problems with training on AWS
-
-Please refer to [Training on Amazon Web Service FAQ](Training-on-Amazon-Web-Service.md#faq)
-
-# Known Issues
-
-## Release 0.10.0
-* ml-agents 0.10.0 and earlier were incompatible with TensorFlow 1.15.0; the graph could contain
- an operator that `tensorflow_to_barracuda` didn't handle. This was fixed in the 0.11.0 release.
+terminating. In order to address this, set `Max Steps` for the Agents within the
+Scene Inspector to a value greater than 0. Alternatively, it is possible to
+manually set `done` conditions for episodes from within scripts for custom
+episode-terminating events.
--- a/docs/Getting-Started.md
+++ b/docs/Getting-Started.md
 # Getting Started Guide

-This guide walks through the end-to-end process of opening an ML-Agents
-toolkit example environment in Unity, building the Unity executable, training an
-Agent in it, and finally embedding the trained model into the Unity environment.
-
-The ML-Agents toolkit includes a number of [example
-environments](Learning-Environment-Examples.md) which you can examine to help
-understand the different ways in which the ML-Agents toolkit can be used. These
-environments can also serve as templates for new environments or as ways to test
-new ML algorithms. After reading this tutorial, you should be able to explore
-train the example environments.
-
-If you are not familiar with the [Unity Engine](https://unity3d.com/unity), we
-highly recommend the [Roll-a-ball
-tutorial](https://unity3d.com/learn/tutorials/s/roll-ball-tutorial) to learn all
-the basic concepts first.
+This guide walks through the end-to-end process of opening one of our
+[example environments](Learning-Environment-Examples.md) in Unity, training an
+Agent in it, and embedding the trained model into the Unity environment. After
+reading this tutorial, you should be able to train any of the example
+environments. If you are not familiar with the
+[Unity Engine](https://unity3d.com/unity), view our
+[Background: Unity](Background-Unity.md) page for helpful pointers.
+Additionally, if you're not familiar with machine learning, view our
+[Background: Machine Learning](Background-Machine-Learning.md) page for a brief
+overview and helpful pointers.
-This guide uses the **3D Balance Ball** environment to teach the basic concepts and
-usage patterns of ML-Agents. 3D Balance Ball
-contains a number of agent cubes and balls (which are all copies of each other).
-Each agent cube tries to keep its ball from falling by rotating either
-horizontally or vertically. In this environment, an agent cube is an **Agent** that
-receives a reward for every step that it balances the ball. An agent is also
-penalized with a negative reward for dropping the ball. The goal of the training
-process is to have the agents learn to balance the ball on their head.
+For this guide, we'll use the **3D Balance Ball** environment which contains a
+number of agent cubes and balls (which are all copies of each other). Each agent
+cube tries to keep its ball from falling by rotating either horizontally or
+vertically. In this environment, an agent cube is an **Agent** that receives a
+reward for every step that it balances the ball. An agent is also penalized with
+a negative reward for dropping the ball. The goal of the training process is to
+have the agents learn to balance the ball on their head.
-In order to install and set up the ML-Agents toolkit, the Python dependencies
-and Unity, see the [installation instructions](Installation.md).
-
-Depending on your version of Unity, it may be necessary to change the **Scripting Runtime Version** of your project. This can be done as follows:
+If you haven't already, follow the [installation instructions](Installation.md).
+Afterwards, open the Unity Project that contains all the example environments:
-2. On the Projects dialog, choose the **Add** option at the top of the window.
-3. Using the file dialog that opens, locate the `Project` folder
-   within the ML-Agents toolkit project and click **Open**.
-4. Go to **Edit** > **Project Settings** > **Player**
-5. For **each** of the platforms you target (**PC, Mac and Linux Standalone**,
-   **iOS** or **Android**):
-    1. Expand the **Other Settings** section.
-    2. Select **Scripting Runtime Version** to **Experimental (.NET 4.6
-       Equivalent or .NET 4.x Equivalent)**
-6. Go to **File** > **Save Project**
-
+1. On the Projects dialog, choose the **Add** option at the top of the window.
+1. Using the file dialog that opens, locate the `Project` folder within the
+   ML-Agents Toolkit and click **Open**.
+1. In the **Project** window, go to the
+   `Assets/ML-Agents/Examples/3DBall/Scenes` folder and open the `3DBall` scene
+   file.
-_environment_. In the context of Unity, an environment is a scene containing
-one or more Agent objects, and, of course, the other
-entities that an agent interacts with.
+_environment_. In the context of Unity, an environment is a scene containing one
+or more Agent objects, and, of course, the other entities that an agent
+interacts with.

 ![Unity Editor](images/mlagents-3DBallHierarchy.png)

 window. The Inspector shows every component on a GameObject.

 The first thing you may notice after opening the 3D Balance Ball scene is that
-it contains not one, but several agent cubes.  Each agent cube in the scene is an
-independent agent, but they all share the same Behavior. 3D Balance Ball does this
-to speed up training since all twelve agents contribute to training in parallel.
+it contains not one, but several agent cubes. Each agent cube in the scene is an
+independent agent, but they all share the same Behavior. 3D Balance Ball does
+this to speed up training since all twelve agents contribute to training in
+parallel.

 ### Agent

 behavior:

-* **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
-  determines how an Agent makes decisions. More on Behavior Parameters in
-  the next section.
-* **Max Step** — Defines how many simulation steps can occur before the Agent's
+- **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
+  determines how an Agent makes decisions.
+- **Max Step** — Defines how many simulation steps can occur before the Agent's
-When you create an Agent, you must extend the base Agent class.
-The Ball3DAgent subclass defines the following methods:
-
-* `Agent.OnEpisodeBegin()` — Called at the beginning of an Agent's episode, including at the beginning
-  of the simulation. The Ball3DAgent class uses this function to reset the
-  agent cube and ball to their starting positions. The function randomizes the reset values so that the
-  training generalizes to more than a specific starting position and agent cube
-  attitude.
-* `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation step. Responsible for
-  collecting the Agent's observations of the environment. Since the Behavior
-  Parameters of the Agent are set with vector observation
-  space with a state size of 8, the `CollectObservations(VectorSensor sensor)` must call
-  `VectorSensor.AddObservation()` such that vector size adds up to 8.
-* `Agent.OnActionReceived()` — Called every time the Agent receives an action to take. Receives the action chosen
-  by the Agent. The vector action spaces result in a
-  small change in the agent cube's rotation at each step. The `OnActionReceived()` method
-  assigns a reward to the Agent; in this example, an Agent receives a small
-  positive reward for each step it keeps the ball on the agent cube's head and a larger,
-  negative reward for dropping the ball. An Agent's episode is also ended when it
-  drops the ball so that it will reset with a new ball for the next simulation
-  step.
-* `Agent.Heuristic()` - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
-  Parameters of the Agent, the Agent will use the `Heuristic()` method to generate
-  the actions of the Agent. As such, the `Heuristic()` method takes an array of
-  floats. In the case of the Ball 3D Agent, the `Heuristic()` method converts the
-  keyboard inputs into actions.
-
-
 #### Behavior Parameters : Vector Observation Space

 Before making a decision, an agent collects its observation about its state in
-The Behavior Parameters of the 3D Balance Ball example uses a **Space Size** of 8.
-This means that the feature
-vector containing the Agent's observations contains eight elements: the `x` and
-`z` components of the agent cube's rotation and the `x`, `y`, and `z` components
-of the ball's relative position and velocity. (The observation values are
-defined in the Agent's `CollectObservations(VectorSensor sensor)` method.)
+The Behavior Parameters of the 3D Balance Ball example uses a `Space Size` of 8.
+This means that the feature vector containing the Agent's observations contains
+eight elements: the `x` and `z` components of the agent cube's rotation and the
+`x`, `y`, and `z` components of the ball's relative position and velocity.
-An Agent is given instructions in the form of a float array of *actions*.
-ML-Agents toolkit classifies actions into two types: the **Continuous** vector
-action space is a vector of numbers that can vary continuously. What each
-element of the vector means is defined by the Agent logic (the training
-process just learns what values are better given particular state observations
-based on the rewards received when it tries different values). For example, an
-element might represent a force or torque applied to a `Rigidbody` in the Agent.
-The **Discrete** action vector space defines its actions as tables. An action
-given to the Agent is an array of indices into tables.
-
-The 3D Balance Ball example is programmed to use continuous action
-space with `Space Size` of 2.
+An Agent is given instructions in the form of a float array of _actions_.
+ML-Agents Toolkit classifies actions into two types: continuous and discrete.
+The 3D Balance Ball example is programmed to use continuous action space which
+is a vector of numbers that can vary continuously. More specifically, it uses
+a `Space Size` of 2 to control the amount of `x` and `z` rotations to apply to
+itself to keep the ball balanced on its head.
-[Unity Inference Engine](Unity-Inference-Engine.md) to run these models
-inside Unity. In this section, we will use the pre-trained model for the
-3D Ball example.
+[Unity Inference Engine](Unity-Inference-Engine.md) to run these models inside
+Unity. In this section, we will use the pre-trained model for the 3D Ball
+example.
-1. In the **Project** window, go to the `Assets/ML-Agents/Examples/3DBall/Scenes` folder
-   and open the `3DBall` scene file.
-2. In the **Project** window, go to the `Assets/ML-Agents/Examples/3DBall/Prefabs` folder.
-   Expand `3DBall` and click on the `Agent` prefab.  You should see the `Agent` prefab in the **Inspector** window.
+1. In the **Project** window, go to the
+   `Assets/ML-Agents/Examples/3DBall/Prefabs` folder. Expand `3DBall` and click
+   on the `Agent` prefab. You should see the `Agent` prefab in the **Inspector**
+   window.
-   **Note**: The platforms in the `3DBall` scene were created using the `3DBall` prefab.  Instead of updating all 12 platforms individually, you can update the `3DBall` prefab instead.
+   **Note**: The platforms in the `3DBall` scene were created using the `3DBall`
+   prefab. Instead of updating all 12 platforms individually, you can update the
+   `3DBall` prefab instead.
-3. In the **Project** window, drag the **3DBall** Model located in
-   `Assets/ML-Agents/Examples/3DBall/TFModels` into the `Model` property under `Behavior Parameters (Script)` component in the Agent GameObject **Inspector** window.
+1. In the **Project** window, drag the **3DBall** Model located in
+   `Assets/ML-Agents/Examples/3DBall/TFModels` into the `Model` property under
+   `Behavior Parameters (Script)` component in the Agent GameObject
+   **Inspector** window.
-4. You should notice that each `Agent` under each `3DBall` in the **Hierarchy** windows now contains **3DBall** as `Model` on the `Behavior Parameters`. __Note__ : You can modify multiple game objects in a scene by selecting them all at
-   once using the search bar in the Scene Hierarchy.
-8. Select the **InferenceDevice** to use for this model (CPU or GPU) on the Agent.
-   _Note: CPU is faster for the majority of ML-Agents toolkit generated models_
-9. Click the **Play** button and you will see the platforms balance the balls
-   using the pre-trained model.
+1. You should notice that each `Agent` under each `3DBall` in the **Hierarchy**
+   window now contains **3DBall** as `Model` on the `Behavior Parameters`.
+   **Note** : You can modify multiple game objects in a scene by selecting them
+   all at once using the search bar in the Scene Hierarchy.
+1. Set the **Inference Device** to use for this model as `CPU`.
+1. Click the :arrow_forward: button in the Unity Editor and you will see the
+   platforms balance the balls using the pre-trained model.
-While we provide pre-trained `.nn` files for the agents in this environment, any environment you make yourself will require training agents from scratch to generate a new model file. We can do this using reinforcement learning.
-
-In order to train an agent to correctly balance the ball, we provide two
-deep reinforcement learning algorithms.
-
-The default algorithm is Proximal Policy Optimization (PPO). This
-is a method that has been shown to be more general purpose and stable
-than many other RL algorithms. For more information on PPO, OpenAI
-has a [blog post](https://blog.openai.com/openai-baselines-ppo/)
-explaining it, and [our page](Training-PPO.md) for how to use it in training.
-
-We also provide Soft-Actor Critic, an off-policy algorithm that
-has been shown to be both stable and sample-efficient.
-For more information on SAC, see UC Berkeley's
-[blog post](https://bair.berkeley.edu/blog/2018/12/14/sac/) and
-[our page](Training-SAC.md) for more guidance on when to use SAC vs. PPO. To
-use SAC to train Balance Ball, replace all references to `config/trainer_config.yaml`
-with `config/sac_trainer_config.yaml` below.
-
-To train the agents within the Balance Ball environment, we will be using the
-ML-Agents Python package. We have provided a convenient command called `mlagents-learn`
-which accepts arguments used to configure both training and inference phases.
+While we provide pre-trained `.nn` files for the agents in this environment, any
+environment you make yourself will require training agents from scratch to
+generate a new model file. In this section we will demonstrate how to use the
+reinforcement learning algorithms that are part of the ML-Agents Python package
+to accomplish this. We have provided a convenient command `mlagents-learn` which
+accepts arguments used to configure both training and inference phases.
-2. Navigate to the folder where you cloned the ML-Agents toolkit repository.
-   **Note**: If you followed the default [installation](Installation.md), then
-   you should be able to run `mlagents-learn` from any directory.
-3. Run `mlagents-learn <trainer-config-path> --run-id=<run-identifier>`
-   where:
-    - `<trainer-config-path>` is the relative or absolute filepath of the
-      trainer configuration. The defaults used by example environments included
-      in `MLAgentsSDK` can be found in `config/trainer_config.yaml`.
-    - `<run-identifier>` is a string used to separate the results of different
-      training runs. Make sure to use one that hasn't been used already!
-4. If you cloned the ML-Agents repo, then you can simply run
-
-      ```sh
-      mlagents-learn config/trainer_config.yaml --run-id=firstRun
-      ```
-
-5. When the message _"Start training by pressing the Play button in the Unity
+1. Navigate to the folder where you cloned the `ml-agents` repository. **Note**:
+   If you followed the default [installation](Installation.md), then you should
+   be able to run `mlagents-learn` from any directory.
+1. Run `mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun`.
+   - `config/trainer_config.yaml` is the path to a default training
+     configuration file that we provide. It includes training configurations for
+     all our example environments, including 3DBall.
+   - `run-id` is a unique name for this training session.
+1. When the message _"Start training by pressing the Play button in the Unity
-**Note**: If you're using Anaconda, don't forget to activate the ml-agents
-environment first.
-
-The `--time-scale=100` sets the `Time.TimeScale` value in Unity.
-
-**Note**: You can train using an executable rather than the Editor. To do so,
-follow the instructions in
-[Using an Executable](Learning-Environment-Executable.md).
-
-**Note**: Re-running this command will start training from scratch again. To resume
-a previous training run, append the `--load` flag and give the same `--run-id` as the
-run you want to resume.
-
 If `mlagents-learn` runs correctly and starts training, you should see something
 like this:

        sequence_length:     64
        summary_freq:        1000
        use_recurrent:       False
-        summary_path:        ./summaries/first-run-0
+        summary_path:        ./summaries/first3DBallRun
-        model_path:	./models/first-run-0/3DBallLearning
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 4000. Mean Reward: 2.151. Std of Reward: 1.432. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 5000. Mean Reward: 3.175. Std of Reward: 2.250. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 6000. Mean Reward: 4.898. Std of Reward: 4.019. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 7000. Mean Reward: 6.716. Std of Reward: 5.125. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 8000. Mean Reward: 12.124. Std of Reward: 11.929. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 9000. Mean Reward: 18.151. Std of Reward: 16.871. Training.
-INFO:mlagents.trainers: first-run-0: 3DBallLearning: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
+        model_path: ./models/first3DBallRun/3DBallLearning
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 4000. Mean Reward: 2.151. Std of Reward: 1.432. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 5000. Mean Reward: 3.175. Std of Reward: 2.250. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 6000. Mean Reward: 4.898. Std of Reward: 4.019. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 7000. Mean Reward: 6.716. Std of Reward: 5.125. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 8000. Mean Reward: 12.124. Std of Reward: 11.929. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 9000. Mean Reward: 18.151. Std of Reward: 16.871. Training.
+INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
+Note how the `Mean Reward` value printed to the screen increases as training
+progresses. This is a positive sign that training is succeeding.

 ### Observing Training Progress

 tensorboard --logdir=summaries
 ```

-Then navigate to `localhost:6006` in your browser.
-
-From TensorBoard, you will see the summary statistics:
-
-* **Lesson** - only interesting when performing [curriculum
-  training](Training-Curriculum-Learning.md). This is not used in the 3D Balance
-  Ball environment.
-* **Cumulative Reward** - The mean cumulative episode reward over all agents. Should
-  increase during a successful training session.
-* **Entropy** - How random the decisions of the model are. Should slowly decrease
-  during a successful training process. If it decreases too quickly, the `beta`
-  hyperparameter should be increased.
-* **Episode Length** - The mean length of each episode in the environment for all
-  agents.
-* **Learning Rate** - How large a step the training algorithm takes as it searches
-  for the optimal policy. Should decrease over time.
-* **Policy Loss** - The mean loss of the policy function update. Correlates to how
-  much the policy (process for deciding actions) is changing. The magnitude of
-  this should decrease during a successful training session.
-* **Value Estimate** - The mean value estimate for all states visited by the agent.
-  Should increase during a successful training session.
-* **Value Loss** - The mean loss of the value function update. Correlates to how
-  well the model is able to predict the value of each state. This should
-  decrease during a successful training session.
+Then navigate to `localhost:6006` in your browser to view the TensorBoard
+summary statistics as shown below. For the purposes of this section, the most
+important statistic is `Environment/Cumulative Reward` which should increase
+throughout training, eventually converging close to `100` which is the maximum
+reward the agent can accumulate.

 ![Example TensorBoard Run](images/mlagents-TensorBoard.png)

 (denoted by the `Saved Model` message) you can add it to the Unity project and
-use it with compatible Agents (the Agents that generated the model).
-__Note:__ Do not just close the Unity Window once the `Saved Model` message appears.
+use it with compatible Agents (the Agents that generated the model). **Note:**
+Do not just close the Unity Window once the `Saved Model` message appears.
-command-line prompt. If you close the window manually, the `.nn` file
-containing the trained model is not exported into the ml-agents folder.
+command-line prompt. If you close the window manually, the `.nn` file containing
+the trained model is not exported into the ml-agents folder.
-If you've quit the training early using Ctrl+C and want to resume training, run the
-same command again, appending the `--resume` flag:
+If you've quit the training early using Ctrl+C and want to resume training, run
+the same command again, appending the `--resume` flag:
-mlagents-learn config/trainer_config.yaml --run-id=firstRun --resume
+mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun --resume
-`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding to the model.
-(**Note:** There is a known bug on Windows that causes the saving of the model to
-fail when you early terminate the training, it's recommended to wait until Step
-has reached the max_steps parameter you set in trainer_config.yaml.) This file
-corresponds to your model's latest checkpoint. You can now embed this trained
-model into your Agents by following the steps below, which is similar to
-the steps described
-[above](#running-a-pre-trained-model).
+`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding
+to the model. This file corresponds to your model's latest checkpoint. You can
+now embed this trained model into your Agents by following the steps below,
+which are similar to the steps described [above](#running-a-pre-trained-model).
-2. Open the Unity Editor, and select the **3DBall** scene as described above.
-3. Select the  **3DBall** prefab Agent object.
-4. Drag the `<behavior_name>.nn` file from the Project window of
-   the Editor to the **Model** placeholder in the **Ball3DAgent**
-   inspector window.
-5. Press the :arrow_forward: button at the top of the Editor.
+1. Open the Unity Editor, and select the **3DBall** scene as described above.
+1. Select the **3DBall** prefab Agent object.
+1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
+   the **Model** placeholder in the **Ball3DAgent** inspector window.
+1. Press the :arrow_forward: button at the top of the Editor.
- For more information on the ML-Agents toolkit, in addition to helpful
+- For more information on the ML-Agents Toolkit, in addition to helpful
-  check out the [Making a New Learning
-  Environment](Learning-Environment-Create-New.md) page.
+  check out the
+  [Making a New Learning Environment](Learning-Environment-Create-New.md) page.
+- For an overview of the more complex example environments that are provided in
+  this toolkit, check out the
+  [Example Environments](Learning-Environment-Examples.md) page.
+- For more information on the various training options available, check out the
+  [Training ML-Agents](Training-ML-Agents.md) page.
--- a/docs/Installation-Anaconda-Windows.md
+++ b/docs/Installation-Anaconda-Windows.md
 # Installing ML-Agents Toolkit for Windows (Deprecated)

-Note: We no longer use this guide ourselves and so it may not work correctly. We've decided to
- keep it up just in case it is helpful to you.
+:warning: **Note:** We no longer use this guide ourselves and so it may not work
+correctly. We've decided to keep it up just in case it is helpful to you.

 The ML-Agents toolkit supports Windows 10. While it might be possible to run the
 ML-Agents toolkit using other versions of Windows, it has not been tested on

 [Download](https://www.anaconda.com/download/#windows) and install Anaconda for
 Windows. By using Anaconda, you can manage separate environments for different
-distributions of Python. Python 3.6.1 or higher is required as we no longer support
-Python 2. In this guide, we are using Python version 3.6 and Anaconda version
-5.1
+distributions of Python. Python 3.6.1 or higher is required as we no longer
+support Python 2. In this guide, we are using Python version 3.6 and Anaconda
+version 5.1
 ([64-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86_64.exe)
 or [32-bit](https://repo.continuum.io/archive/Anaconda3-5.1.0-Windows-x86.exe)
 direct links).
  <img src="images/anaconda_default.PNG" alt="Anaconda Install" width="500" border="10" />
 </p>

-After installation, you must open __Anaconda Navigator__ to finish the setup.
+After installation, you must open **Anaconda Navigator** to finish the setup.
 From the Windows search bar, type _anaconda navigator_. You can close Anaconda
 Navigator after it opens.


 Type `environment variables` in the search bar (this can be reached by hitting
 the Windows key or the bottom left Windows button). You should see an option
-called __Edit the system environment variables__.
+called **Edit the system environment variables**.

 <p align="center">
  <img src="images/edit_env_var.png"

-From here, click the __Environment Variables__ button. Double click "Path" under
-__System variable__ to edit the "Path" variable, click __New__ to add the
+From here, click the **Environment Variables** button. Double click "Path" under
+**System variable** to edit the "Path" variable, click **New** to add the
 following new paths.

 ```console
 install these Python dependencies.

 If you haven't already, clone the ML-Agents Toolkit GitHub repository to your
-local computer. You can do this using Git ([download
-here](https://git-scm.com/download/win)) and running the following commands in
-an Anaconda Prompt _(if you open a new prompt, be sure to activate the ml-agents
-Conda environment by typing `activate ml-agents`)_:
+local computer. You can do this using Git
+([download here](https://git-scm.com/download/win)) and running the following
+commands in an Anaconda Prompt _(if you open a new prompt, be sure to activate
+the ml-agents Conda environment by typing `activate ml-agents`)_:
-The `--branch latest_release` option will switch to the tag of the latest stable release.
-Omitting that will get the `master` branch which is potentially unstable.
+
+The `--branch latest_release` option will switch to the tag of the latest stable
+release. Omitting that will get the `master` branch which is potentially
+unstable.
-The `com.unity.ml-agents` subdirectory contains the core code to add to your projects.
-The `Project` subdirectory contains many [example environments](Learning-Environment-Examples.md)
-to help you get started.
+The `com.unity.ml-agents` subdirectory contains the core code to add to your
+projects. The `Project` subdirectory contains many
+[example environments](Learning-Environment-Examples.md) to help you get
+started.
-The `ml-agents` subdirectory contains a Python package which provides deep reinforcement
-learning trainers to use with Unity environments.
+The `ml-agents` subdirectory contains a Python package which provides deep
+reinforcement learning trainers to use with Unity environments.
-The `ml-agents-envs` subdirectory contains a Python API to interface with Unity, which
-the `ml-agents` package depends on.
+The `ml-agents-envs` subdirectory contains a Python API to interface with Unity,
+which the `ml-agents` package depends on.
-Keep in mind where the files were downloaded, as you will need the
-trainer config files in this directory when running `mlagents-learn`.
-Make sure you are connected to the Internet and then type in the Anaconda
-Prompt:
+Keep in mind where the files were downloaded, as you will need the trainer
+config files in this directory when running `mlagents-learn`. Make sure you are
+connected to the Internet and then type in the Anaconda Prompt:

 ```console
 pip install mlagents
 the ML-Agents toolkit.

-Sometimes on Windows, when you use pip to install certain Python packages, the pip will get stuck when trying to read the cache of the package. If you see this, you can try:
+Sometimes on Windows, when you use pip to install certain Python packages,
+pip will get stuck when trying to read the cache of the package. If you see
+this, you can try:

 ```console
 pip install mlagents --no-cache-dir

 ### Installing for Development

-If you intend to make modifications to `ml-agents` or `ml-agents-envs`, you should install
-the packages from the cloned repo rather than from PyPi. To do this, you will need to install
- `ml-agents` and `ml-agents-envs` separately.
+If you intend to make modifications to `ml-agents` or `ml-agents-envs`, you
+should install the packages from the cloned repo rather than from PyPi. To do
+this, you will need to install `ml-agents` and `ml-agents-envs` separately.
-cloned or downloaded the files, from the Anaconda Prompt, change to the ml-agents
-subdirectory inside the ml-agents directory:
+cloned or downloaded the files, from the Anaconda Prompt, change to the
+ml-agents subdirectory inside the ml-agents directory:

 ```console
 cd C:\Downloads\ml-agents
 pip install -e .
 ```

-Running pip with the `-e` flag will let you make changes to the Python files directly and have those
-reflected when you run `mlagents-learn`. It is important to install these packages in this order as the
-`mlagents` package depends on `mlagents_envs`, and installing it in the other
-order will download `mlagents_envs` from PyPi.
+Running pip with the `-e` flag will let you make changes to the Python files
+directly and have those reflected when you run `mlagents-learn`. It is important
+to install these packages in this order as the `mlagents` package depends on
+`mlagents_envs`, and installing it in the other order will download
+`mlagents_envs` from PyPi.

 ## (Optional) Step 4: GPU Training using The ML-Agents Toolkit

 Additionally, you will need to check if your GPU is CUDA compatible. Please
 check Nvidia's page [here](https://developer.nvidia.com/cuda-gpus).

-Currently for the ML-Agents toolkit, only CUDA v9.0 and cuDNN v7.0.5 is supported.
+Currently for the ML-Agents toolkit, only CUDA v9.0 and cuDNN v7.0.5 are
+supported.

 ### Install Nvidia CUDA toolkit

 this guide, we are using version
 [9.0.176](https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_win10_network-exe)).

-Before installing, please make sure you __close any running instances of Unity
-or Visual Studio__.
+Before installing, please make sure you **close any running instances of Unity
+or Visual Studio**.

 Run the installer and select the Express option. Note the directory where you
 installed the CUDA toolkit. In this guide, we installed in the directory
 </p>

 Once you've signed up, go back to the cuDNN
-[downloads page](https://developer.nvidia.com/cudnn).
-You may or may not be asked to fill out a short survey. When you get to the list
-cuDNN releases, __make sure you are downloading the right version for the CUDA
-toolkit you installed in Step 1.__ In this guide, we are using version 7.0.5 for
-CUDA toolkit version 9.0
+[downloads page](https://developer.nvidia.com/cudnn). You may or may not be
+asked to fill out a short survey. When you get to the list of cuDNN releases,
+**make sure you are downloading the right version for the CUDA toolkit you
+installed in Step 1.** In this guide, we are using version 7.0.5 for CUDA
+toolkit version 9.0
 ([direct link](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v7.0.5/prod/9.0_20171129/cudnn-9.0-windows10-x64-v7)).

 After you have downloaded the cuDNN files, you will need to extract the files

 To set the environment variable, type `environment variables` in the search bar
 (this can be reached by hitting the Windows key or the bottom left Windows
-button). You should see an option called __Edit the system environment
-variables__.
+button). You should see an option called **Edit the system environment
+variables**.

 <p align="center">
  <img src="images/edit_env_var.png"

-From here, click the __Environment Variables__ button. Click __New__ to add a
-new system variable _(make sure you do this under __System variables__ and not
+From here, click the **Environment Variables** button. Click **New** to add a
+new system variable _(make sure you do this under **System variables** and not
 User variables)_.

 <p align="center">
 </p>

-For __Variable Name__, enter `CUDA_HOME`. For the variable value, put the
+For **Variable Name**, enter `CUDA_HOME`. For the variable value, put the
-is `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`. Press __OK__ once.
+is `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`. Press **OK** once.

 <p align="center">
  <img src="images/system_variable_name_value.PNG"

-To set the two path variables, inside the same __Environment Variables__ window
-and under the second box called __System Variables__, find a variable called
-`Path` and click __Edit__. You will add two directories to the list. For this
+To set the two path variables, inside the same **Environment Variables** window
+and under the second box called **System Variables**, find a variable called
+`Path` and click **Edit**. You will add two directories to the list. For this
 guide, the two entries would look like:

 ```console

 Next, install `tensorflow-gpu` using `pip`. You'll need version 1.7.1. In an
 Anaconda Prompt with the Conda environment ml-agents activated, type in the
-following command to uninstall TensorFlow for cpu and install TensorFlow
-for gpu _(make sure you are connected to the Internet)_:
+following command to uninstall TensorFlow for cpu and install TensorFlow for gpu
+_(make sure you are connected to the Internet)_:

 ```sh
 pip uninstall tensorflow
 Lastly, you should test to see if everything installed properly and that
-TensorFlow can identify your GPU. In the same Anaconda Prompt, open Python
-in the Prompt by calling:
+TensorFlow can identify your GPU. In the same Anaconda Prompt, open Python in
+the Prompt by calling:

 ```sh
 python
--- a/docs/Learning-Environment-Design-Agents.md
+++ b/docs/Learning-Environment-Design-Agents.md
 # Agents

 An agent is an entity that can observe its environment, decide on the best
-course of action using those observations, and execute those actions within
-its environment. Agents can be created in Unity by extending
-the `Agent` class. The most important aspects of creating agents that can
-successfully learn are the observations the agent collects,
-and the reward you assign to estimate the value of the
-agent's current state toward accomplishing its tasks.
+course of action using those observations, and execute those actions within its
+environment. Agents can be created in Unity by extending the `Agent` class. The
+most important aspects of creating agents that can successfully learn are the
+observations the agent collects, and the reward you assign to estimate the value
+of the agent's current state toward accomplishing its tasks.

 An Agent passes its observations to its Policy. The Policy then makes a decision
 and passes the chosen action back to the agent. Your agent code must execute the
 discover the optimal decision-making policy.

-The `Policy` class abstracts out the decision making logic from the Agent itself so
-that you can use the same Policy in multiple Agents. How a Policy makes its
+The `Policy` class abstracts out the decision making logic from the Agent itself
+so that you can use the same Policy in multiple Agents. How a Policy makes its
-write your own Policy. If the Agent has a `Model` file, its Policy will use
-the neural network `Model` to take decisions.
+write your own Policy. If the Agent has a `Model` file, its Policy will use the
+neural network `Model` to take decisions.
+
+When you create an Agent, you must extend the base Agent class. This includes
+implementing the following methods:
+
+- `Agent.OnEpisodeBegin()` — Called at the beginning of an Agent's episode,
+  including at the beginning of the simulation. The Ball3DAgent class uses this
+  function to reset the agent cube and ball to their starting positions. The
+  function randomizes the reset values so that the training generalizes to more
+  than a specific starting position and agent cube attitude.
+- `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation
+  step. Responsible for collecting the Agent's observations of the environment.
+  Since the Behavior Parameters of the Agent are set with vector observation
+  space with a state size of 8, the `CollectObservations(VectorSensor sensor)`
+  must call `VectorSensor.AddObservation()` such that vector size adds up to 8.
+- `Agent.OnActionReceived()` — Called every time the Agent receives an action to
+  take. Receives the action chosen by the Agent. The vector action spaces result
+  in a small change in the agent cube's rotation at each step. The
+  `OnActionReceived()` method assigns a reward to the Agent; in this example, an
+  Agent receives a small positive reward for each step it keeps the ball on the
+  agent cube's head and a larger, negative reward for dropping the ball. An
+  Agent's episode is also ended when it drops the ball so that it will reset
+  with a new ball for the next simulation step.
+- `Agent.Heuristic()` — When the `Behavior Type` is set to `Heuristic Only` in
+  the Behavior Parameters of the Agent, the Agent will use the `Heuristic()`
+  method to generate the actions of the Agent. As such, the `Heuristic()` method
+  returns an array of floats. In the case of the Ball 3D Agent, the
+  `Heuristic()` method converts the keyboard inputs into actions.
-a decision.
-Agents will request a decision when `Agent.RequestDecision()` is called. If you need
-the Agent to request decisions on its own at regular intervals, add a
-`Decision Requester` component to the Agent's GameObject. Making decisions at regular step
-intervals is generally most appropriate for physics-based simulations. For example, an
-agent in a robotic simulator that must provide fine-control of joint torques
-should make its decisions every step of the simulation. On the other hand, an
-agent that only needs to make decisions when certain game or simulation events
-occur, such as in a turn-based game, should call `Agent.RequestDecision()` manually.
+a decision. Agents will request a decision when `Agent.RequestDecision()` is
+called. If you need the Agent to request decisions on its own at regular
+intervals, add a `Decision Requester` component to the Agent's GameObject.
+Making decisions at regular step intervals is generally most appropriate for
+physics-based simulations. For example, an agent in a robotic simulator that
+must provide fine-control of joint torques should make its decisions every step
+of the simulation. On the other hand, an agent that only needs to make decisions
+when certain game or simulation events occur, such as in a turn-based game,
+should call `Agent.RequestDecision()` manually.
-To make informed decisions, an agent must first make observations of the state of
-the environment. The observations are collected by Sensors attached to the agent
-GameObject. By default, agents come with a `VectorSensor` which allows them to
-collect floating-point observations into a single array. There are additional
-sensor components which can be attached to the agent GameObject which collect their own
-observations, or modify other observations. These are:
+To make informed decisions, an agent must first make observations of the state
+of the environment. The observations are collected by Sensors attached to the
+agent GameObject. By default, agents come with a `VectorSensor` which allows
+them to collect floating-point observations into a single array. There are
+additional sensor components which can be attached to the agent GameObject which
+collect their own observations, or modify other observations. These are:
-* `CameraSensorComponent` - Allows image from `Camera` to be used as observation.
-* `RenderTextureSensorComponent` - Allows content of `RenderTexture` to be used as observation.
-* `RayPerceptionSensorComponent` - Allows information from set of ray-casts to be used as observation.
+- `CameraSensorComponent` - Allows image from `Camera` to be used as
+  observation.
+- `RenderTextureSensorComponent` - Allows content of `RenderTexture` to be used
+  as observation.
+- `RayPerceptionSensorComponent` - Allows information from set of ray-casts to
+  be used as observation.
-Vector observations are best used for aspects of the environment which are numerical
-and non-visual. The Policy class calls the `CollectObservations(VectorSensor sensor)`
-method of each Agent. Your implementation of this function must call
-`VectorSensor.AddObservation` to add vector observations.
+Vector observations are best used for aspects of the environment which are
+numerical and non-visual. The Policy class calls the
+`CollectObservations(VectorSensor sensor)` method of each Agent. Your
+implementation of this function must call `VectorSensor.AddObservation` to add
+vector observations.
-information an agent needs to accomplish its task. Without sufficient and relevant
-information, an agent may learn poorly
-or may not learn at all. A reasonable approach for determining what information
-should be included is to consider what you would need to calculate an analytical
-solution to the problem, or what you would expect a human to be able to use to solve the problem.
+information an agent needs to accomplish its task. Without sufficient and
+relevant information, an agent may learn poorly or may not learn at all. A
+reasonable approach for determining what information should be included is to
+consider what you would need to calculate an analytical solution to the problem,
+or what you would expect a human to be able to use to solve the problem.
-ML-Agents SDK.  For instance, the 3DBall example uses the rotation of the
+ML-Agents SDK. For instance, the 3DBall example uses the rotation of the
 platform, the relative position of the ball, and the velocity of the ball as its
 state observation. As an experiment, you can remove the velocity components from
 the observation and retrain the 3DBall agent. While it will learn to balance the
 an agent's observations to a fixed subset. For example, instead of observing
 every enemy agent in an environment, you could only observe the closest five.

-When you set up an Agent's `Behavior Parameters` in the Unity Editor, set the following
-properties to use a vector observation:
+When you set up an Agent's `Behavior Parameters` in the Unity Editor, set the
+following properties to use a vector observation:
-* **Space Size** — The state size must match the length of your feature vector.
+- **Space Size** — The state size must match the length of your feature vector.
-The `VectorSensor.AddObservation` method provides a number of overloads for adding common types
-of data to your observation vector. You can add Integers and booleans directly to
-the observation vector, as well as some common Unity data types such as `Vector2`,
-`Vector3`, and `Quaternion`.
+The `VectorSensor.AddObservation` method provides a number of overloads for
+adding common types of data to your observation vector. You can add integers and
+booleans directly to the observation vector, as well as some common Unity data
+types such as `Vector2`, `Vector3`, and `Quaternion`.

 #### One-hot encoding categorical information

 }
 ```

-`VectorSensor` also provides a two-argument function `AddOneHotObservation()` as a shortcut for _one-hot_
-style observations. The following example is identical to the previous one.
+`VectorSensor` also provides a two-argument function `AddOneHotObservation()` as
+a shortcut for _one-hot_ style observations. The following example is identical
+to the previous one.

 ```csharp
 enum CarriedItems { Sword, Shield, Bow, LastItem }
 ```csharp
 normalizedValue = (currentValue - minValue)/(maxValue - minValue)
 ```
-:warning: For vectors, you should apply the above formula to each component (x, y, and z). Note that this is *not* the same as using the `Vector3.normalized` property or `Vector3.Normalize()` method in Unity (and similar for `Vector2`).
+
+:warning: For vectors, you should apply the above formula to each component (x,
+y, and z). Note that this is _not_ the same as using the `Vector3.normalized`
+property or `Vector3.Normalize()` method in Unity (and similar for `Vector2`).

 Rotations and angles should also be normalized. For angles between 0 and 360
 degrees, you can use the following formulas:

 #### Vector Observation Summary & Best Practices

-* Vector Observations should include all variables relevant for allowing the
-  agent to take the optimally informed decision, and ideally no extraneous information.
-* In cases where Vector Observations need to be remembered or compared over
-  time, either an LSTM (see [here](Feature-Memory.md)) should be used in the model, or the
-  `Stacked Vectors` value in the agent GameObject's `Behavior Parameters` should be changed.
-* Categorical variables such as type of object (Sword, Shield, Bow) should be
-  encoded in one-hot fashion (i.e. `3` -> `0, 0, 1`). This can be done automatically using the
-  `AddOneHotObservation()` method of the `VectorSensor`.
-* In general, all inputs should be normalized to be in
-  the range 0 to +1 (or -1 to 1). For example, the `x` position information of
-  an agent where the maximum possible value is `maxValue` should be recorded as
+- Vector Observations should include all variables relevant for allowing the
+  agent to take the optimally informed decision, and ideally no extraneous
+  information.
+- In cases where Vector Observations need to be remembered or compared over
+  time, either an LSTM (see [here](Feature-Memory.md)) should be used in the
+  model, or the `Stacked Vectors` value in the agent GameObject's
+  `Behavior Parameters` should be changed.
+- Categorical variables such as type of object (Sword, Shield, Bow) should be
+  encoded in one-hot fashion (i.e. `3` -> `0, 0, 1`). This can be done
+  automatically using the `AddOneHotObservation()` method of the `VectorSensor`.
+- In general, all inputs should be normalized to be in the range 0 to +1 (or -1
+  to 1). For example, the `x` position information of an agent where the maximum
+  possible value is `maxValue` should be recorded as
-* Positional information of relevant GameObjects should be encoded in relative
+- Positional information of relevant GameObjects should be encoded in relative
-
-Visual observations are generally provided to agent via either a `CameraSensor` or `RenderTextureSensor`.
-These collect image information and transforms it into a 3D Tensor which
-can be fed into the convolutional neural network (CNN) of the agent policy. For more information on
-CNNs, see [this guide](http://cs231n.github.io/convolutional-networks/). This allows agents
-to learn from spatial regularities in the observation images. It is possible to
-use visual and vector observations with the same agent.
+Visual observations are generally provided to an agent via either a
+`CameraSensor` or `RenderTextureSensor`. These collect image information and
+transform it into a 3D Tensor which can be fed into the convolutional neural
+network (CNN) of the agent policy. For more information on CNNs, see
+[this guide](http://cs231n.github.io/convolutional-networks/). This allows
+agents to learn from spatial regularities in the observation images. It is
+possible to use visual and vector observations with the same agent.
-used when it is not possible to properly define the problem using vector or ray-cast observations.
+used when it is not possible to properly define the problem using vector or
+ray-cast observations.
-Visual observations can be derived from Cameras or RenderTextures within your scene.
-To add a visual observation to an Agent, add either a Camera Sensor Component
-or RenderTextures Sensor Component to the Agent. Then drag the camera or
-render texture you want to add to the `Camera` or `RenderTexture` field.
-You can have more than one camera or render texture and even use a combination
-of both attached to an Agent. For each visual observation, set the width and height
-of the image (in pixels) and whether or not the observation is color or grayscale.
+Visual observations can be derived from Cameras or RenderTextures within your
+scene. To add a visual observation to an Agent, add either a Camera Sensor
+Component or RenderTextures Sensor Component to the Agent. Then drag the camera
+or render texture you want to add to the `Camera` or `RenderTexture` field. You
+can have more than one camera or render texture and even use a combination of
+both attached to an Agent. For each visual observation, set the width and height
+of the image (in pixels) and whether or not the observation is color or
+grayscale.

 ![Agent Camera](images/visual-observation.png)


-Each Agent that uses the same Policy must have the same number of visual observations,
-and they must all have the same resolutions (including whether or not they are grayscale).
-Additionally, each Sensor Component on an Agent must have a unique name so that they can
-be sorted deterministically (the name must be unique for that Agent, but multiple Agents can
-have a Sensor Component with the same name).
+Each Agent that uses the same Policy must have the same number of visual
+observations, and they must all have the same resolutions (including whether or
+not they are grayscale). Additionally, each Sensor Component on an Agent must
+have a unique name so that they can be sorted deterministically (the name must
+be unique for that Agent, but multiple Agents can have a Sensor Component with
+the same name).
-adding a `Canvas`, then adding a `Raw Image` with it's texture set to the Agent's
-`RenderTexture`. This will render the agent observation on the game screen.
+adding a `Canvas`, then adding a `Raw Image` with its texture set to the
+Agent's `RenderTexture`. This will render the agent observation on the game
+screen.
-The [GridWorld environment](Learning-Environment-Examples.md#gridworld)
-is an example on how to use a RenderTexture for both debugging and observation. Note
-that in this example, a Camera is rendered to a RenderTexture, which is then used for
-observations and debugging. To update the RenderTexture, the Camera must be asked to
-render every time a decision is requested within the game code. When using Cameras
-as observations directly, this is done automatically by the Agent.
+The [GridWorld environment](Learning-Environment-Examples.md#gridworld) is an
+example of how to use a RenderTexture for both debugging and observation. Note
+that in this example, a Camera is rendered to a RenderTexture, which is then
+used for observations and debugging. To update the RenderTexture, the Camera
+must be asked to render every time a decision is requested within the game code.
+When using Cameras as observations directly, this is done automatically by the
+Agent.
-* To collect visual observations, attach `CameraSensor` or `RenderTextureSensor`
+- To collect visual observations, attach `CameraSensor` or `RenderTextureSensor`
-* Visual observations should generally be used unless vector observations are not sufficient.
-* Image size should be kept as small as possible, without the loss of
-  needed details for decision making.
-* Images should be made greyscale in situations where color information is
-  not needed for making informed decisions.
+- Visual observations should generally only be used when vector observations
+  are not sufficient.
+- Image size should be kept as small as possible, without the loss of needed
+  details for decision making.
+- Images should be made greyscale in situations where color information is not
+  needed for making informed decisions.
-This can be easily implemented by adding a
-`RayPerceptionSensorComponent3D` (or `RayPerceptionSensorComponent2D`) to the Agent GameObject.
+This can be easily implemented by adding a `RayPerceptionSensorComponent3D` (or
+`RayPerceptionSensorComponent2D`) to the Agent GameObject.
-During observations, several rays (or spheres, depending on settings) are cast into
-the physics world, and the objects that are hit determine the observation vector that
-is produced.
+During observations, several rays (or spheres, depending on settings) are cast
+into the physics world, and the objects that are hit determine the observation
+vector that is produced.
- * _Detectable Tags_ A list of strings corresponding to the types of objects that the
- Agent should be able to distinguish between. For example, in the WallJump example,
- we use "wall", "goal", and "block" as the list of objects to detect.
- * _Rays Per Direction_ Determines the number of rays that are cast. One ray is
+
+- _Detectable Tags_ A list of strings corresponding to the types of objects that
+  the Agent should be able to distinguish between. For example, in the WallJump
+  example, we use "wall", "goal", and "block" as the list of objects to detect.
+- _Rays Per Direction_ Determines the number of rays that are cast. One ray is
- * _Max Ray Degrees_ The angle (in degrees) for the outermost rays. 90 degrees
+- _Max Ray Degrees_ The angle (in degrees) for the outermost rays. 90 degrees
- * _Sphere Cast Radius_ The size of the sphere used for sphere casting. If set
-  to 0, rays will be used instead of spheres. Rays may be more efficient,
+- _Sphere Cast Radius_ The size of the sphere used for sphere casting. If set to
+  0, rays will be used instead of spheres. Rays may be more efficient,
- * _Ray Length_ The length of the casts
- * _Observation Stacks_ The number of previous results to "stack" with the cast
-  results. Note that this can be independent of the "Stacked Vectors" setting
-  in `Behavior Parameters`.
- * _Start Vertical Offset_ (3D only) The vertical offset of the ray start point.
- * _End Vertical Offset_ (3D only) The vertical offset of the ray end point.
+- _Ray Length_ The length of the casts
+- _Observation Stacks_ The number of previous results to "stack" with the cast
+  results. Note that this can be independent of the "Stacked Vectors" setting in
+  `Behavior Parameters`.
+- _Start Vertical Offset_ (3D only) The vertical offset of the ray start point.
+- _End Vertical Offset_ (3D only) The vertical offset of the ray end point.
-Both use 3 Rays Per Direction and 90 Max Ray Degrees. One of the components
-had a vertical offset, so the Agent can tell whether it's clear to jump over
-the wall.
+Both use 3 Rays Per Direction and 90 Max Ray Degrees. One of the components had
+a vertical offset, so the Agent can tell whether it's clear to jump over the
+wall.
+
+
 so the number of rays and tags should be kept as small as possible to reduce the
 amount of data used. Note that this is separate from the State Size defined in
 `Behavior Parameters`, so you don't need to worry about the formula above when

-* Attach `RayPerceptionSensorComponent3D` or `RayPerceptionSensorComponent2D` to use.
-* This observation type is best used when there is relevant spatial information
+- Attach `RayPerceptionSensorComponent3D` or `RayPerceptionSensorComponent2D` to
+  use.
+- This observation type is best used when there is relevant spatial information
-* Use as few rays and tags as necessary to solve the problem in order to improve learning stability and agent performance.
+- Use as few rays and tags as necessary to solve the problem in order to improve
+  learning stability and agent performance.
-agent's `OnActionReceived()` function. Actions for an agent can take one of two forms, either **Continuous** or **Discrete**.
+agent's `OnActionReceived()` function. Actions for an agent can take one of two
+forms, either **Continuous** or **Discrete**.
-When you specify that the vector action space
-is **Continuous**, the action parameter passed to the Agent is an array of
-floating point numbers with length equal to the `Vector Action Space Size` property.
-When you specify a **Discrete** vector action space type, the action parameter
-is an array containing integers. Each integer is an index into a list or table
-of commands. In the **Discrete** vector action space type, the action parameter
-is an array of indices. The number of indices in the array is determined by the
-number of branches defined in the `Branches Size` property. Each branch
-corresponds to an action table, you can specify the size of each table by
-modifying the `Branches` property.
+When you specify that the vector action space is **Continuous**, the action
+parameter passed to the Agent is an array of floating point numbers with length
+equal to the `Vector Action Space Size` property. When you specify a
+**Discrete** vector action space type, the action parameter is an array
+containing integers. Each integer is an index into a list or table of commands.
+In the **Discrete** vector action space type, the action parameter is an array
+of indices. The number of indices in the array is determined by the number of
+branches defined in the `Branches Size` property. Each branch corresponds to an
+action table; you can specify the size of each table by modifying the `Branches`
+property.
-Neither the Policy nor the training algorithm know anything about what the action
-values themselves mean. The training algorithm simply tries different values for
-the action list and observes the affect on the accumulated rewards over time and
-many training episodes. Thus, the only place actions are defined for an Agent is
-in the `OnActionReceived()` function.
+Neither the Policy nor the training algorithm knows anything about what the
+action values themselves mean. The training algorithm simply tries different
+values for the action list and observes the effect on the accumulated rewards
+over time and many training episodes. Thus, the only place actions are defined
+for an Agent is in the `OnActionReceived()` function.

 For example, if you designed an agent to move in two dimensions, you could use
 either continuous or the discrete vector actions. In the continuous case, you
 with values ranging from zero to one.

 Note that when you are programming actions for an agent, it is often helpful to
-test your action logic using the `Heuristic()` method of the Agent,
-which lets you map keyboard commands to actions.
+test your action logic using the `Heuristic()` method of the Agent, which lets
+you map keyboard commands to actions.

 The [3DBall](Learning-Environment-Examples.md#3dball-3d-balance-ball) and
 [Area](Learning-Environment-Examples.md#push-block) example environments are set

 When an Agent uses a Policy set to the **Continuous** vector action space, the
-action parameter passed to the Agent's `OnActionReceived()` function is an array with
-length equal to the `Vector Action Space Size` property value.
-The individual values in the array have whatever meanings that you ascribe to
-them. If you assign an element in the array as the speed of an Agent, for
-example, the training process learns to control the speed of the Agent through
-this parameter.
+action parameter passed to the Agent's `OnActionReceived()` function is an array
+with length equal to the `Vector Action Space Size` property value. The
+individual values in the array have whatever meanings that you ascribe to them.
+If you assign an element in the array as the speed of an Agent, for example, the
+training process learns to control the speed of the Agent through this
+parameter.

 The [Reacher example](Learning-Environment-Examples.md#reacher) defines a
 continuous action space with four control values.

 ### Discrete Action Space

-When an Agent uses a  **Discrete** vector action space, the
-action parameter passed to the Agent's `OnActionReceived()` function is an array
-containing indices. With the discrete vector action space, `Branches` is an
-array of integers, each value corresponds to the number of possibilities for
-each branch.
+When an Agent uses a **Discrete** vector action space, the action parameter
+passed to the Agent's `OnActionReceived()` function is an array containing
+indices. With the discrete vector action space, `Branches` is an array of
+integers; each value corresponds to the number of possibilities for each branch.
-agent be able to move __and__ jump concurrently. We define the first branch to
+agent be able to move **and** jump concurrently. We define the first branch to
 have 5 possible actions (don't move, go left, go right, go backward, go forward)
 and the second one to have 2 possible actions (don't jump, jump). The
 `OnActionReceived()` method would look something like:
 #### Masking Discrete Actions

 When using Discrete Actions, it is possible to specify that some actions are
-impossible for the next decision. When the Agent is controlled by a
-neural network, the Agent will be unable to perform the specified action. Note
-that when the Agent is controlled by its Heuristic, the Agent will
-still be able to decide to perform the masked action. In order to mask an
-action,  override the `Agent.CollectDiscreteActionMasks()` virtual method,
-and call `DiscreteActionMasker.SetMask()` in it:
+impossible for the next decision. When the Agent is controlled by a neural
+network, the Agent will be unable to perform the specified action. Note that
+when the Agent is controlled by its Heuristic, the Agent will still be able to
+decide to perform the masked action. In order to mask an action, override the
+`Agent.CollectDiscreteActionMasks()` virtual method, and call
+`DiscreteActionMasker.SetMask()` in it:

 ```csharp
 public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker){

 Where:

-* `branch` is the index (starting at 0) of the branch on which you want to mask
+- `branch` is the index (starting at 0) of the branch on which you want to mask
-* `actionIndices` is a list of `int` corresponding to the
-  indices of the actions that the Agent cannot perform.
+- `actionIndices` is a list of `int` corresponding to the indices of the actions
+  that the Agent cannot perform.

 For example, if you have an Agent with 2 branches and on the first branch
 (branch 0) there are 4 possible actions : _"do nothing"_, _"jump"_, _"shoot"_

 Notes:

-* You can call `SetMask` multiple times if you want to put masks on
-  multiple branches.
-* You cannot mask all the actions of a branch.
-* You cannot mask actions in continuous control.
+- You can call `SetMask` multiple times if you want to put masks on multiple
+  branches.
+- You cannot mask all the actions of a branch.
+- You cannot mask actions in continuous control.
-### Actions Summary &  Best Practices
+### Actions Summary & Best Practices
-* Actions can either use `Discrete` or `Continuous` spaces.
-* When using `Discrete` it is possible to assign multiple action branches, and to mask certain actions.
-* In general, smaller action spaces will make for easier learning.
-* Be sure to set the Vector Action's Space Size to the number of used Vector
+- Actions can either use `Discrete` or `Continuous` spaces.
+- When using `Discrete` it is possible to assign multiple action branches, and
+  to mask certain actions.
+- In general, smaller action spaces will make for easier learning.
+- Be sure to set the Vector Action's Space Size to the number of used Vector
-* When using continuous control, action values should be clipped to an
+- When using continuous control, action values should be clipped to an
-

 ## Rewards

 reward over time. The better your reward mechanism, the better your agent will
 learn.

-**Note:** Rewards are not used during inference by an Agent using a
-trained model and is also not used during imitation learning.
+**Note:** Rewards are not used during inference by an Agent using a trained
+model and are also not used during imitation learning.
-the desired results. You can even use the
-Agent's Heuristic to control the Agent while watching how it accumulates rewards.
+the desired results. You can even use the Agent's Heuristic to control the Agent
+while watching how it accumulates rewards.
-Allocate rewards to an Agent by calling the `AddReward()` or `SetReward()` methods on the agent.
-The reward assigned between each decision
-should be in the range [-1,1]. Values outside this range can lead to
-unstable training. The `reward` value is reset to zero when the agent receives a
-new decision. If there are multiple calls to `AddReward()` for a single agent
-decision, the rewards will be summed together to evaluate how good the previous
-decision was. The `SetReward()` will override all
-previous rewards given to an agent since the previous decision.
+Allocate rewards to an Agent by calling the `AddReward()` or `SetReward()`
+methods on the agent. The reward assigned between each decision should be in the
+range [-1,1]. Values outside this range can lead to unstable training. The
+`reward` value is reset to zero when the agent receives a new decision. If there
+are multiple calls to `AddReward()` for a single agent decision, the rewards
+will be summed together to evaluate how good the previous decision was. The
+`SetReward()` will override all previous rewards given to an agent since the
+previous decision.
-You can examine the `OnActionReceived()` functions defined in the [example
-environments](Learning-Environment-Examples.md) to see how those projects
-allocate rewards.
+You can examine the `OnActionReceived()` functions defined in the
+[example environments](Learning-Environment-Examples.md) to see how those
+projects allocate rewards.
-The `GridAgent` class in the [GridWorld
-example](Learning-Environment-Examples.md#gridworld) uses a very simple reward
-system:
+The `GridAgent` class in the
+[GridWorld example](Learning-Environment-Examples.md#gridworld) uses a very
+simple reward system:

 ```csharp
 Collider[] hitObjects = Physics.OverlapBox(trueAgent.transform.position,
 example of a _sparse_ reward system. The agent must explore a lot to find the
 infrequent reward.

-In contrast, the `AreaAgent` in the [Area
-example](Learning-Environment-Examples.md#push-block) gets a small negative
-reward every step. In order to get the maximum reward, the agent must finish its
-task of reaching the goal square as quickly as possible:
+In contrast, the `AreaAgent` in the
+[Area example](Learning-Environment-Examples.md#push-block) gets a small
+negative reward every step. In order to get the maximum reward, the agent must
+finish its task of reaching the goal square as quickly as possible:

 ```csharp
 AddReward( -0.005f);
 The `Ball3DAgent` also assigns a negative penalty when the ball falls off the
 platform.

-Note that all of these environments make use of the `EndEpisode()` method, which manually
-terminates an episode when a termination condition is reached. This can be
-called independently of the `Max Step` property.
+Note that all of these environments make use of the `EndEpisode()` method, which
+manually terminates an episode when a termination condition is reached. This can
+be called independently of the `Max Step` property.
-* Use `AddReward()` to accumulate rewards between decisions. Use `SetReward()`
+- Use `AddReward()` to accumulate rewards between decisions. Use `SetReward()`
-* The magnitude of any given reward should typically not be greater than 1.0 in
+- The magnitude of any given reward should typically not be greater than 1.0 in
-* Positive rewards are often more helpful to shaping the desired behavior of an
-  agent than negative rewards. Excessive negative rewards can result in the agent
-  failing to learn any meaningful behavior.
-* For locomotion tasks, a small positive reward (+0.1) for forward velocity is
+- Positive rewards are often more helpful to shaping the desired behavior of an
+  agent than negative rewards. Excessive negative rewards can result in the
+  agent failing to learn any meaningful behavior.
+- For locomotion tasks, a small positive reward (+0.1) for forward velocity is
-* If you want the agent to finish a task quickly, it is often helpful to provide
+- If you want the agent to finish a task quickly, it is often helpful to provide
-  episode by calling `EndEpisode()` on the agent when it has accomplished its goal.
+  episode by calling `EndEpisode()` on the agent when it has accomplished its
+  goal.
-* `Behavior Parameters` - The parameters dictating what Policy the Agent will
-receive.
-  * `Behavior Name` - The identifier for the behavior. Agents with the same behavior name
-  will learn the same policy. If you're using [curriculum learning](Training-Curriculum-Learning.md),
-   this is used as the top-level key in the config.
-  * `Vector Observation`
-    * `Space Size` - Length of vector observation for the Agent.
-    * `Stacked Vectors` - The number of previous vector observations that will
+- `Behavior Parameters` - The parameters dictating what Policy the Agent will
+  receive.
+  - `Behavior Name` - The identifier for the behavior. Agents with the same
+    behavior name will learn the same policy. If you're using
+    [curriculum learning](Training-Curriculum-Learning.md), this is used as the
+    top-level key in the config.
+  - `Vector Observation`
+    - `Space Size` - Length of vector observation for the Agent.
+    - `Stacked Vectors` - The number of previous vector observations that will
-  * `Vector Action`
-    * `Space Type` - Corresponds to whether action vector contains a single
+  - `Vector Action`
+    - `Space Type` - Corresponds to whether action vector contains a single
-    * `Space Size` (Continuous) - Length of action vector.
-    * `Branches` (Discrete) - An array of integers, defines multiple concurrent
+    - `Space Size` (Continuous) - Length of action vector.
+    - `Branches` (Discrete) - An array of integers, defines multiple concurrent
-  * `Model` - The neural network model used for inference (obtained after
-  training)
-  * `Inference Device` - Whether to use CPU or GPU to run the model during inference
-  * `Behavior Type` - Determines whether the Agent will do training, inference, or use its
-  Heuristic() method:
-    * `Default` - the Agent will train if they connect to a python trainer, otherwise they will perform inference.
-    * `Heuristic Only` - the Agent will always use the `Heuristic()` method.
-    * `Inference Only` - the Agent will always perform inference.
-  * `Team ID` - Used to define the team for [self-play](Training-Self-Play.md)
-  * `Use Child Sensors` - Whether to use all Sensor components attached to child GameObjects of this Agent.
-* `Max Step` - The per-agent maximum number of steps. Once this number is
+  - `Model` - The neural network model used for inference (obtained after
+    training)
+  - `Inference Device` - Whether to use CPU or GPU to run the model during
+    inference
+  - `Behavior Type` - Determines whether the Agent will do training, inference,
+    or use its Heuristic() method:
+    - `Default` - the Agent will train if it connects to a Python trainer;
+      otherwise it will perform inference.
+    - `Heuristic Only` - the Agent will always use the `Heuristic()` method.
+    - `Inference Only` - the Agent will always perform inference.
+  - `Team ID` - Used to define the team for [self-play](Training-Self-Play.md)
+  - `Use Child Sensors` - Whether to use all Sensor components attached to child
+    GameObjects of this Agent.
+- `Max Step` - The per-agent maximum number of steps. Once this number is
  reached, the Agent will be reset.

 ## Monitoring Agents

 ## Destroying an Agent

-You can destroy an Agent GameObject during the simulation. Make sure that there is
-always at least one Agent training at all times by either spawning a new Agent
-every time one is destroyed or by re-spawning new Agents when the whole environment
-resets.
+You can destroy an Agent GameObject during the simulation. Make sure that there
+is at least one Agent training at all times by either spawning a new
+Agent every time one is destroyed or by re-spawning new Agents when the whole
+environment resets.
--- a/docs/Learning-Environment-Examples.md
+++ b/docs/Learning-Environment-Examples.md
 # Example Learning Environments

-The Unity ML-Agents toolkit contains an expanding set of example environments
-which demonstrate various features of the platform. Environments are located in
-`Project/Assets/ML-Agents/Examples` and summarized below. Additionally, our
+The Unity ML-Agents Toolkit includes an expanding set of example environments
+that highlight the various features of the toolkit. These environments can also
+serve as templates for new environments or as ways to test new ML algorithms.
+Environments are located in `Project/Assets/ML-Agents/Examples` and summarized
+below. Additionally, our
-This page only overviews the example environments we provide. To learn more on
-how to design and build your own environments see our [Making a New Learning
-Environment](Learning-Environment-Create-New.md) page.
+For the environments that highlight specific features of the toolkit, we provide
+the pre-trained model files and the training config file that enables you to
+train the scene yourself. The environments that are designed to serve as
+challenges for researchers do not have accompanying pre-trained model files or
+training configs and are marked as _Optional_ below.
-Note: Environment scenes marked as _optional_ do not have accompanying
-pre-trained model files, and are designed to serve as challenges for
-researchers.
-
-If you would like to contribute environments, please see our
+This page only overviews the example environments we provide. To learn more on
+how to design and build your own environments see our
+[Making a New Learning Environment](Learning-Environment-Create-New.md) page. If
+you would like to contribute environments, please see our
 [contribution guidelines](../com.unity.ml-agents/CONTRIBUTING.md) page.

 ## Basic
-* Set-up: A linear movement task where the agent must move left or right to
+- Set-up: A linear movement task where the agent must move left or right to
-* Goal: Move to the most reward state.
-* Agents: The environment contains one agent.
-* Agent Reward Function:
-  * -0.01 at each step
-  * +0.1 for arriving at suboptimal state.
-  * +1.0 for arriving at optimal state.
-* Behavior Parameters:
-  * Vector Observation space: One variable corresponding to current state.
-  * Vector Action space: (Discrete) Two possible actions (Move left, move
+- Goal: Move to the state with the highest reward.
+- Agents: The environment contains one agent.
+- Agent Reward Function:
+  - -0.01 at each step
+  - +0.1 for arriving at suboptimal state.
+  - +1.0 for arriving at optimal state.
+- Behavior Parameters:
+  - Vector Observation space: One variable corresponding to current state.
+  - Vector Action space: (Discrete) Two possible actions (Move left, move
-  * Visual Observations: None
-* Float Properties: None
-* Benchmark Mean Reward: 0.93
+  - Visual Observations: None
+- Float Properties: None
+- Benchmark Mean Reward: 0.93
-* Set-up: A balance-ball task, where the agent balances the ball on it's head.
-* Goal: The agent must balance the ball on it's head for as long as possible.
-* Agents: The environment contains 12 agents of the same kind, all using the
+- Set-up: A balance-ball task, where the agent balances the ball on its head.
+- Goal: The agent must balance the ball on its head for as long as possible.
+- Agents: The environment contains 12 agents of the same kind, all using the
-* Agent Reward Function:
-  * +0.1 for every step the ball remains on it's head.
-  * -1.0 if the ball falls off.
-* Behavior Parameters:
-  * Vector Observation space: 8 variables corresponding to rotation of the agent cube,
-    and position and velocity of ball.
-  * Vector Observation space (Hard Version): 5 variables corresponding to
+- Agent Reward Function:
+  - +0.1 for every step the ball remains on its head.
+  - -1.0 if the ball falls off.
+- Behavior Parameters:
+  - Vector Observation space: 8 variables corresponding to rotation of the agent
+    cube, and position and velocity of ball.
+  - Vector Observation space (Hard Version): 5 variables corresponding to
-  * Vector Action space: (Continuous) Size of 2, with one value corresponding to
+  - Vector Action space: (Continuous) Size of 2, with one value corresponding to
-  * Visual Observations: None.
-* Float Properties: Three
-    * scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
-      * Default: 1
-      * Recommended Minimum: 0.2
-      * Recommended Maximum: 5
-    * gravity: Magnitude of gravity
-      * Default: 9.81
-      * Recommended Minimum: 4
-      * Recommended Maximum: 105
-    * mass: Specifies mass of the ball
-      * Default: 1
-      * Recommended Minimum: 0.1
-      * Recommended Maximum: 20
-* Benchmark Mean Reward: 100
+  - Visual Observations: None.
+- Float Properties: Three
+  - scale: Specifies the scale of the ball in the 3 dimensions (equal across the
+    three dimensions)
+    - Default: 1
+    - Recommended Minimum: 0.2
+    - Recommended Maximum: 5
+  - gravity: Magnitude of gravity
+    - Default: 9.81
+    - Recommended Minimum: 4
+    - Recommended Maximum: 105
+  - mass: Specifies mass of the ball
+    - Default: 1
+    - Recommended Minimum: 0.1
+    - Recommended Maximum: 20
+- Benchmark Mean Reward: 100
-* Set-up: A version of the classic grid-world task. Scene contains agent, goal,
+- Set-up: A version of the classic grid-world task. Scene contains agent, goal,
-* Goal: The agent must navigate the grid to the goal while avoiding the
+- Goal: The agent must navigate the grid to the goal while avoiding the
-* Agents: The environment contains nine agents with the same Behavior Parameters.
-* Agent Reward Function:
-  * -0.01 for every step.
-  * +1.0 if the agent navigates to the goal position of the grid (episode ends).
-  * -1.0 if the agent navigates to an obstacle (episode ends).
-* Behavior Parameters:
-  * Vector Observation space: None
-  * Vector Action space: (Discrete) Size of 4, corresponding to movement in
+- Agents: The environment contains nine agents with the same Behavior
+  Parameters.
+- Agent Reward Function:
+  - -0.01 for every step.
+  - +1.0 if the agent navigates to the goal position of the grid (episode ends).
+  - -1.0 if the agent navigates to an obstacle (episode ends).
+- Behavior Parameters:
+  - Vector Observation space: None
+  - Vector Action space: (Discrete) Size of 4, corresponding to movement in
-    is turned on by default (this option can be toggled
-    using the `Mask Actions` checkbox within the `trueAgent` GameObject).
-    The trained model file provided was generated with action masking turned on.
-  * Visual Observations: One corresponding to top-down view of GridWorld.
-* Float Properties: Three, corresponding to grid size, number of obstacles, and
+    is turned on by default (this option can be toggled using the `Mask Actions`
+    checkbox within the `trueAgent` GameObject). The trained model file provided
+    was generated with action masking turned on.
+  - Visual Observations: One corresponding to top-down view of GridWorld.
+- Float Properties: Three, corresponding to grid size, number of obstacles, and
-* Benchmark Mean Reward: 0.8
+- Benchmark Mean Reward: 0.8
-* Set-up: Two-player game where agents control rackets to hit a ball over the
+- Set-up: Two-player game where agents control rackets to hit a ball over the
-* Goal: The agents must hit the ball so that the opponent cannot hit a valid
-return.
-* Agents: The environment contains two agent with same Behavior Parameters.
- After training you can set the `Behavior Type` to `Heuristic Only` on one of the Agent's
- Behavior Parameters to play against your trained model.
-* Agent Reward Function (independent):
-  * +1.0 To the agent that wins the point. An agent wins a point by preventing
-   the opponent from hitting a valid return.
-  * -1.0 To the agent who loses the point.
-* Behavior Parameters:
-  * Vector Observation space: 9 variables corresponding to position, velocity
+- Goal: The agents must hit the ball so that the opponent cannot hit a valid
+  return.
+- Agents: The environment contains two agents with the same Behavior Parameters.
+  After training you can set the `Behavior Type` to `Heuristic Only` on one of
+  the Agent's Behavior Parameters to play against your trained model.
+- Agent Reward Function (independent):
+  - +1.0 To the agent that wins the point. An agent wins a point by preventing
+    the opponent from hitting a valid return.
+  - -1.0 To the agent who loses the point.
+- Behavior Parameters:
+  - Vector Observation space: 9 variables corresponding to position, velocity
-  * Vector Action space: (Continuous) Size of 3, corresponding to movement
+  - Vector Action space: (Continuous) Size of 3, corresponding to movement
-  * Visual Observations: None
-* Float Properties: Three
-    * gravity: Magnitude of gravity
-      * Default: 9.81
-      * Recommended Minimum: 6
-      * Recommended Maximum: 20
-    * scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
-      * Default: .5
-      * Recommended Minimum: 0.2
-      * Recommended Maximum: 5
+  - Visual Observations: None
+- Float Properties: Three
+  - gravity: Magnitude of gravity
+    - Default: 9.81
+    - Recommended Minimum: 6
+    - Recommended Maximum: 20
+  - scale: Specifies the scale of the ball in the 3 dimensions (equal across the
+    three dimensions)
+    - Default: .5
+    - Recommended Minimum: 0.2
+    - Recommended Maximum: 5
-* Set-up: A platforming environment where the agent can push a block around.
-* Goal: The agent must push the block to the goal.
-* Agents: The environment contains one agent.
-* Agent Reward Function:
-  * -0.0025 for every step.
-  * +1.0 if the block touches the goal.
-* Behavior Parameters:
-  * Vector Observation space: (Continuous) 70 variables corresponding to 14
+- Set-up: A platforming environment where the agent can push a block around.
+- Goal: The agent must push the block to the goal.
+- Agents: The environment contains one agent.
+- Agent Reward Function:
+  - -0.0025 for every step.
+  - +1.0 if the block touches the goal.
+- Behavior Parameters:
+  - Vector Observation space: (Continuous) 70 variables corresponding to 14
-  * Vector Action space: (Discrete) Size of 6, corresponding to turn clockwise
+  - Vector Action space: (Discrete) Size of 6, corresponding to turn clockwise
-  * Visual Observations (Optional): One first-person camera. Use
-    `VisualPushBlock` scene. __The visual observation version of
-     this environment does not train with the provided default
-     training parameters.__
-* Float Properties: Four
-    * block_scale: Scale of the block along the x and z dimensions
-        * Default: 2
-        * Recommended Minimum: 0.5
-        * Recommended Maximum:  4
-    * dynamic_friction: Coefficient of friction for the ground material acting on moving objects
-        * Default: 0
-        * Recommended Minimum: 0
-        * Recommended Maximum: 1
-    * static_friction: Coefficient of friction for the ground material acting on stationary objects
-        * Default: 0
-        * Recommended Minimum: 0
-        * Recommended Maximum: 1
-    * block_drag: Effect of air resistance on block
-        * Default: 0.5
-        * Recommended Minimum: 0
-        * Recommended Maximum: 2000
-* Benchmark Mean Reward: 4.5
+  - Visual Observations (Optional): One first-person camera. Use
+    `VisualPushBlock` scene. **The visual observation version of this
+    environment does not train with the provided default training parameters.**
+- Float Properties: Four
+  - block_scale: Scale of the block along the x and z dimensions
+    - Default: 2
+    - Recommended Minimum: 0.5
+    - Recommended Maximum: 4
+  - dynamic_friction: Coefficient of friction for the ground material acting on
+    moving objects
+    - Default: 0
+    - Recommended Minimum: 0
+    - Recommended Maximum: 1
+  - static_friction: Coefficient of friction for the ground material acting on
+    stationary objects
+    - Default: 0
+    - Recommended Minimum: 0
+    - Recommended Maximum: 1
+  - block_drag: Effect of air resistance on block
+    - Default: 0.5
+    - Recommended Minimum: 0
+    - Recommended Maximum: 2000
+- Benchmark Mean Reward: 4.5
-* Set-up: A platforming environment where the agent can jump over a wall.
-* Goal: The agent must use the block to scale the wall and reach the goal.
-* Agents: The environment contains one agent linked to two different
-  Models. The Policy the agent is linked to changes depending on the
-  height of the wall. The change of Policy is done in the WallJumpAgent class.
-* Agent Reward Function:
-  * -0.0005 for every step.
-  * +1.0 if the agent touches the goal.
-  * -1.0 if the agent falls off the platform.
-* Behavior Parameters:
-  * Vector Observation space: Size of 74, corresponding to 14 ray casts each
+- Set-up: A platforming environment where the agent can jump over a wall.
+- Goal: The agent must use the block to scale the wall and reach the goal.
+- Agents: The environment contains one agent linked to two different Models. The
+  Policy the agent is linked to changes depending on the height of the wall. The
+  change of Policy is done in the WallJumpAgent class.
+- Agent Reward Function:
+  - -0.0005 for every step.
+  - +1.0 if the agent touches the goal.
+  - -1.0 if the agent falls off the platform.
+- Behavior Parameters:
+  - Vector Observation space: Size of 74, corresponding to 14 ray casts each
-  * Vector Action space: (Discrete) 4 Branches:
-    * Forward Motion (3 possible actions: Forward, Backwards, No Action)
-    * Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
-    * Side Motion (3 possible actions: Left, Right, No Action)
-    * Jump (2 possible actions: Jump, No Action)
-  * Visual Observations: None
-* Float Properties: Four
-* Benchmark Mean Reward (Big & Small Wall): 0.8
+  - Vector Action space: (Discrete) 4 Branches:
+    - Forward Motion (3 possible actions: Forward, Backwards, No Action)
+    - Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
+    - Side Motion (3 possible actions: Left, Right, No Action)
+    - Jump (2 possible actions: Jump, No Action)
+  - Visual Observations: None
+- Float Properties: Four
+- Benchmark Mean Reward (Big & Small Wall): 0.8
-* Set-up: Double-jointed arm which can move to target locations.
-* Goal: The agents must move its hand to the goal location, and keep it there.
-* Agents: The environment contains 10 agent with same Behavior Parameters.
-* Agent Reward Function (independent):
-  * +0.1 Each step agent's hand is in goal location.
-* Behavior Parameters:
-  * Vector Observation space: 26 variables corresponding to position, rotation,
+- Set-up: Double-jointed arm which can move to target locations.
+- Goal: The agent must move its hand to the goal location, and keep it there.
+- Agents: The environment contains 10 agents with the same Behavior Parameters.
+- Agent Reward Function (independent):
+  - +0.1 Each step agent's hand is in goal location.
+- Behavior Parameters:
+  - Vector Observation space: 26 variables corresponding to position, rotation,
-  * Vector Action space: (Continuous) Size of 4, corresponding to torque
+  - Vector Action space: (Continuous) Size of 4, corresponding to torque
-  * Visual Observations: None.
-* Float Properties: Five
-  * goal_size: radius of the goal zone
-    * Default: 5
-    * Recommended Minimum: 1
-    * Recommended Maximum: 10
-  * goal_speed: speed of the goal zone around the arm (in radians)
-    * Default: 1
-    * Recommended Minimum: 0.2
-    * Recommended Maximum: 4
-  * gravity
-    * Default: 9.81
-    * Recommended Minimum: 4
-    * Recommended Maximum: 20
-  * deviation: Magnitude of sinusoidal (cosine) deviation of the goal along the vertical dimension
-    * Default: 0
-    * Recommended Minimum: 0
-    * Recommended Maximum: 5
-  * deviation_freq: Frequency of the cosine deviation of the goal along the vertical dimension
-    * Default: 0
-    * Recommended Minimum: 0
-    * Recommended Maximum: 3
-* Benchmark Mean Reward: 30
+  - Visual Observations: None.
+- Float Properties: Five
+  - goal_size: radius of the goal zone
+    - Default: 5
+    - Recommended Minimum: 1
+    - Recommended Maximum: 10
+  - goal_speed: speed of the goal zone around the arm (in radians)
+    - Default: 1
+    - Recommended Minimum: 0.2
+    - Recommended Maximum: 4
+  - gravity
+    - Default: 9.81
+    - Recommended Minimum: 4
+    - Recommended Maximum: 20
+  - deviation: Magnitude of sinusoidal (cosine) deviation of the goal along the
+    vertical dimension
+    - Default: 0
+    - Recommended Minimum: 0
+    - Recommended Maximum: 5
+  - deviation_freq: Frequency of the cosine deviation of the goal along the
+    vertical dimension
+    - Default: 0
+    - Recommended Minimum: 0
+    - Recommended Maximum: 3
+- Benchmark Mean Reward: 30
-* Set-up: A creature with 4 arms and 4 forearms.
-* Goal: The agents must move its body toward the goal direction without falling.
-  * `CrawlerStaticTarget` - Goal direction is always forward.
-  * `CrawlerDynamicTarget`- Goal direction is randomized.
-* Agents: The environment contains 3 agent with same Behavior Parameters.
-* Agent Reward Function (independent):
-  * +0.03 times body velocity in the goal direction.
-  * +0.01 times body direction alignment with goal direction.
-* Behavior Parameters:
-  * Vector Observation space: 117 variables corresponding to position, rotation,
+- Set-up: A creature with 4 arms and 4 forearms.
+- Goal: The agent must move its body toward the goal direction without falling.
+  - `CrawlerStaticTarget` - Goal direction is always forward.
+  - `CrawlerDynamicTarget` - Goal direction is randomized.
+- Agents: The environment contains 3 agents with the same Behavior Parameters.
+- Agent Reward Function (independent):
+  - +0.03 times body velocity in the goal direction.
+  - +0.01 times body direction alignment with goal direction.
+- Behavior Parameters:
+  - Vector Observation space: 117 variables corresponding to position, rotation,
-  * Vector Action space: (Continuous) Size of 20, corresponding to target
+  - Vector Action space: (Continuous) Size of 20, corresponding to target
-  * Visual Observations: None
-* Float Properties: None
-* Benchmark Mean Reward for `CrawlerStaticTarget`: 2000
-* Benchmark Mean Reward for `CrawlerDynamicTarget`: 400
+  - Visual Observations: None
+- Float Properties: None
+- Benchmark Mean Reward for `CrawlerStaticTarget`: 2000
+- Benchmark Mean Reward for `CrawlerDynamicTarget`: 400
-* Set-up: A multi-agent environment where agents compete to collect food.
-* Goal: The agents must learn to collect as many green food spheres as possible
+- Set-up: A multi-agent environment where agents compete to collect food.
+- Goal: The agents must learn to collect as many green food spheres as possible
-* Agents: The environment contains 5 agents with same Behavior Parameters.
-* Agent Reward Function (independent):
-  * +1 for interaction with green spheres
-  * -1 for interaction with red spheres
-* Behavior Parameters:
-  * Vector Observation space: 53 corresponding to velocity of agent (2), whether
+- Agents: The environment contains 5 agents with same Behavior Parameters.
+- Agent Reward Function (independent):
+  - +1 for interaction with green spheres
+  - -1 for interaction with red spheres
+- Behavior Parameters:
+  - Vector Observation space: 53 corresponding to velocity of agent (2), whether
-  * Vector Action space: (Discrete) 4 Branches:
-    * Forward Motion (3 possible actions: Forward, Backwards, No Action)
-    * Side Motion (3 possible actions: Left, Right, No Action)
-    * Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
-    * Laser (2 possible actions: Laser, No Action)
-  * Visual Observations (Optional): First-person camera per-agent. Use
-    `VisualFoodCollector` scene. __The visual observation version of
-     this environment does not train with the provided default
-     training parameters.__
-* Float Properties: Two
-  * laser_length: Length of the laser used by the agent
-    * Default: 1
-    * Recommended Minimum: 0.2
-    * Recommended Maximum: 7
-  * agent_scale: Specifies the scale of the agent in the 3 dimensions (equal across the three dimensions)
-    * Default: 1
-    * Recommended Minimum: 0.5
-    * Recommended Maximum: 5
-* Benchmark Mean Reward: 10
+  - Vector Action space: (Discrete) 4 Branches:
+    - Forward Motion (3 possible actions: Forward, Backwards, No Action)
+    - Side Motion (3 possible actions: Left, Right, No Action)
+    - Rotation (3 possible actions: Rotate Left, Rotate Right, No Action)
+    - Laser (2 possible actions: Laser, No Action)
+  - Visual Observations (Optional): First-person camera per-agent. Use
+    `VisualFoodCollector` scene. **The visual observation version of this
+    environment does not train with the provided default training parameters.**
+- Float Properties: Two
+  - laser_length: Length of the laser used by the agent
+    - Default: 1
+    - Recommended Minimum: 0.2
+    - Recommended Maximum: 7
+  - agent_scale: Specifies the scale of the agent in the 3 dimensions (equal
+    across the three dimensions)
+    - Default: 1
+    - Recommended Minimum: 0.5
+    - Recommended Maximum: 5
+- Benchmark Mean Reward: 10
-* Set-up: Environment where the agent needs to find information in a room,
+- Set-up: Environment where the agent needs to find information in a room,
-* Goal: Move to the goal which corresponds to the color of the block in the
+- Goal: Move to the goal which corresponds to the color of the block in the
-* Agents: The environment contains one agent.
-* Agent Reward Function (independent):
-  * +1 For moving to correct goal.
-  * -0.1 For moving to incorrect goal.
-  * -0.0003 Existential penalty.
-* Behavior Parameters:
-  * Vector Observation space: 30 corresponding to local ray-casts detecting
+- Agents: The environment contains one agent.
+- Agent Reward Function (independent):
+  - +1 For moving to correct goal.
+  - -0.1 For moving to incorrect goal.
+  - -0.0003 Existential penalty.
+- Behavior Parameters:
+  - Vector Observation space: 30 corresponding to local ray-casts detecting
-  * Vector Action space: (Discrete) 1 Branch, 4 actions corresponding to agent
+  - Vector Action space: (Discrete) 1 Branch, 4 actions corresponding to agent
-  * Visual Observations (Optional): First-person view for the agent. Use
-    `VisualHallway` scene. __The visual observation version of
-     this environment does not train with the provided default
-     training parameters.__
-* Float Properties: None
-* Benchmark Mean Reward: 0.7
-  * To speed up training, you can enable curiosity by adding the `curiosity` reward signal in `config/trainer_config.yaml`
+  - Visual Observations (Optional): First-person view for the agent. Use
+    `VisualHallway` scene. **The visual observation version of this environment
+    does not train with the provided default training parameters.**
+- Float Properties: None
+- Benchmark Mean Reward: 0.7
+  - To speed up training, you can enable curiosity by adding the `curiosity`
+    reward signal in `config/trainer_config.yaml`
-* Set-up: Environment where the agent needs on-demand decision making. The agent
+- Set-up: Environment where the agent needs on-demand decision making. The agent
-* Goal: Catch the floating green cube. Only has a limited number of jumps.
-* Agents: The environment contains one agent.
-* Agent Reward Function (independent):
-  * +1 For catching the green cube.
-  * -1 For bouncing out of bounds.
-  * -0.05 Times the action squared. Energy expenditure penalty.
-* Behavior Parameters:
-  * Vector Observation space: 6 corresponding to local position of agent and
+- Goal: Catch the floating green cube. Only has a limited number of jumps.
+- Agents: The environment contains one agent.
+- Agent Reward Function (independent):
+  - +1 For catching the green cube.
+  - -1 For bouncing out of bounds.
+  - -0.05 Times the action squared. Energy expenditure penalty.
+- Behavior Parameters:
+  - Vector Observation space: 6 corresponding to local position of agent and
-  * Vector Action space: (Continuous) 3 corresponding to agent force applied for
+  - Vector Action space: (Continuous) 3 corresponding to agent force applied for
-  * Visual Observations: None
-* Float Properties: Two
-    * target_scale: The scale of the green cube in the 3 dimensions
-        * Default: 150
-        * Recommended Minimum: 50
-        * Recommended Maximum: 250
-* Benchmark Mean Reward: 10
+  - Visual Observations: None
+- Float Properties: Two
+  - target_scale: The scale of the green cube in the 3 dimensions
+    - Default: 150
+    - Recommended Minimum: 50
+    - Recommended Maximum: 250
+- Benchmark Mean Reward: 10
-* Set-up: Environment where four agents compete in a 2 vs 2 toy soccer game.
-* Goal:
-  * Get the ball into the opponent's goal while preventing
-  the ball from entering own goal.
-* Agents: The environment contains four agents, with the same
-  Behavior Parameters : Soccer.
-* Agent Reward Function (dependent):
-    * +1 When ball enters opponent's goal.
-    * -1 When ball enters team's goal.
-    * -0.001 Existential penalty.
-* Behavior Parameters:
-  * Vector Observation space: 336 corresponding to 11 ray-casts forward distributed over 120 degrees (264)
-    and 3 ray-casts backward distributed over 90 degrees each detecting 6 possible object types, along with the object's distance.
-    The forward ray-casts contribute 264 state dimensions and backward 72 state dimensions.
-  * Vector Action space: (Discrete) Three branched actions corresponding to forward, backward, sideways movement,
-      as well as rotation.
-  * Visual Observations: None
-* Float Properties: Two
-  * ball_scale: Specifies the scale of the ball in the 3 dimensions (equal across the three dimensions)
-    * Default: 7.5
-    * Recommended minimum: 4
-    * Recommended maximum: 10
-  * gravity: Magnitude of the gravity
-    * Default: 9.81
-    * Recommended minimum: 6
-    * Recommended maximum: 20
+- Set-up: Environment where four agents compete in a 2 vs 2 toy soccer game.
+- Goal:
+  - Get the ball into the opponent's goal while preventing the ball from
+    entering own goal.
+- Agents: The environment contains four agents, with the same Behavior
+  Parameters: Soccer.
+- Agent Reward Function (dependent):
+  - +1 When ball enters opponent's goal.
+  - -1 When ball enters team's goal.
+  - -0.001 Existential penalty.
+- Behavior Parameters:
+  - Vector Observation space: 336 corresponding to 11 ray-casts forward
+    distributed over 120 degrees (264) and 3 ray-casts backward distributed over
+    90 degrees each detecting 6 possible object types, along with the object's
+    distance. The forward ray-casts contribute 264 state dimensions and backward
+    72 state dimensions.
+  - Vector Action space: (Discrete) Three branched actions corresponding to
+    forward, backward, sideways movement, as well as rotation.
+  - Visual Observations: None
+- Float Properties: Two
+  - ball_scale: Specifies the scale of the ball in the 3 dimensions (equal
+    across the three dimensions)
+    - Default: 7.5
+    - Recommended minimum: 4
+    - Recommended maximum: 10
+  - gravity: Magnitude of the gravity
+    - Default: 9.81
+    - Recommended minimum: 6
+    - Recommended maximum: 20
-* Set-up: Physics-based Humanoids agents with 26 degrees of freedom. These DOFs
+- Set-up: Physics-based Humanoid agents with 26 degrees of freedom. These DOFs
-* Goal: The agents must move its body toward the goal direction as quickly as
+- Goal: The agent must move its body toward the goal direction as quickly as
-* Agents: The environment contains 11 independent agents with same Behavior Parameters.
-* Agent Reward Function (independent):
-  * +0.03 times body velocity in the goal direction.
-  * +0.01 times head y position.
-  * +0.01 times body direction alignment with goal direction.
-  * -0.01 times head velocity difference from body velocity.
-* Behavior Parameters:
-  * Vector Observation space: 215 variables corresponding to position, rotation,
+- Agents: The environment contains 11 independent agents with the same Behavior
+  Parameters.
+- Agent Reward Function (independent):
+  - +0.03 times body velocity in the goal direction.
+  - +0.01 times head y position.
+  - +0.01 times body direction alignment with goal direction.
+  - -0.01 times head velocity difference from body velocity.
+- Behavior Parameters:
+  - Vector Observation space: 215 variables corresponding to position, rotation,
-  * Vector Action space: (Continuous) Size of 39, corresponding to target
+  - Vector Action space: (Continuous) Size of 39, corresponding to target
-  * Visual Observations: None
-* Float Properties: Four
-    * gravity: Magnitude of gravity
-        * Default: 9.81
-        * Recommended Minimum:
-        * Recommended Maximum:
-    * hip_mass: Mass of the hip component of the walker
-        * Default: 15
-        * Recommended Minimum: 7
-        * Recommended Maximum: 28
-    * chest_mass: Mass of the chest component of the walker
-        * Default: 8
-        * Recommended Minimum: 3
-        * Recommended Maximum: 20
-    * spine_mass: Mass of the spine component of the walker
-        * Default: 10
-        * Recommended Minimum: 3
-        * Recommended Maximum: 20
-* Benchmark Mean Reward: 1000
+  - Visual Observations: None
+- Float Properties: Four
+  - gravity: Magnitude of gravity
+    - Default: 9.81
+    - Recommended Minimum:
+    - Recommended Maximum:
+  - hip_mass: Mass of the hip component of the walker
+    - Default: 15
+    - Recommended Minimum: 7
+    - Recommended Maximum: 28
+  - chest_mass: Mass of the chest component of the walker
+    - Default: 8
+    - Recommended Minimum: 3
+    - Recommended Maximum: 20
+  - spine_mass: Mass of the spine component of the walker
+    - Default: 10
+    - Recommended Minimum: 3
+    - Recommended Maximum: 20
+- Benchmark Mean Reward: 1000
-* Set-up: Environment where the agent needs to press a button to spawn a
+- Set-up: Environment where the agent needs to press a button to spawn a
-* Goal: Move to the golden brick on top of the spawned pyramid.
-* Agents: The environment contains one agent.
-* Agent Reward Function (independent):
-  * +2 For moving to golden brick (minus 0.001 per step).
-* Behavior Parameters:
-  * Vector Observation space: 148 corresponding to local ray-casts detecting
+- Goal: Move to the golden brick on top of the spawned pyramid.
+- Agents: The environment contains one agent.
+- Agent Reward Function (independent):
+  - +2 For moving to golden brick (minus 0.001 per step).
+- Behavior Parameters:
+  - Vector Observation space: 148 corresponding to local ray-casts detecting
-  * Vector Action space: (Discrete) 4 corresponding to agent rotation and
+  - Vector Action space: (Discrete) 4 corresponding to agent rotation and
-  * Visual Observations (Optional): First-person camera per-agent. Us
-    `VisualPyramids` scene. __The visual observation version of
-     this environment does not train with the provided default
-     training parameters.__
-* Float Properties: None
-* Benchmark Mean Reward: 1.75
+  - Visual Observations (Optional): First-person camera per-agent. Use
+    `VisualPyramids` scene. **The visual observation version of this environment
+    does not train with the provided default training parameters.**
+- Float Properties: None
+- Benchmark Mean Reward: 1.75
--- a/docs/Learning-Environment-Executable.md
+++ b/docs/Learning-Environment-Executable.md
 Editor to interact with an environment. Using an executable has some advantages
 over using the Editor:

-* You can exchange executable with other people without having to share your
+- You can exchange executables with other people without having to share your
-* You can put your executable on a remote machine for faster training.
-* You can use `Headless` mode for faster training.
-* You can keep using the Unity Editor for other tasks while the agents are
+- You can put your executable on a remote machine for faster training.
+- You can use `Headless` mode for faster training.
+- You can keep using the Unity Editor for other tasks while the agents are
  training.

 ## Building the 3DBall environment

 1. Launch Unity.
-2. On the Projects dialog, choose the **Open** option at the top of the window.
-3. Using the file dialog that opens, locate the `Project` folder within the
+1. On the Projects dialog, choose the **Open** option at the top of the window.
+1. Using the file dialog that opens, locate the `Project` folder within the
-4. In the **Project** window, navigate to the folder
+1. In the **Project** window, navigate to the folder
-5. Double-click the `3DBall` file to load the scene containing the Balance Ball
+1. Double-click the `3DBall` file to load the scene containing the Balance Ball
   environment.

 ![3DBall Scene](images/mlagents-Open3DBall.png)

-* The environment application runs in the background.
-* No dialogs require interaction.
-* The correct scene loads automatically.
+- The environment application runs in the background.
+- No dialogs require interaction.
+- The correct scene loads automatically.
-2. Under **Resolution and Presentation**:
-   * Ensure that **Run in Background** is Checked.
-   * Ensure that **Display Resolution Dialog** is set to Disabled.
-3. Open the Build Settings window (menu:**File** > **Build Settings**).
-4. Choose your target platform.
-   * (optional) Select “Development Build” to [log debug
-      messages](https://docs.unity3d.com/Manual/LogFiles.html).
-5. If any scenes are shown in the **Scenes in Build** list, make sure that the
+1. Under **Resolution and Presentation**:
+   - Ensure that **Run in Background** is Checked.
+   - Ensure that **Display Resolution Dialog** is set to Disabled.
+1. Open the Build Settings window (menu:**File** > **Build Settings**).
+1. Choose your target platform.
+   - (optional) Select “Development Build” to
+     [log debug messages](https://docs.unity3d.com/Manual/LogFiles.html).
+1. If any scenes are shown in the **Scenes in Build** list, make sure that the
-6. Click **Build**:
-   * In the File dialog, navigate to your ML-Agents directory.
-   * Assign a file name and click **Save**.
-   * (For Windows）With Unity 2018.1, it will ask you to select a folder instead
+1. Click **Build**:
+   - In the File dialog, navigate to your ML-Agents directory.
+   - Assign a file name and click **Save**.
+   - (For Windows) With Unity 2018.1, it will ask you to select a folder instead
-     subfolder's name as `env_name`. You cannot create builds in the Assets folder
+     subfolder's name as `env_name`. You cannot create builds in the Assets
+     folder

 ![Build Window](images/mlagents-BuildWindow.png)

 ## Training the Environment

 1. Open a command or terminal window.
-2. Navigate to the folder where you installed the ML-Agents Toolkit. If you
+1. Navigate to the folder where you installed the ML-Agents Toolkit. If you
-3. Run
+1. Run
-   * `<trainer-config-file>` is the file path of the trainer configuration yaml
-   * `<env_name>` is the name and path to the executable you exported from Unity
+   - `<trainer-config-file>` is the file path of the trainer configuration yaml
+   - `<env_name>` is the name and path to the executable you exported from Unity
-   * `<run-identifier>` is a string used to separate the results of different
+   - `<run-identifier>` is a string used to separate the results of different
     training runs

 For example, if you are training with a 3DBall executable you exported to the
        use_curiosity:       False
        curiosity_strength:  0.01
        curiosity_enc_size:  128
-        model_path:	./models/first-run-0/Ball3DLearning
+        model_path: ./models/first-run-0/Ball3DLearning
 INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
 INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
 INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.
 ```

 You can press Ctrl+C to stop the training, and your trained model will be at
-`models/<run-identifier>/<behavior_name>.nn`, which corresponds
-to your model's latest checkpoint. (**Note:** There is a known bug on Windows
-that causes the saving of the model to fail when you early terminate the
-training, it's recommended to wait until Step has reached the max_steps
-parameter you set in trainer_config.yaml.) You can now embed this trained model
-into your Agent by following the steps below:
+`models/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
+latest checkpoint. (**Note:** There is a known bug on Windows that causes the
+saving of the model to fail when you terminate the training early; it's
+recommended to wait until Step has reached the max_steps parameter you set in
+trainer_config.yaml.) You can now embed this trained model into your Agent by
+following the steps below:
-2. Open the Unity Editor, and select the **3DBall** scene as described above.
-3. Select the **3DBall** prefab from the Project window and select **Agent**.
-5. Drag the `<behavior_name>.nn` file from the Project window of
-   the Editor to the **Model** placeholder in the **Ball3DAgent**
-   inspector window.
-6. Press the Play button at the top of the editor.
+1. Open the Unity Editor, and select the **3DBall** scene as described above.
+1. Select the **3DBall** prefab from the Project window and select **Agent**.
+1. Drag the `<behavior_name>.nn` file from the Project window of the Editor to
+   the **Model** placeholder in the **Ball3DAgent** inspector window.
+1. Press the :arrow_forward: button at the top of the editor.
--- a/docs/ML-Agents-Overview.md
+++ b/docs/ML-Agents-Overview.md
 complex behaviors by hand is challenging and prone to errors.

 With ML-Agents, it is possible to _train_ the behaviors of such NPCs (called
-**agents**) using a variety of methods. The basic idea is quite simple. We need
+**Agents**) using a variety of methods. The basic idea is quite simple. We need
 to define three entities at every moment of the game (called **environment**):

 - **Observations** - what the medic perceives about the environment.
 - **Agents** - which is attached to a Unity GameObject (any character within a
  scene) and handles generating its observations, performing the actions it
  receives and assigning a reward (positive / negative) when appropriate. Each
-  Agent is linked to a Policy.
+  Agent is linked to a Behavior.
-every character in the scene. While each Agent must be linked to a Policy, it is
+every character in the scene. While each Agent must be linked to a Behavior, it is
-the same Policy type. In our sample game, we have two teams each with their own medic.
+the same Behavior. In our sample game, we have two teams each with their own medic.
-but both of these medics can have the same Policy. Note that these two
-medics have the same Policy because their _space_ of observations and
-actions are similar. This does not mean that at each instance they will have
-identical observation and action _values_. In other words, the Policy defines the
-space of all possible observations and actions, while the Agents connected to it
-(in this case the medics) can each have their own, unique observation and action
-values. If we expanded our game to include tank driver NPCs, then the Agent
-attached to those characters cannot share a Policy with the Agent linked to the
+but both of these medics can have the same Behavior. Note that these two
+medics have the same Behavior. This does not mean that at each instance they will have
+identical observation and action _values_. If we expanded our game to include
+tank driver NPCs, then the Agent
+attached to those characters cannot share its Behavior with the Agent linked to the
 medics (medics and drivers have different actions).

 <p align="center">
 We have yet to discuss how the ML-Agents toolkit trains behaviors, and what role
 the Python API and External Communicator play. Before we dive into those
 details, let's summarize the earlier components. Each character is attached to
-an Agent, and each Agent has a Policy. The Policy receives observations
-and rewards from the Agent and returns actions. The Academy ensures that all the
+an Agent, and each Agent has a Behavior. The Behavior can be thought of as a function
+that receives observations
+and rewards from the Agent and returns actions. The Learning Environment through
+the Academy (not represented in the diagram) ensures that all the
+Note that in a single environment, there can be multiple Agents and multiple Behaviors
+at the same time. These Behaviors can communicate with Python through the communicator
+but can also use a pre-trained _Neural Network_ or a _Heuristic_. Note that it is also
+possible to communicate data with Python without using Agents through _Side Channels_.
+One example of using _Side Channels_ is to exchange data with Python about
+_Environment Parameters_. The following diagram illustrates the above.
+
+<p align="center">
+  <img src="images/learning_environment_full.png"
+       alt="More Complete Example ML-Agents Scene Block Diagram"
+       border="10" />
+</p>

 ## Training Modes

--- a/docs/Readme.md
+++ b/docs/Readme.md

 ## Installation & Set-up

-* [Installation](Installation.md)
-  * [Using Virtual Environment](Using-Virtual-Environment.md)
+- [Installation](Installation.md)
+  - [Using Virtual Environment](Using-Virtual-Environment.md)
-* [Getting Started Guide](Getting-Started.md)
-* [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
-  * [Background: Unity](Background-Unity.md)
-  * [Background: Machine Learning](Background-Machine-Learning.md)
-  * [Background: TensorFlow](Background-TensorFlow.md)
-* [Example Environments](Learning-Environment-Examples.md)
+- [Getting Started Guide](Getting-Started.md)
+- [ML-Agents Toolkit Overview](ML-Agents-Overview.md)
+  - [Background: Unity](Background-Unity.md)
+  - [Background: Machine Learning](Background-Machine-Learning.md)
+  - [Background: TensorFlow](Background-TensorFlow.md)
+- [Example Environments](Learning-Environment-Examples.md)
-* [Making a New Learning Environment](Learning-Environment-Create-New.md)
-* [Designing a Learning Environment](Learning-Environment-Design.md)
-* [Designing Agents](Learning-Environment-Design-Agents.md)
+- [Making a New Learning Environment](Learning-Environment-Create-New.md)
+- [Designing a Learning Environment](Learning-Environment-Design.md)
+- [Designing Agents](Learning-Environment-Design-Agents.md)
-  * [Using the Monitor](Feature-Monitor.md)
-  * [Using the Video Recorder](https://github.com/Unity-Technologies/video-recorder)
-  * [Using an Executable Environment](Learning-Environment-Executable.md)
-  * [Creating Custom Side Channels](Custom-SideChannels.md)
+
+- [Using the Monitor](Feature-Monitor.md)
+- [Using an Executable Environment](Learning-Environment-Executable.md)
-* [Training ML-Agents](Training-ML-Agents.md)
-* [Using TensorBoard to Observe Training](Using-Tensorboard.md)
-* [Training Using Concurrent Unity Instances](Training-Using-Concurrent-Unity-Instances.md)
-* [Training with Proximal Policy Optimization](Training-PPO.md)
-* [Training with Soft Actor-Critic](Training-SAC.md)
+- [Training ML-Agents](Training-ML-Agents.md)
+  - [Reward Signals](Reward-Signals.md)
+  - [Profiling Trainers](Profiling-Python.md)
+- [Using TensorBoard to Observe Training](Using-Tensorboard.md)
+- [Training Using Concurrent Unity Instances](Training-Using-Concurrent-Unity-Instances.md)
+- [Training with Proximal Policy Optimization](Training-PPO.md)
+- [Training with Soft Actor-Critic](Training-SAC.md)
+- [Training with Self-Play](Training-Self-Play.md)
-* [Training with Curriculum Learning](Training-Curriculum-Learning.md)
-* [Training with Imitation Learning](Training-Imitation-Learning.md)
-* [Training with LSTM](Feature-Memory.md)
-* [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
+- [Training with Curriculum Learning](Training-Curriculum-Learning.md)
+- [Training with Imitation Learning](Training-Imitation-Learning.md)
+- [Training with LSTM](Feature-Memory.md)
+- [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
-* [Unity Inference Engine](Unity-Inference-Engine.md)
+- [Unity Inference Engine](Unity-Inference-Engine.md)
+
+## Extending ML-Agents
+
+- [Creating Custom Side Channels](Custom-SideChannels.md)
-* [Migrating from earlier versions of ML-Agents](Migrating.md)
-* [Frequently Asked Questions](FAQ.md)
-* [ML-Agents Glossary](Glossary.md)
-* [Limitations](Limitations.md)
+- [Migrating from earlier versions of ML-Agents](Migrating.md)
+- [Frequently Asked Questions](FAQ.md)
+- [ML-Agents Glossary](Glossary.md)
+- [Limitations](Limitations.md)
-* [API Reference](API-Reference.md)
-* [How to use the Python API](Python-API.md)
-* [Wrapping Learning Environment as a Gym (+Baselines/Dopamine Integration)](../gym-unity/README.md)
+- [API Reference](API-Reference.md)
+- [How to use the Python API](Python-API.md)
+- [Wrapping Learning Environment as a Gym (+Baselines/Dopamine Integration)](../gym-unity/README.md)
-To make the Unity ML-Agents toolkit accessible to the global research and
-Unity developer communities, we're attempting to create and maintain
-translations of our documentation. We've started with translating a subset
-of the documentation to one language (Chinese), but we hope to continue
-translating more pages and to other languages. Consequently,
-we welcome any enhancements and improvements from the community.
+To make the Unity ML-Agents toolkit accessible to the global research and Unity
+developer communities, we're attempting to create and maintain translations of
+our documentation. We've started with translating a subset of the documentation
+to one language (Chinese), but we hope to continue translating more pages and to
+other languages. Consequently, we welcome any enhancements and improvements from
+the community.
-* [Chinese](localized/zh-CN/)
-* [Korean](localized/KR/)
+- [Chinese](localized/zh-CN/)
+- [Korean](localized/KR/)
-We no longer use them ourselves and so they may not be up-to-date.
-We've decided to keep them up just in case they are helpful to you.
+
+We no longer use them ourselves and so they may not be up-to-date. We've decided
+to keep them up just in case they are helpful to you.
-* [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md)
-* [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md)
-* [Using Docker](Using-Docker.md)
-* [Windows Anaconda Installation](Installation-Anaconda-Windows.md)
+- [Windows Anaconda Installation](Installation-Anaconda-Windows.md)
+- [Using Docker](Using-Docker.md)
+- [Training on the Cloud with Amazon Web Services](Training-on-Amazon-Web-Service.md)
+- [Training on the Cloud with Microsoft Azure](Training-on-Microsoft-Azure.md)
+- [Using the Video Recorder](https://github.com/Unity-Technologies/video-recorder)
--- a/docs/Training-Imitation-Learning.md
+++ b/docs/Training-Imitation-Learning.md

 <p align="center">
  <img src="images/demo_component.png"
-       alt="BC Teacher Helper"
+       alt="Demonstration Recorder"
       width="375" border="10" />
 </p>


 <p align="center">
  <img src="images/demo_inspector.png"
-       alt="BC Teacher Helper"
+       alt="Demonstration Inspector"
       width="375" border="10" />
 </p>

        gail:
            demo_path: <path_to_your_demo_file>
            ...
-```
+```
--- a/docs/Training-ML-Agents.md
+++ b/docs/Training-ML-Agents.md
 # Training ML-Agents

-The ML-Agents toolkit conducts training using an external Python training
-process. During training, this external process communicates with the Academy
-to generate a block of agent experiences. These
-experiences become the training set for a neural network used to optimize the
-agent's policy (which is essentially a mathematical function mapping
-observations to actions). In reinforcement learning, the neural network
-optimizes the policy by maximizing the expected rewards. In imitation learning,
-the neural network optimizes the policy to achieve the smallest difference
-between the actions chosen by the agent trainee and the actions chosen by the
-expert in the same situation.
-
-The output of the training process is a model file containing the optimized
-policy. This model file is a TensorFlow data graph containing the mathematical
-operations and the optimized weights selected during the training process. You
-can set the generated model file in the Behaviors Parameters under your
-Agent in your Unity project to decide the best course of action for an agent.
-
-Use the command `mlagents-learn` to train your agents. This command is installed
-with the `mlagents` package and its implementation can be found at
-`ml-agents/mlagents/trainers/learn.py`. The [configuration file](#training-config-file),
-like `config/trainer_config.yaml` specifies the hyperparameters used during training.
-You can edit this file with a text editor to add a specific configuration for
-each Behavior.
+For a broad overview of reinforcement learning, imitation learning and all the
+training scenarios, methods and options within the ML-Agents Toolkit, see
+[ML-Agents Toolkit Overview](ML-Agents-Overview.md).
-For a broader overview of reinforcement learning, imitation learning and the
-ML-Agents training process, see [ML-Agents Toolkit
-Overview](ML-Agents-Overview.md).
+Once your learning environment has been created and is ready for training, the
+next step is to initiate a training run. Training in the ML-Agents Toolkit is
+powered by a dedicated Python package, `mlagents`. This package exposes a
+command `mlagents-learn` that is the single entry point for all training
+workflows (e.g. reinforcement learning, imitation learning, curriculum learning).
+Its implementation can be found at
+[ml-agents/mlagents/trainers/learn.py](../ml-agents/mlagents/trainers/learn.py).
-Use the `mlagents-learn` command to train agents. `mlagents-learn` supports
-training with
-[reinforcement learning](Background-Machine-Learning.md#reinforcement-learning),
-[curriculum learning](Training-Curriculum-Learning.md),
-and [behavioral cloning imitation learning](Training-Imitation-Learning.md).
+### Starting Training
-Run `mlagents-learn` from the command line to launch the training process. Use
-the command line patterns and the `config/trainer_config.yaml` file to control
-training options.
+`mlagents-learn` is the main training utility provided by the ML-Agents Toolkit.
+It accepts a number of CLI options in addition to a YAML configuration file that
+contains all the configurations and hyperparameters to be used during training.
+The set of configurations and hyperparameters to include in this file depends on
+the agents in your environment and the specific training method you wish to
+utilize. Keep in mind that the hyperparameter values can have a big impact on
+the training performance (i.e. your agent's ability to learn a policy that
+solves the task). In this page, we will review all the hyperparameters for all
+training methods and provide guidelines and advice on their values.
-The basic command for training is:
+To view a description of all the CLI options accepted by `mlagents-learn`, use
+the `--help` flag:
-mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>
+mlagents-learn --help
-where
-
-* `<trainer-config-file>` is the file path of the trainer configuration yaml.
-* `<env_name>`__(Optional)__ is the name (including path) of your Unity
-  executable containing the agents to be trained. If `<env_name>` is not passed,
-  the training will happen in the Editor. Press the :arrow_forward: button in
-  Unity when the message _"Start training by pressing the Play button in the
-  Unity Editor"_ is displayed on the screen.
-* `<run-identifier>` is an optional identifier you can use to identify the
-  results of individual training runs.
-
-For example, suppose you have a project in Unity named "CatsOnBicycles" which
-contains agents ready to train. To perform the training:
-
-1. [Build the project](Learning-Environment-Executable.md), making sure that you
-   only include the training scene.
-2. Open a terminal or console window.
-3. Navigate to the directory where you installed the ML-Agents Toolkit.
-4. Run the following to launch the training process using the path to the Unity
-   environment you built in step 1:
+The basic command for training is:
-mlagents-learn config/trainer_config.yaml --env=../../projects/Cats/CatsOnBicycles.app --run-id=cob_1
+mlagents-learn <trainer-config-file> --env=<env_name> --run-id=<run-identifier>
-During a training session, the training program prints out and saves updates at
-regular intervals (specified by the `summary_freq` option). The saved statistics
-are grouped by the `run-id` value so you should assign a unique id to each
-training run if you plan to view the statistics. You can view these statistics
-using TensorBoard during or after training by running the following command:
+where
-```sh
-tensorboard --logdir=summaries --port 6006
-```
+- `<trainer-config-file>` is the file path of the trainer configuration yaml.
+  This contains all the hyperparameter values. We offer a detailed guide on the
+  structure of this file and the meaning of the hyperparameters (and advice on how
+  to set them) in the dedicated [Training Config File](#training-config-file)
+  section below.
+- `<env_name>` **(Optional)** is the name (including path) of your
+  [Unity executable](Learning-Environment-Executable.md) containing the agents
+  to be trained. If `<env_name>` is not passed, the training will happen in the
+  Editor. Press the :arrow_forward: button in Unity when the message _"Start
+  training by pressing the Play button in the Unity Editor"_ is displayed on
+  the screen.
+- `<run-identifier>` is a unique name you can use to identify the results of
+  your training runs.
-And then opening the URL: [localhost:6006](http://localhost:6006).
+See the
+[Getting Started Guide](Getting-Started.md#training-a-new-model-with-reinforcement-learning)
+for a sample execution of the `mlagents-learn` command.
-**Note:** The default port TensorBoard uses is 6006. If there is an existing session
-running on port 6006 a new session can be launched on an open port using the --port
-option.
+#### Observing Training
-When training is finished, you can find the saved model in the `models` folder
-under the assigned run-id — in the cats example, the path to the model would be
-`models/cob_1/CatsOnBicycles_cob_1.nn`.
+Regardless of which training methods, configurations or hyperparameters you
+provide, the training process will always generate three artifacts:
-While this example used the default training hyperparameters, you can edit the
-[trainer_config.yaml file](#training-config-file) with a text editor to set
-different values.
+1. Summaries (under the `summaries/` folder): these are training metrics that
+   are updated throughout the training process. They are helpful to monitor your
+   training performance and may help inform how to update your hyperparameter
+   values. See [Using TensorBoard](Using-Tensorboard.md) for more details on how
+   to visualize the training metrics.
+1. Models (under the `models/` folder): these contain the model checkpoints that
+   are updated throughout training and the final model file (`.nn`). This final
+   model file is generated once, either when training completes or when it is
+   interrupted.
+1. Timers file (also under the `summaries/` folder): this contains aggregated
+   metrics on your training process, including time spent on specific code
+   blocks. See [Profiling in Python](Profiling-Python.md) for more information
+   on the timers generated.
-To interrupt training and save the current progress, hit Ctrl+C once and wait for the
-model to be saved out.
+These artifacts (except the `.nn` file) are updated throughout the training
+process and finalized when training completes or is interrupted.
-### Loading an Existing Model
+#### Stopping and Resuming Training
-If you've quit training early using Ctrl+C, you can resume the training run by running
-`mlagents-learn` again, specifying the same `<run-identifier>` and appending the `--resume` flag
-to the command.
+To interrupt training and save the current progress, hit `Ctrl+C` once and wait
+for the model(s) to be saved out.
-You can also use this mode to run inference of an already-trained model in Python.
-Append both the `--resume` and `--inference` to do this. Note that if you want to run
-inference in Unity, you should use the
-[Unity Inference Engine](Getting-started#Running-a-pre-trained-model).
+To resume a previously interrupted or completed training run, use the `--resume`
+flag and make sure to specify the previously used run ID.
-If you've already trained a model using the specified `<run-identifier>` and `--resume` is not
-specified, you will not be able to continue with training. Use `--force` to force ML-Agents to
-overwrite the existing data.
+If you would like to re-run a previously interrupted or completed training run
+and re-use the same run ID (in this case, overwriting the previously generated
+artifacts), then use the `--force` flag.
-Alternatively, you might want to start a new training run but _initialize_ it using an already-trained
-model. You may want to do this, for instance, if your environment changed and you want
-a new model, but the old behavior is still better than random. You can do this by specifying `--initialize-from=<run-identifier>`, where `<run-identifier>` is the old run ID.
+#### Loading an Existing Model
-### Command Line Training Options
+You can also run inference of an already-trained model in Python by using both
+the `--resume` and `--inference` flags together. Note that if you
+want to run inference in Unity, you should use the
+[Unity Inference Engine](Getting-Started.md#running-a-pre-trained-model).
-In addition to passing the path of the Unity executable containing your training
-environment, you can set the following command line options when invoking
-`mlagents-learn`:
+Alternatively, you might want to start a new training run but _initialize_ it
+using an already-trained model. You may want to do this, for instance, if your
+environment changed and you want a new model, but the old behavior is still
+better than random. You can do this by specifying
+`--initialize-from=<run-identifier>`, where `<run-identifier>` is the old run
+ID.
-* `--env=<env>`: Specify an executable environment to train.
-* `--curriculum=<file>`: Specify a curriculum JSON file for defining the
-  lessons for curriculum training. See [Curriculum
-  Training](Training-Curriculum-Learning.md) for more information.
-* `--sampler=<file>`: Specify a sampler YAML file for defining the
-  sampler for parameter randomization. See [Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md) for more information.
-* `--keep-checkpoints=<n>`: Specify the maximum number of model checkpoints to
-  keep. Checkpoints are saved after the number of steps specified by the
-  `save-freq` option. Once the maximum number of checkpoints has been reached,
-  the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
-* `--lesson=<n>`: Specify which lesson to start with when performing curriculum
-  training. Defaults to 0.
-* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
-  collect experiences from when training. Defaults to 1.
-* `--run-id=<run-identifier>`: Specifies an identifier for each training run. This
-  identifier is used to name the subdirectories in which the trained model and
-  summary statistics are saved as well as the saved model itself. The default id
-  is "ppo". If you use TensorBoard to view the training statistics, always set a
-  unique run-id for each training run. (The statistics for all runs with the
-  same id are combined as if they were produced by a the same session.)
-* `--save-freq=<n>`: Specifies how often (in  steps) to save the model during
-  training. Defaults to 50000.
-* `--seed=<n>`: Specifies a number to use as a seed for the random number
-  generator used by the training code.
-* `--env-args=<string>`: Specify arguments for the executable environment. Be aware that
-  the standalone build will also process these as
-  [Unity Command Line Arguments](https://docs.unity3d.com/Manual/CommandLineArguments.html).
-  You should choose different argument names if you want to create environment-specific arguments.
-  All arguments after this flag will be passed to the executable. For example, setting
-  `mlagents-learn config/trainer_config.yaml --env-args --num-orcs 42` would result in
-   ` --num-orcs 42` passed to the executable.
-* `--base-port`: Specifies the starting port. Each concurrent Unity environment instance
-  will get assigned a port sequentially, starting from the `base-port`. Each instance
-  will use the port `(base_port + worker_id)`, where the `worker_id` is sequential IDs
-  given to each instance from 0 to `num_envs - 1`. Default is 5005. __Note:__ When
-  training using the Editor rather than an executable, the base port will be ignored.
-* `--inference`: Specifies whether to only run in inference mode. Omit to train the model.
-  To load an existing model, specify a run-id and combine with `--resume`.
-* `--resume`: If set, the training code loads an already trained model to
-  initialize the neural network before training. The learning code looks for the
-  model in `models/<run-id>/` (which is also where it saves models at the end of
-  training). This option only works when the models exist, and have the same behavior names
-  as the current agents in your scene.
-* `--force`: Attempting to train a model with a run-id that has been used before will
-  throw an error. Use `--force` to force-overwrite this run-id's summary and model data.
-* `--initialize-from=<run-identifier>`: Specify an old run-id here to initialize your model from
-  a previously trained model. Note that the previously saved models _must_ have the same behavior
-  parameters as your current environment.
-* `--no-graphics`: Specify this option to run the Unity executable in
-  `-batchmode` and doesn't initialize the graphics driver. Use this only if your
-  training doesn't involve visual observations (reading from Pixels). See
-  [here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
-  details.
-* `--debug`: Specify this option to enable debug-level logging for some parts of the code.
-* `--cpu`: Forces training using CPU only.
-* Engine Configuration :
-  * `--width` : The width of the executable window of the environment(s) in pixels
-  (ignored for editor training) (Default 84)
-  * `--height` : The height of the executable window of the environment(s) in pixels
-  (ignored for editor training). (Default 84)
-  * `--quality-level` : The quality level of the environment(s). Equivalent to
-  calling `QualitySettings.SetQualityLevel` in Unity. (Default 5)
-  * `--time-scale` : The time scale of the Unity environment(s). Equivalent to setting
-  `Time.timeScale` in Unity. (Default 20.0, maximum 100.0)
-  * `--target-frame-rate` : The target frame rate of the Unity environment(s).
-  Equivalent to setting `Application.targetFrameRate` in Unity. (Default: -1)
+## Training Config File
-### Training Config File
+The Unity ML-Agents Toolkit provides a wide range of training scenarios, methods
+and options. As such, specific training runs may require different training
+configurations and may generate different artifacts and TensorBoard statistics.
+This section offers a detailed guide into how to manage the different training
+set-ups within the toolkit.
-The training config files `config/trainer_config.yaml`, `config/sac_trainer_config.yaml`,
-`config/gail_config.yaml` and `config/offline_bc_config.yaml` specifies the training method,
-the hyperparameters, and a few additional values to use when training with Proximal Policy
-Optimization(PPO), Soft Actor-Critic(SAC), GAIL (Generative Adversarial Imitation Learning)
-with PPO/SAC, and Behavioral Cloning(BC)/Imitation with PPO/SAC. These files are divided
-into sections. The **default** section defines the default values for all the available
-training with PPO, SAC, GAIL (with PPO), and BC. These files are divided into sections.
-The **default** section defines the default values for all the available settings. You can
-also add new sections to override these defaults to train specific Behaviors. Name each of these
-override sections after the appropriate `Behavior Name`. Sections for the
+The training config files `config/trainer_config.yaml`,
+`config/sac_trainer_config.yaml`, `config/gail_config.yaml` and
+`config/offline_bc_config.yaml` specify the training method, the
+hyperparameters, and a few additional values to use when training with Proximal
+Policy Optimization (PPO), Soft Actor-Critic (SAC), GAIL (Generative Adversarial
+Imitation Learning) with PPO/SAC, and Behavioral Cloning (BC)/Imitation with
+PPO/SAC. These files are divided into sections. The **default** section defines
+the default values for all the available settings. You can also add new
+sections to override these defaults to train specific Behaviors. Name each of
+these override sections after the appropriate `Behavior Name`. Sections for the
-|     **Setting**      |                                                                                     **Description**                                                                                     | **Applies To Trainer\*** |
-| :------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- |
-| batch_size           | The number of experiences in each iteration of gradient descent.                                                                                                                        | PPO, SAC             |
-| batches_per_epoch    | In imitation learning, the number of batches of training examples to collect before training the model.                                                                                 |                        |
-| beta                 | The strength of entropy regularization.                                                                                                                                                 | PPO                      |
-| buffer_size          | The number of experiences to collect before updating the policy model. In SAC, the max size of the experience buffer.                                                                   | PPO, SAC                 |
-| buffer_init_steps    | The number of experiences to collect into the buffer before updating the policy model.                                                                                                  | SAC                      |
-| epsilon              | Influences how rapidly the policy can evolve during training.                                                                                                                           | PPO                      |
-| hidden_units         | The number of units in the hidden layers of the neural network.                                                                                                                         | PPO, SAC             |
-| init_entcoef         | How much the agent should explore in the beginning of training.                                                                                                                         | SAC                      |
-| lambd                | The regularization parameter.                                                                                                                                                           | PPO                      |
-| learning_rate        | The initial learning rate for gradient descent.                                                                                                                                         | PPO, SAC             |
-| learning_rate_schedule | Determines how learning rate changes over time. | PPO, SAC |
-| max_steps            | The maximum number of simulation steps to run during a training session.                                                                                                                | PPO, SAC             |
-| memory_size          | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md).                                 | PPO, SAC             |
-| normalize            | Whether to automatically normalize observations.                                                                                                                                        | PPO, SAC                 |
-| num_epoch            | The number of passes to make through the experience buffer when performing gradient descent optimization.                                                                               | PPO                      |
-| num_layers           | The number of hidden layers in the neural network.                                                                                                                                      | PPO, SAC             |
-| behavioral_cloning          | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-behavioral-cloning-using-demonstrations).                           | PPO, SAC                 |
-| reward_signals       | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options.                                         | PPO, SAC             |
-| save_replay_buffer   | Saves the replay buffer when exiting training, and loads it on resume.                                                                                                                  | SAC                      |
-| sequence_length      | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC             |
-| summary_freq         | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard.                                                                       | PPO, SAC             |
-| tau                  | How aggressively to update the target network used for bootstrapping value estimation in SAC.                                                                                           | SAC                      |
-| time_horizon         | How many steps of experience to collect per-agent before adding it to the experience buffer.                                                                                            | PPO, SAC    |
-| trainer              | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc".                                                                                                             | PPO, SAC             |
-| train_interval       | How often to update the agent.                                                                                                                                                          | SAC                      |
-| steps_per_update           | Ratio of agent steps per mini-batch update.                                                                                                                     | SAC                      |
-| use_recurrent        | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md).                                                                                       | PPO, SAC             |
-| init_path        | Initialize trainer from a previously saved model.                                                                                       | PPO, SAC             |
+\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral
+Cloning (Imitation), GAIL = Generative Adversarial Imitation Learning
-\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral Cloning (Imitation), GAIL = Generative Adversarial Imitaiton Learning
+| **Setting**            | **Description**                                                                                                                                                                         | **Applies To Trainer\*** |
+| :--------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- |
+| batch_size             | The number of experiences in each iteration of gradient descent.                                                                                                                        | PPO, SAC                 |
+| batches_per_epoch      | In imitation learning, the number of batches of training examples to collect before training the model.                                                                                 |                          |
+| beta                   | The strength of entropy regularization.                                                                                                                                                 | PPO                      |
+| buffer_size            | The number of experiences to collect before updating the policy model. In SAC, the max size of the experience buffer.                                                                   | PPO, SAC                 |
+| buffer_init_steps      | The number of experiences to collect into the buffer before updating the policy model.                                                                                                  | SAC                      |
+| epsilon                | Influences how rapidly the policy can evolve during training.                                                                                                                           | PPO                      |
+| hidden_units           | The number of units in the hidden layers of the neural network.                                                                                                                         | PPO, SAC                 |
+| init_entcoef           | How much the agent should explore in the beginning of training.                                                                                                                         | SAC                      |
+| lambd                  | The regularization parameter.                                                                                                                                                           | PPO                      |
+| learning_rate          | The initial learning rate for gradient descent.                                                                                                                                         | PPO, SAC                 |
+| learning_rate_schedule | Determines how learning rate changes over time.                                                                                                                                         | PPO, SAC                 |
+| max_steps              | The maximum number of simulation steps to run during a training session.                                                                                                                | PPO, SAC                 |
+| memory_size            | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md).                                 | PPO, SAC                 |
+| normalize              | Whether to automatically normalize observations.                                                                                                                                        | PPO, SAC                 |
+| num_epoch              | The number of passes to make through the experience buffer when performing gradient descent optimization.                                                                               | PPO                      |
+| num_layers             | The number of hidden layers in the neural network.                                                                                                                                      | PPO, SAC                 |
+| behavioral_cloning     | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-behavioral-cloning-using-demonstrations).                    | PPO, SAC                 |
+| reward_signals         | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options.                                         | PPO, SAC                 |
+| save_replay_buffer     | Saves the replay buffer when exiting training, and loads it on resume.                                                                                                                  | SAC                      |
+| sequence_length        | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC                 |
+| summary_freq           | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard.                                                                       | PPO, SAC                 |
+| tau                    | How aggressively to update the target network used for bootstrapping value estimation in SAC.                                                                                           | SAC                      |
+| time_horizon           | How many steps of experience to collect per-agent before adding it to the experience buffer.                                                                                            | PPO, SAC                 |
+| trainer                | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc".                                                                                                             | PPO, SAC                 |
+| steps_per_update           | Ratio of agent steps per mini-batch update.                                                                                                                     | SAC                      |
+| use_recurrent          | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md).                                                                                       | PPO, SAC                 |
+| init_path              | Initialize trainer from a previously saved model.                                                                                                                                       | PPO, SAC                 |
+| threaded               | Run the trainer in a separate thread, in parallel with the environment steps. (Default: true)                                                                                           | PPO, SAC                 |
-* [Training with PPO](Training-PPO.md)
-* [Training with SAC](Training-SAC.md)
-* [Using Recurrent Neural Networks](Feature-Memory.md)
-* [Training with Curriculum Learning](Training-Curriculum-Learning.md)
-* [Training with Imitation Learning](Training-Imitation-Learning.md)
-* [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
+- [Training with PPO](Training-PPO.md)
+- [Training with SAC](Training-SAC.md)
+- [Training with Self-Play](Training-Self-Play.md)
+- [Using Recurrent Neural Networks](Feature-Memory.md)
+- [Training with Curriculum Learning](Training-Curriculum-Learning.md)
+- [Training with Imitation Learning](Training-Imitation-Learning.md)
+- [Training with Environment Parameter Randomization](Training-Environment-Parameter-Randomization.md)
-[example environments](Learning-Environment-Examples.md)
-to the corresponding sections of the `config/trainer_config.yaml` file for each
-example to see how the hyperparameters and other configuration variables have
-been changed from the defaults.
-
-### Debugging and Profiling
-If you enable the `--debug` flag in the command line, the trainer metrics are logged to a CSV file
-stored in the `summaries` directory. The metrics stored are:
-  * brain name
-  * time to update policy
-  * time since start of training
-  * time for last experience collection
-  * number of experiences used for training
-  * mean return
-
-This option is not available currently for Behavioral Cloning.
-
-Additionally, we have included basic [Profiling in Python](Profiling-Python.md) as part of the toolkit.
-This information is also saved in the `summaries` directory.
+[example environments](Learning-Environment-Examples.md) to the corresponding
+sections of the `config/trainer_config.yaml` file for each example to see how
+the hyperparameters and other configuration variables have been changed from the
+defaults.
--- a/docs/Training-on-Amazon-Web-Service.md
+++ b/docs/Training-on-Amazon-Web-Service.md
 # Training on Amazon Web Service

-Note: We no longer use this guide ourselves and so it may not work correctly. We've
-decided to keep it up just in case it is helpful to you.
+:warning: **Note:** We no longer use this guide ourselves and so it may not work
+correctly. We've decided to keep it up just in case it is helpful to you.

 This page contains instructions for setting up an EC2 instance on Amazon Web
 Service for training ML-Agents environments.
-We've prepared a pre-configured AMI for you with the ID: `ami-016ff5559334f8619` in the
-`us-east-1` region. It was created as a modification of [Deep Learning AMI
-(Ubuntu)](https://aws.amazon.com/marketplace/pp/B077GCH38C). The AMI has been
-tested with p2.xlarge instance. Furthermore, if you want to train without
-headless mode, you need to enable X Server.
+We've prepared a pre-configured AMI for you with the ID: `ami-016ff5559334f8619`
+in the `us-east-1` region. It was created as a modification of
+[Deep Learning AMI (Ubuntu)](https://aws.amazon.com/marketplace/pp/B077GCH38C).
+The AMI has been tested with a p2.xlarge instance. Furthermore, if you want to
+train without headless mode, you need to enable X Server.

 After launching your EC2 instance using the AMI and connecting to it via SSH,
 run the following commands to enable it:

 1. Activate the python3 environment

-    ```sh
-    source activate python3
-    ```
+   ```sh
+   source activate python3
+   ```
-    ```sh
-    git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
-    cd ml-agents/ml-agents/
-    pip3 install -e .
-    ```
+   ```sh
+   git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
+   cd ml-agents/ml-agents/
+   pip3 install -e .
+   ```

 ### Setting up X Server (optional)


 #### Make sure there are no Xorg processes running:

-   ```sh
-   # Kill any possible running Xorg processes
-   # Note that you might have to run this command multiple times depending on
-   # how Xorg is configured.
-   $ sudo killall Xorg
+```sh
+# Kill any possible running Xorg processes
+# Note that you might have to run this command multiple times depending on
+# how Xorg is configured.
+$ sudo killall Xorg
-   # Check if there is any Xorg process left
-   # You will have a list of processes running on the GPU, Xorg should not be in
-   # the list, as shown below.
-   $ nvidia-smi
+# Check if there is any Xorg process left
+# You will have a list of processes running on the GPU, Xorg should not be in
+# the list, as shown below.
+$ nvidia-smi
-   # Thu Jun 14 20:21:11 2018
-   # +-----------------------------------------------------------------------------+
-   # | NVIDIA-SMI 390.67                 Driver Version: 390.67                    |
-   # |-------------------------------+----------------------+----------------------+
-   # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
-   # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
-   # |===============================+======================+======================|
-   # |   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
-   # | N/A   37C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
-   # +-------------------------------+----------------------+----------------------+
-   #
-   # +-----------------------------------------------------------------------------+
-   # | Processes:                                                       GPU Memory |
-   # |  GPU       PID   Type   Process name                             Usage      |
-   # |=============================================================================|
-   # |  No running processes found                                                 |
-   # +-----------------------------------------------------------------------------+
+# Thu Jun 14 20:21:11 2018
+# +-----------------------------------------------------------------------------+
+# | NVIDIA-SMI 390.67                 Driver Version: 390.67                    |
+# |-------------------------------+----------------------+----------------------+
+# | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+# | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+# |===============================+======================+======================|
+# |   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
+# | N/A   37C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
+# +-------------------------------+----------------------+----------------------+
+#
+# +-----------------------------------------------------------------------------+
+# | Processes:                                                       GPU Memory |
+# |  GPU       PID   Type   Process name                             Usage      |
+# |=============================================================================|
+# |  No running processes found                                                 |
+# +-----------------------------------------------------------------------------+
-   ```
+```

 #### Start X Server and make the ubuntu use X Server for display:

   can use one of the example environments if you have not created your own).
 2. Open the Build Settings window (menu: File > Build Settings).
 3. Select Linux as the Target Platform, and x86_64 as the target architecture
-(the default x86 currently does not work).
+   (the default x86 currently does not work).
-Headless Mode, you have to setup the X Server to enable training.)
+   Headless Mode, you have to set up the X Server to enable training.)
-    ```sh
-    chmod +x <your_env>.x86_64
-    ```
+   ```sh
+   chmod +x <your_env>.x86_64
+   ```
+
-    ```sh
-    # Start the X Server, press Enter to come back to the command line
-    $ sudo /usr/bin/X :0 &
+   ```sh
+   # Start the X Server, press Enter to come back to the command line
+   $ sudo /usr/bin/X :0 &
+
+   # Check if Xorg process is running
+   # You will have a list of processes running on the GPU, Xorg should be in the list.
+   $ nvidia-smi
-    # Check if Xorg process is running
-    # You will have a list of processes running on the GPU, Xorg should be in the list.
-    $ nvidia-smi
+   # Make the ubuntu use X Server for display
+   $ export DISPLAY=:0
+   ```
-    # Make the ubuntu use X Server for display
-    $ export DISPLAY=:0
-    ```
-    ```python
-    from mlagents_envs.environment import UnityEnvironment
+   ```python
+   from mlagents_envs.environment import UnityEnvironment
-    env = UnityEnvironment(<your_env>)
-    ```
+   env = UnityEnvironment(<your_env>)
+   ```
-    Where `<your_env>` corresponds to the path to your environment executable.
+   Where `<your_env>` corresponds to the path to your environment executable.
-    You should receive a message confirming that the environment was loaded successfully.
+   You should receive a message confirming that the environment was loaded
+   successfully.
+
 10. Train your models

    ```console
 ## FAQ

-### The <Executable_Name>_Data folder hasn't been copied cover
+### The <Executable_Name>\_Data folder hasn't been copied over
-If you've built your Linux executable, but forget to copy over the corresponding <Executable_Name>_Data folder, you will see error message like the following:
+If you've built your Linux executable but forgot to copy over the corresponding
+<Executable_Name>\_Data folder, you will see an error message like the following:

 ```sh
 Set current directory to /home/ubuntu/ml-agents/ml-agents

 ### Unity Environment not responding

-If you didn't setup X Server or hasn't launched it properly, or your environment somehow crashes, or you haven't `chmod +x` your Unity Environment, all of these will cause connection between Unity and Python to fail. Then you will see something like this:
+If you didn't set up X Server or haven't launched it properly, or your
+environment somehow crashes, or you haven't `chmod +x` your Unity Environment,
+any of these will cause the connection between Unity and Python to fail. Then
+you will see something like this:

 ```console
 Logging to /home/ubuntu/.config/unity3d/<Some_Path>/Player.log
         The environment and the Python interface have compatible versions.
 ```

-It would be also really helpful to check your /home/ubuntu/.config/unity3d/<Some_Path>/Player.log to see what happens with your Unity environment.
+It would also be really helpful to check your
+/home/ubuntu/.config/unity3d/<Some_Path>/Player.log to see what happens with
+your Unity environment.

 ### Could not launch X Server

 ```sh
 NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.
 ```
-This means the NVIDIA's driver needs to be updated. Refer to [this section](Training-on-Amazon-Web-Service.md#update-and-setup-nvidia-driver) for more information.
+
+This means the NVIDIA driver needs to be updated. Refer to
+[this section](Training-on-Amazon-Web-Service.md#update-and-setup-nvidia-driver)
+for more information.
--- a/docs/Training-on-Microsoft-Azure.md
+++ b/docs/Training-on-Microsoft-Azure.md
 # Training on Microsoft Azure (works with ML-Agents toolkit v0.3)

-Note: We no longer use this guide ourselves and so it may not work correctly. We've
-decided to keep it up just in case it is helpful to you.
+:warning: **Note:** We no longer use this guide ourselves and so it may not work
+correctly. We've decided to keep it up just in case it is helpful to you.

 This page contains instructions for setting up training on Microsoft Azure
 through either
 ## Pre-Configured Azure Virtual Machine

 A pre-configured virtual machine image is available in the Azure Marketplace and
-is nearly completely ready for training.  You can start by deploying the
+is nearly completely ready for training. You can start by deploying the
-training will, by default, run on the GPU.  If you choose any other type of VM,
+training will, by default, run on the GPU. If you choose any other type of VM,
-Setting up your own instance requires a number of package installations.  Please
-view the documentation for doing so
-[here](Training-on-Microsoft-Azure-Custom-Instance.md).
+Setting up your own instance requires a number of package installations. Please
+view the documentation for doing so [here](#custom-instances).

 ## Installing ML-Agents

 To run your training on the VM:

 1. [Move](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/copy-files-to-linux-vm-using-scp)
-    your built Unity application to your Virtual Machine.
-2. Set the directory where the ML-Agents Toolkit was installed to your
-   working directory.
+   your built Unity application to your Virtual Machine.
+2. Set the directory where the ML-Agents Toolkit was installed to your working
+   directory.
 3. Run the following command:

 ```sh

 ## Monitoring your Training Run with TensorBoard

-Once you have started training, you can [use TensorBoard to observe the
-training](Using-Tensorboard.md).
+Once you have started training, you can
+[use TensorBoard to observe the training](Using-Tensorboard.md).
-1. Start by [opening the appropriate port for web traffic to connect to your VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal).
+1. Start by
+   [opening the appropriate port for web traffic to connect to your VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal).
-    * Note that you don't need to generate a new `Network Security Group` but
-      instead, go to the **Networking** tab under **Settings** for your VM.
-    * As an example, you could use the following settings to open the Port with
-      the following Inbound Rule settings:
-      * Source: Any
-      * Source Port Ranges: *
-      * Destination: Any
-      * Destination Port Ranges: 6006
-      * Protocol: Any
-      * Action: Allow
-      * Priority: (Leave as default)
+   - Note that you don't need to generate a new `Network Security Group` but
+     instead, go to the **Networking** tab under **Settings** for your VM.
+   - As an example, you could use the following settings to open the Port with
+     the following Inbound Rule settings:
+     - Source: Any
+     - Source Port Ranges: \*
+     - Destination: Any
+     - Destination Port Ranges: 6006
+     - Protocol: Any
+     - Action: Allow
+     - Priority: (Leave as default)

 2. Unless you started the training as a background process, connect to your VM
   from another terminal instance.

 [Azure Container Instances](https://azure.microsoft.com/services/container-instances/)
 allow you to spin up a container, on demand, that will run your training and
-then be shut down.  This ensures you aren't leaving a billable VM running when
-it isn't needed. Using ACI enables you to offload training of your models without needing to
-install Python and TensorFlow on your own computer.
+then be shut down. This ensures you aren't leaving a billable VM running when it
+isn't needed. Using ACI enables you to offload training of your models without
+needing to install Python and TensorFlow on your own computer.
+
+## Custom Instances
+
+This section contains instructions for setting up a custom Virtual Machine on
+Microsoft Azure so you can run ML-Agents training in the cloud.
+
+1. Start by
+   [deploying an Azure VM](https://docs.microsoft.com/azure/virtual-machines/linux/quick-create-portal)
+   with Ubuntu Linux (tests were done with 16.04 LTS). To use GPU support, use an
+   N-Series VM.
+2. SSH into your VM.
+3. Start with the following commands to install the Nvidia driver:
+
+   ```sh
+   wget http://us.download.nvidia.com/tesla/375.66/nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
+
+   sudo dpkg -i nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
+
+   sudo apt-get update
+
+   sudo apt-get install cuda-drivers
+
+   sudo reboot
+   ```
+
+4. After a minute you should be able to reconnect to your VM and install the
+   CUDA toolkit:
+
+   ```sh
+   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
+
+   sudo dpkg -i cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
+
+   sudo apt-get update
+
+   sudo apt-get install cuda-8-0
+   ```
+
+5. You'll next need to download cuDNN from the Nvidia developer site. This
+   requires a registered account.
+
+6. Navigate to [http://developer.nvidia.com](http://developer.nvidia.com) and
+   create an account and verify it.
+
+7. Download (to your own computer) cuDNN from
+   [this url](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v6/prod/8.0_20170307/Ubuntu16_04_x64/libcudnn6_6.0.20-1+cuda8.0_amd64-deb).
+
+8. Copy the deb package to your VM:
+
+   ```sh
+   scp libcudnn6_6.0.21-1+cuda8.0_amd64.deb <VMUserName>@<VMIPAddress>:libcudnn6_6.0.21-1+cuda8.0_amd64.deb
+   ```
+
+9. SSH back to your VM and execute the following:
+
+   ```console
+   sudo dpkg -i libcudnn6_6.0.21-1+cuda8.0_amd64.deb
+
+   export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+   . ~/.profile
+
+   sudo reboot
+   ```
+
+10. After a minute, you should be able to SSH back into your VM. After doing so,
+    run the following:
+
+    ```sh
+    sudo apt install python-pip
+    sudo apt install python3-pip
+    ```
+
+11. At this point, you need to install TensorFlow. The version you install
+    depends on whether you are using a GPU to train:
+
+    ```sh
+    pip3 install tensorflow-gpu==1.4.0 keras==2.0.6
+    ```
+
+    Or CPU to train:
+
+    ```sh
+    pip3 install tensorflow==1.4.0 keras==2.0.6
+    ```
+
+12. You'll then need to install additional dependencies:
+
+    ```sh
+    pip3 install pillow
+    pip3 install numpy
+    ```
--- a/docs/Using-Docker.md
+++ b/docs/Using-Docker.md
 # Using Docker For ML-Agents (Deprecated)

-Note: We no longer use this guide ourselves and so it may not work correctly. We've decided to
- keep it up just in case it is helpful to you.
+:warning: **Note:** We no longer use this guide ourselves and so it may not work
+correctly. We've decided to keep it up just in case it is helpful to you.

 We currently offer a solution for Windows and Mac users who would like to do
 training or inference using Docker. This option may be appealing to those who
 ## Requirements

 - [Docker](https://www.docker.com)
- Unity _Linux Build Support_ Component. Make sure to select the _Linux
-Build Support_ component when installing Unity.
+- Unity _Linux Build Support_ Component. Make sure to select the _Linux Build
+  Support_ component when installing Unity.

 <p align="center">
  <img src="images/unity_linux_build_support.png"
 Using Docker for ML-Agents involves three steps: building the Unity environment
 with specific flags, building a Docker container and, finally, running the
 container. If you are not familiar with building a Unity environment for
-ML-Agents, please read through our [Getting Started with the 3D Balance Ball
-Example](Getting-Started.md) guide first.
+ML-Agents, please read through our
+[Getting Started with the 3D Balance Ball Example](Getting-Started.md) guide
+first.

 ### Build the Environment (Optional)


 - Set the _Target Platform_ to `Linux`
 - Set the _Architecture_ to `x86_64`
- If the environment does not contain visual observations, you can select the
-  `headless` option here.

 Then click `Build`, pick an environment name (e.g. `3DBall`) and set the output
 directory to `unity-volume`. After building, ensure that the file
  random name if this is not set. _Note that this must be unique for every run
  of a Docker image._
 - `<image-name>` references the image name used when building the container.
- `<environment-name>` __(Optional)__: If you are training with a linux
+- `<environment-name>` **(Optional)**: If you are training with a linux
  executable, this is the name of the executable. If you are training in the
  Editor, do not pass a `<environment-name>` argument and press the
  :arrow_forward: button in Unity when the message _"Start training by pressing
 For more detail on Docker mounts, check out
 [these](https://docs.docker.com/storage/bind-mounts/) docs from Docker.

-**NOTE** If you are training using docker for environments that use visual observations, you may need to increase the default memory that Docker allocates for the container. For example, see [here](https://docs.docker.com/docker-for-mac/#advanced) for instructions for Docker for Mac.
+**NOTE** If you are training using Docker for environments that use visual
+observations, you may need to increase the default memory that Docker allocates
+for the container. For example, see
+[here](https://docs.docker.com/docker-for-mac/#advanced) for instructions for
+Docker for Mac.
-You can run Tensorboard to monitor your training instance on http://localhost:6006:
+You can run Tensorboard to monitor your training instance on
+http://localhost:6006:

 ```sh
 docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
+
-For more details on Tensorboard, check out the documentation about [Using Tensorboard](Using-Tensorboard.md).
+For more details on Tensorboard, check out the documentation about
+[Using Tensorboard](Using-Tensorboard.md).

 ### Stopping Container and Saving State

 docker kill --signal=SIGINT <container-name>
 ```

-`<container-name>` is the name of the container specified in the earlier `docker
-run` command. If you didn't specify one, you can find the randomly generated
-identifier by running `docker container ls`.
+`<container-name>` is the name of the container specified in the earlier
+`docker run` command. If you didn't specify one, you can find the randomly
+generated identifier by running `docker container ls`.
--- a/docs/Using-Tensorboard.md
+++ b/docs/Using-Tensorboard.md
 start TensorBoard:

 1. Open a terminal or console window:
-2. Navigate to the directory where the ML-Agents Toolkit is installed.
-3. From the command line run :
-
-      ```sh
-      tensorboard --logdir=summaries --port=6006
-      ```
-
-4. Open a browser window and navigate to [localhost:6006](http://localhost:6006).
+1. Navigate to the directory where the ML-Agents Toolkit is installed.
+1. From the command line run: `tensorboard --logdir=summaries --port=6006`
+1. Open a browser window and navigate to
+   [localhost:6006](http://localhost:6006).
-**Note:** The default port TensorBoard uses is 6006. If there is an existing session
-running on port 6006 a new session can be launched on an open port using the --port
-option.
+**Note:** The default port TensorBoard uses is 6006. If there is an existing
+session running on port 6006, a new session can be launched on an open port
+using the `--port` option.

 **Note:** If you don't assign a `run-id` identifier, `mlagents-learn` uses the
 default string, "ppo". All the statistics will be saved to the same sub-folder

 ### Environment Statistics

-* `Environment/Lesson` - Plots the progress from lesson to lesson. Only interesting when
-  performing [curriculum training](Training-Curriculum-Learning.md).
+- `Environment/Lesson` - Plots the progress from lesson to lesson. Only
+  interesting when performing
+  [curriculum training](Training-Curriculum-Learning.md).
-* `Environment/Cumulative Reward` - The mean cumulative episode reward over all agents. Should
-  increase during a successful training session.
+- `Environment/Cumulative Reward` - The mean cumulative episode reward over all
+  agents. Should increase during a successful training session.
-* `Environment/Episode Length` - The mean length of each episode in the environment for all agents.
+- `Environment/Episode Length` - The mean length of each episode in the
+  environment for all agents.
-* `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should slowly decrease
-  during a successful training process. If it decreases too quickly, the `beta`
-  hyperparameter should be increased.
+- `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should
+  slowly decrease during a successful training process. If it decreases too
+  quickly, the `beta` hyperparameter should be increased.
-* `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm takes as it searches
-  for the optimal policy. Should decrease over time.
+- `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm
+  takes as it searches for the optimal policy. Should decrease over time.
-* `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited by the agent. Should increase during a successful training session.
+- `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited
+  by the agent. Should increase during a successful training session.
-* `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean cumulative intrinsic reward generated per-episode.
+- `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean
+  cumulative intrinsic reward generated per-episode.
-* `Losses/Policy Loss` (PPO) - The mean magnitude of policy loss function. Correlates to how
-  much the policy (process for deciding actions) is changing. The magnitude of
-  this should decrease during a successful training session.
+- `Losses/Policy Loss` (PPO) - The mean magnitude of policy loss function.
+  Correlates to how much the policy (process for deciding actions) is changing.
+  The magnitude of this should decrease during a successful training session.
+
+- `Losses/Value Loss` (PPO) - The mean loss of the value function update.
+  Correlates to how well the model is able to predict the value of each state.
+  This should increase while the agent is learning, and then decrease once the
+  reward stabilizes.
-* `Losses/Value Loss` (PPO) - The mean loss of the value function update. Correlates to how
-  well the model is able to predict the value of each state. This should
-  increase while the agent is learning, and then decrease once the reward
-  stabilizes.
+- `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the forward
+  model loss function. Corresponds to how well the model is able to predict the
+  new observation encoding.
+
+- `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the inverse
+  model loss function. Corresponds to how well the model is able to predict the
+  action taken between two observations.
-* `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the inverse model
-  loss function. Corresponds to how well the model is able to predict the new
-  observation encoding.
+- `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning
+  loss. Corresponds to how well the model imitates the demonstration data.
-* `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the forward model
-  loss function. Corresponds to how well the model is able to predict the action
-  taken between two observations.
+## Custom Metrics from Unity
-* `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
+To get custom metrics from a C# environment into Tensorboard, you can use the
+StatsSideChannel:
-## Custom Metrics from C#
-To get custom metrics from a C# environment into Tensorboard, you can use the StatsSideChannel:
 ```csharp
 var statsSideChannel = SideChannelUtils.GetSideChannel<StatsSideChannel>();
 statsSideChannel.AddStat("MyMetric", 1.0);
--- a/docs/Using-Virtual-Environment.md
+++ b/docs/Using-Virtual-Environment.md
 # Using Virtual Environment

 ## What is a Virtual Environment?
-A Virtual Environment is a self contained directory tree that contains a Python installation
-for a particular version of Python, plus a number of additional packages. To learn more about
-Virtual Environments see [here](https://docs.python.org/3/library/venv.html)
+
+A Virtual Environment is a self-contained directory tree that contains a Python
+installation for a particular version of Python, plus a number of additional
+packages. To learn more about Virtual Environments see
+[here](https://docs.python.org/3/library/venv.html).
-A Virtual Environment keeps all dependencies for the Python project separate from dependencies
-of other projects. This has a few advantages:
+
+A Virtual Environment keeps all dependencies for the Python project separate
+from dependencies of other projects. This has a few advantages:
+
-spinning up a new environment and verifying the compatibility of the code with the
-different version.
+   spinning up a new environment and verifying the compatibility of the code
+   with the different version.
-This guide has been tested with Python 3.6 and 3.7. Python 3.8 is not supported at this time.
+
+This guide has been tested with Python 3.6 and 3.7. Python 3.8 is not supported
+at this time.
-1. Download the `get-pip.py` file using the command `curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py`
+1. Download the `get-pip.py` file using the command
+   `curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py`
-Note (for Ubuntu users): If the `ModuleNotFoundError: No module named 'distutils.util'` error is encountered, then
-python3-distutils needs to be installed. Install python3-distutils using `sudo apt-get install python3-distutils`
+Note (for Ubuntu users): If the
+`ModuleNotFoundError: No module named 'distutils.util'` error is encountered,
+then python3-distutils needs to be installed. Install python3-distutils using
+`sudo apt-get install python3-distutils`
-1. Create a folder where the virtual environments will reside `$ mkdir ~/python-envs`
-1. To create a new environment named `sample-env` execute `$ python3 -m venv ~/python-envs/sample-env`
-1. To activate the environment execute `$ source ~/python-envs/sample-env/bin/activate`
-1. Verify pip version is the same as in the __Installing Pip__ section. In case it is not the latest, upgrade to
-the latest pip version using `$ pip3 install --upgrade pip`
-1. To deactivate the environment execute `$ deactivate` (you can reactivate the environment
-using the same `activate` command listed above)
+1. Create a folder where the virtual environments will reside
+   `$ mkdir ~/python-envs`
+1. To create a new environment named `sample-env` execute
+   `$ python3 -m venv ~/python-envs/sample-env`
+1. To activate the environment execute
+   `$ source ~/python-envs/sample-env/bin/activate`
+1. Upgrade to the latest pip version using `$ pip3 install --upgrade pip`
+1. Upgrade to the latest setuptools version using
+   `$ pip3 install --upgrade setuptools`
+1. To deactivate the environment execute `$ deactivate` (you can reactivate the
+   environment using the same `activate` command listed above)

 ## Ubuntu Setup

 ## Windows Setup

 1. Create a folder where the virtual environments will reside `md python-envs`
-1. To create a new environment named `sample-env` execute `python -m venv python-envs\sample-env`
+1. To create a new environment named `sample-env` execute
+   `python -m venv python-envs\sample-env`
-1. Verify pip version is the same as in the __Installing Pip__ section. In case it is not the
-latest, upgrade to the latest pip version using `pip install --upgrade pip`
-1. To deactivate the environment execute `deactivate` (you can reactivate the environment
-using the same `activate` command listed above)
+1. Upgrade to the latest pip version using `pip install --upgrade pip`
+1. To deactivate the environment execute `deactivate` (you can reactivate the
+   environment using the same `activate` command listed above)
- Verify that you are using Python 3.6 or Python 3.7. Launch a command prompt using `cmd` and
- execute `python --version` to verify the version.
+
+- Verify that you are using Python 3.6 or Python 3.7. Launch a command prompt
+  using `cmd` and execute `python --version` to verify the version.
- This guide is for Windows 10 using a 64-bit architecture only.
+- This guide is for Windows 10 using a 64-bit architecture only.
--- a/docs/images/demo_inspector.png
+++ b/docs/images/demo_inspector.png
--- a/docs/images/docker_build_settings.png
+++ b/docs/images/docker_build_settings.png
--- a/docs/images/learning_environment_basic.png
+++ b/docs/images/learning_environment_basic.png
--- a/docs/images/learning_environment_example.png
+++ b/docs/images/learning_environment_example.png
--- a/docs/images/unity_package_json.png
+++ b/docs/images/unity_package_json.png
--- a/docs/images/unity_package_manager_window.png
+++ b/docs/images/unity_package_manager_window.png
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
    )
    argparser.add_argument("trainer_config_path")
    argparser.add_argument(
-        "--env", default=None, dest="env_path", help="Name of the Unity executable "
+        "--env",
+        default=None,
+        dest="env_path",
+        help="Path to the Unity executable to train",
-        help="Curriculum config yaml file for environment",
+        help="YAML file for defining the lessons for curriculum training",
+    )
+    argparser.add_argument(
+        "--lesson",
+        default=0,
+        type=int,
+        help="The lesson to start with when performing curriculum training",
-        help="Reset parameter yaml file for environment",
+        help="YAML file for defining the sampler for environment parameter randomization",
-        help="How many model checkpoints to keep",
-    )
-    argparser.add_argument(
-        "--lesson", default=0, type=int, help="Start learning from this lesson"
+        help="The maximum number of model checkpoints to keep. Checkpoints are saved after the "
+        "number of steps specified by the save-freq option. Once the maximum number of checkpoints "
+        "has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
    )
    argparser.add_argument(
        "--load",
        default=False,
        dest="resume",
        action="store_true",
-        help="Resumes training from a checkpoint. Specify a --run-id to use this option.",
+        help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
+        "If set, the training code loads an already trained model to initialize the neural network "
+        "before resuming training. This option is only valid when the models exist, and have the same "
+        "behavior names as the current agents in your scene.",
    )
    argparser.add_argument(
        "--force",
-        help="Force-overwrite existing models and summaries for a run ID that has been used "
-        "before.",
+        help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
+        "this flag, attempting to train a model with a run-id that has been used before will throw "
+        "an error.)",
-        help="The run identifier for model and summary statistics.",
+        help="The identifier for the training run. This identifier is used to name the "
+        "subdirectories in which the trained model and summary statistics are saved as well "
+        "as the saved model itself. If you use TensorBoard to view the training statistics, "
+        "always set a unique run-id for each training run. (The statistics for all runs with the "
+        "same id are combined as if they were produced by the same session.)",
    )
    argparser.add_argument(
        "--initialize-from",
-        "This can be used, for instance, to fine-tune an existing model on a new environment. ",
+        "This can be used, for instance, to fine-tune an existing model on a new environment. "
+        "Note that the previously saved models must have the same behavior parameters as your "
+        "current environment.",
-        "--save-freq", default=50000, type=int, help="Frequency at which to save model"
+        "--save-freq",
+        default=50000,
+        type=int,
+        help="How often (in steps) to save the model during training",
-        "--seed", default=-1, type=int, help="Random seed used for training"
+        "--seed",
+        default=-1,
+        type=int,
+        help="A number to use as a seed for the random number generator used by the training code",
    )
    argparser.add_argument(
        "--train",
        default=False,
        dest="inference",
        action="store_true",
-        help="Run in Python inference mode (don't train). Use with --resume to load a model trained with an "
-        "existing run ID.",
+        help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
+        "a model trained with an existing run ID.",
-        help="Base port for environment communication",
+        help="The starting port for environment communication. Each concurrent Unity environment "
+        "instance will get assigned a port sequentially, starting from the base-port. Each instance "
+        "will use the port (base_port + worker_id), where the worker_id is a sequential ID given to "
+        "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
+        "than an executable, the base port will be ignored.",
-        help="Number of parallel environments to use for training",
+        help="The number of concurrent Unity environment instances to collect experiences "
+        "from when training",
-        help="Whether to run the environment in no-graphics mode",
+        help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
+        "the graphics driver). Use this only if your agents don't use visual observations.",
-        help="Whether to run ML-Agents in debug mode with detailed logging",
+        help="Whether to enable debug-level logging for some parts of the code",
-        help="Arguments passed to the Unity executable.",
+        help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
+        "process these as Unity Command Line Arguments. You should choose different argument names if "
+        "you want to create environment-specific arguments. All arguments after this flag will be "
+        "passed to the executable.",
-        "--cpu", default=False, action="store_true", help="Run with CPU only"
+        "--cpu",
+        default=False,
+        action="store_true",
+        help="Forces training using CPU only",
    )

    argparser.add_argument("--version", action="version", version="")
        "--width",
        default=84,
        type=int,
-        help="The width of the executable window of the environment(s)",
+        help="The width of the executable window of the environment(s) in pixels "
+        "(ignored for editor training).",
-        help="The height of the executable window of the environment(s)",
+        help="The height of the executable window of the environment(s) in pixels "
+        "(ignored for editor training)",
-        help="The quality level of the environment(s)",
+        help="The quality level of the environment(s). Equivalent to calling "
+        "QualitySettings.SetQualityLevel in Unity.",
-        help="The time scale of the Unity environment(s)",
+        help="The time scale of the Unity environment(s). Equivalent to setting "
+        "Time.timeScale in Unity.",
-        help="The target frame rate of the Unity environment(s)",
+        help="The target frame rate of the Unity environment(s). Equivalent to setting "
+        "Application.targetFrameRate in Unity.",
    )
    return argparser

--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs
+using System;
+using UnityEngine;
+using MLAgents.Policies;
+using UnityEngine.Serialization;
+
+namespace MLAgents.Demonstrations
+{
+    /// <summary>
+    /// Demonstration meta-data: number of steps, number of episodes, mean reward and demonstration name.
+    /// Kept in a plain [Serializable] class (not a struct) for easy serialization and deserialization.
+    /// </summary>
+    [Serializable]
+    internal class DemonstrationMetaData
+    {
+        [FormerlySerializedAs("numberExperiences")] // This field was previously serialized under the name "numberExperiences".
+        public int numberSteps;
+        public int numberEpisodes;
+        public float meanReward; // NOTE(review): presumably the mean reward per episode -- confirm against the code that fills it in.
+        public string demonstrationName;
+        public const int ApiVersion = 1; // Compile-time constant shared by all instances, not per-instance data.
+    }
+}
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs.meta
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationMetaData.cs.meta
+fileFormatVersion: 2
+guid: af5f3b4258a2d4ead90e733f30cfaa7a
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
--- a/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationSummary.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/DemonstrationSummary.cs
+using System;
+using System.Collections.Generic;
+using UnityEngine;
+using MLAgents.Policies;
+
+namespace MLAgents.Demonstrations
+{
+    /// <summary>
+    /// Summary of a loaded Demonstration file. Only used for display in the Inspector.
+    /// </summary>
+    [Serializable]
+    internal class DemonstrationSummary : ScriptableObject
+    {
+        public DemonstrationMetaData metaData;
+        public BrainParameters brainParameters;
+        public List<ObservationSummary> observationSummaries; // One entry per observation; each entry only carries a shape array.
+        /// <summary>Assigns the three arguments directly to the matching public fields (no copies are made).</summary>
+        public void Initialize(BrainParameters brainParams,
+            DemonstrationMetaData demonstrationMetaData, List<ObservationSummary> obsSummaries)
+        {
+            brainParameters = brainParams;
+            metaData = demonstrationMetaData;
+            observationSummaries = obsSummaries;
+        }
+    }
+
+
+    /// <summary>
+    /// Summary of a loaded Observation. Currently only contains the shape of the Observation.
+    /// </summary>
+    /// <remarks>Wrapping the shape array in a struct is necessary because serialization doesn't support nested containers or arrays.</remarks>
+    [Serializable]
+    internal struct ObservationSummary
+    {
+        public int[] shape; // NOTE(review): presumably one entry per observation dimension -- confirm against the code that builds it.
+    }
+}
--- a/docs/images/learning_environment_full.png
+++ b/docs/images/learning_environment_full.png
--- a/com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs
+++ b/com.unity.ml-agents/Runtime/Demonstrations/Demonstration.cs
-using System;
-using UnityEngine;
-using MLAgents.Policies;
-
-namespace MLAgents.Demonstrations
-{
-    /// <summary>
-    /// Demonstration Object. Contains meta-data regarding demonstration.
-    /// Used for imitation learning, or other forms of learning from data.
-    /// </summary>
-    [Serializable]
-    internal class Demonstration : ScriptableObject
-    {
-        public DemonstrationMetaData metaData;
-        public BrainParameters brainParameters;
-
-        public void Initialize(BrainParameters brainParams,
-            DemonstrationMetaData demonstrationMetaData)
-        {
-            brainParameters = brainParams;
-            metaData = demonstrationMetaData;
-        }
-    }
-
-    /// <summary>
-    /// Demonstration meta-data.
-    /// Kept in a struct for easy serialization and deserialization.
-    /// </summary>
-    [Serializable]
-    internal class DemonstrationMetaData
-    {
-        public int numberExperiences;
-        public int numberEpisodes;
-        public float meanReward;
-        public string demonstrationName;
-        public const int ApiVersion = 1;
-    }
-}
--- a/docs/Training-on-Microsoft-Azure-Custom-Instance.md
+++ b/docs/Training-on-Microsoft-Azure-Custom-Instance.md
-# Setting up a Custom Instance on Microsoft Azure for Training (works with the ML-Agents toolkit v0.3)
-
-This page contains instructions for setting up a custom Virtual Machine on Microsoft Azure so you can running ML-Agents training in the cloud.
-
-1. Start by
-   [deploying an Azure VM](https://docs.microsoft.com/azure/virtual-machines/linux/quick-create-portal)
-   with Ubuntu Linux (tests were done with 16.04 LTS).  To use GPU support, use
-   a N-Series VM.
-2. SSH into your VM.
-3. Start with the following commands to install the Nvidia driver:
-
-   ```sh
-   wget http://us.download.nvidia.com/tesla/375.66/nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
-
-   sudo dpkg -i nvidia-diag-driver-local-repo-ubuntu1604_375.66-1_amd64.deb
-
-   sudo apt-get update
-
-   sudo apt-get install cuda-drivers
-
-   sudo reboot
-   ```
-
-4. After a minute you should be able to reconnect to your VM and install the
-   CUDA toolkit:
-
-   ```sh
-   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
-
-   sudo dpkg -i cuda-repo-ubuntu1604_8.0.61-1_amd64.deb
-
-   sudo apt-get update
-
-   sudo apt-get install cuda-8-0
-   ```
-
-5. You'll next need to download cuDNN from the Nvidia developer site.  This
-   requires a registered account.
-
-6. Navigate to [http://developer.nvidia.com](http://developer.nvidia.com) and
-   create an account and verify it.
-
-7. Download (to your own computer) cuDNN from [this url](https://developer.nvidia.com/compute/machine-learning/cudnn/secure/v6/prod/8.0_20170307/Ubuntu16_04_x64/libcudnn6_6.0.20-1+cuda8.0_amd64-deb).
-
-8. Copy the deb package to your VM:
-
-   ```sh
-   scp libcudnn6_6.0.21-1+cuda8.0_amd64.deb <VMUserName>@<VMIPAddress>:libcudnn6_6.0.21-1+cuda8.0_amd64.deb
-   ```
-
-9. SSH back to your VM and execute the following:
-
-   ```console
-   sudo dpkg -i libcudnn6_6.0.21-1+cuda8.0_amd64.deb
-
-   export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
-   . ~/.profile
-
-   sudo reboot
-   ```
-
-10. After a minute, you should be able to SSH back into your VM.  After doing
-    so, run the following:
-
-    ```sh
-    sudo apt install python-pip
-    sudo apt install python3-pip
-    ```
-
-11. At this point, you need to install TensorFlow.  The version you install
-    should be tied to if you are using GPU to train:
-
-    ```sh
-    pip3 install tensorflow-gpu==1.4.0 keras==2.0.6
-    ```
-
-    Or CPU to train:
-
-    ```sh
-    pip3 install tensorflow==1.4.0 keras==2.0.6
-    ```
-
-12. You'll then need to install additional dependencies:
-
-    ```sh
-    pip3 install pillow
-    pip3 install numpy
-    ```
-
-13. You can now return to the
-    [main Azure instruction page](Training-on-Microsoft-Azure.md).
--- a//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationSummary.cs.meta
+++ b//com.unity.ml-agents/Runtime/Demonstrations/DemonstrationSummary.cs.meta