
Merge remote-tracking branch 'origin/master' into release_1_to_master

Branch: release_1_branch
Chris Elion, 5 years ago
Current commit: 68b68396
107 files changed, with 1,826 insertions and 293 deletions
  1. .gitignore (4 changes)
  2. com.unity.ml-agents/CHANGELOG.md (20 changes)
  3. com.unity.ml-agents/Editor/BehaviorParametersEditor.cs (4 changes)
  4. com.unity.ml-agents/Runtime/Agent.cs (2 changes)
  5. com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs (2 changes)
  6. com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs (4 changes)
  7. com.unity.ml-agents/Runtime/Inference/TensorApplier.cs (2 changes)
  8. com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs (2 changes)
  9. com.unity.ml-agents/Runtime/Inference/TensorProxy.cs (2 changes)
  10. com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs (2 changes)
  11. com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs (2 changes)
  12. com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs (4 changes)
  13. com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs (4 changes)
  14. com.unity.ml-agents/Tests/Editor/Sensor/ObservationWriterTests.cs (2 changes)
  15. com.unity.ml-agents/Tests/Editor/TensorUtilsTest.cs (4 changes)
  16. docs/Getting-Started.md (15 changes)
  17. docs/Learning-Environment-Create-New.md (43 changes)
  18. docs/Learning-Environment-Examples.md (4 changes)
  19. docs/Learning-Environment-Executable.md (13 changes)
  20. docs/Migrating.md (19 changes)
  21. docs/Using-Tensorboard.md (2 changes)
  22. gym-unity/README.md (3 changes)
  23. gym-unity/gym_unity/__init__.py (4 changes)
  24. ml-agents-envs/mlagents_envs/__init__.py (4 changes)
  25. ml-agents-envs/mlagents_envs/environment.py (30 changes)
  26. ml-agents-envs/mlagents_envs/tests/test_envs.py (12 changes)
  27. ml-agents/mlagents/trainers/__init__.py (4 changes)
  28. ml-agents/mlagents/trainers/learn.py (169 changes)
  29. ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)
  30. ml-agents/mlagents/trainers/ppo/trainer.py (3 changes)
  31. ml-agents/mlagents/trainers/sac/trainer.py (7 changes)
  32. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (6 changes)
  33. ml-agents/mlagents/trainers/tests/test_bcmodule.py (2 changes)
  34. ml-agents/mlagents/trainers/tests/test_ghost.py (9 changes)
  35. ml-agents/mlagents/trainers/tests/test_learn.py (133 changes)
  36. ml-agents/mlagents/trainers/tests/test_nn_policy.py (8 changes)
  37. ml-agents/mlagents/trainers/tests/test_policy.py (2 changes)
  38. ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)
  39. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)
  40. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)
  41. ml-agents/mlagents/trainers/tests/test_sac.py (14 changes)
  42. ml-agents/mlagents/trainers/tests/test_simple_rl.py (6 changes)
  43. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (6 changes)
  44. ml-agents/mlagents/trainers/tests/test_trainer_util.py (101 changes)
  45. ml-agents/mlagents/trainers/trainer/trainer.py (3 changes)
  46. ml-agents/mlagents/trainers/trainer_controller.py (20 changes)
  47. ml-agents/mlagents/trainers/trainer_util.py (55 changes)
  48. ml-agents/tests/yamato/scripts/run_llapi.py (17 changes)
  49. ml-agents/tests/yamato/training_int_tests.py (4 changes)
  50. ml-agents/tests/yamato/yamato_utils.py (3 changes)
  51. ml-agents/mlagents/trainers/cli_utils.py (41 changes)
  52. config/imitation/CrawlerStatic.yaml (29 changes)
  53. config/imitation/FoodCollector.yaml (29 changes)
  54. config/imitation/Hallway.yaml (28 changes)
  55. config/imitation/PushBlock.yaml (25 changes)
  56. config/imitation/Pyramids.yaml (36 changes)
  57. config/ppo/3DBall.yaml (25 changes)
  58. config/ppo/3DBallHard.yaml (25 changes)
  59. config/ppo/3DBall_randomize.yaml (40 changes)
  60. config/ppo/Basic.yaml (25 changes)
  61. config/ppo/Bouncer.yaml (25 changes)
  62. config/ppo/CrawlerDynamic.yaml (25 changes)
  63. config/ppo/CrawlerStatic.yaml (25 changes)
  64. config/ppo/FoodCollector.yaml (25 changes)
  65. config/ppo/GridWorld.yaml (25 changes)
  66. config/ppo/Hallway.yaml (25 changes)
  67. config/ppo/PushBlock.yaml (25 changes)
  68. config/ppo/Pyramids.yaml (29 changes)
  69. config/ppo/Reacher.yaml (25 changes)
  70. config/ppo/SoccerTwos.yaml (38 changes)
  71. config/ppo/StrikersVsGoalie.yaml (62 changes)
  72. config/ppo/Tennis.yaml (31 changes)
  73. config/ppo/VisualHallway.yaml (25 changes)
  74. config/ppo/VisualPushBlock.yaml (25 changes)
  75. config/ppo/VisualPyramids.yaml (29 changes)
  76. config/ppo/Walker.yaml (25 changes)
  77. config/ppo/WallJump.yaml (50 changes)
  78. config/ppo/WallJump_curriculum.yaml (65 changes)
  79. config/ppo/WormDynamic.yaml (25 changes)
  80. config/ppo/WormStatic.yaml (25 changes)
  81. config/sac/3DBall.yaml (25 changes)
  82. config/sac/3DBallHard.yaml (25 changes)
  83. config/sac/Basic.yaml (25 changes)
  84. config/sac/Bouncer.yaml (25 changes)
  85. config/sac/CrawlerDynamic.yaml (25 changes)
  86. config/sac/CrawlerStatic.yaml (25 changes)
  87. config/sac/FoodCollector.yaml (25 changes)
  88. config/sac/GridWorld.yaml (25 changes)
  89. config/sac/Hallway.yaml (25 changes)
  90. config/sac/PushBlock.yaml (25 changes)
  91. config/sac/Pyramids.yaml (31 changes)
  92. config/sac/Reacher.yaml (25 changes)
  93. config/sac/Tennis.yaml (30 changes)
  94. config/sac/VisualHallway.yaml (26 changes)
  95. config/sac/VisualPushBlock.yaml (26 changes)

.gitignore (4 changes)


# Tensorflow Model Info
# Output Artifacts (Legacy)
# Output Artifacts
/results
# Training environments
/envs

com.unity.ml-agents/CHANGELOG.md (20 changes)


and this project adheres to
[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [1.0.0-preview] - 2020-05-06
## [Unreleased]
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Curriculum and Parameter Randomization configurations have been merged
into the main training configuration file. Note that this means training
configuration files are now environment-specific; a sketch of the merged layout follows this list. (#3791)
- Training artifacts (trained models, summaries) are now found in the `results/`
directory. (#3829)
- Unity Player logs are now written out to the results directory. (#3877)
- Run configuration YAML files are written out to the results directory at the end of the run. (#3815)
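A minimal sketch of what the merged, environment-specific configuration might look like after #3791. The `RollerBall` behavior name and the curriculum/randomization values are illustrative only, not taken from this commit; the snippet just parses the structure to show where the formerly separate files now live:

```python
import yaml

# Hypothetical merged training configuration (post-#3791 layout): trainer
# settings, curriculum, and parameter randomization live in a single file.
merged = yaml.safe_load(
    """
behaviors:
  RollerBall:
    trainer: ppo
    max_steps: 5.0e4
    curriculum:              # formerly a separate --curriculum file
      measure: progress
      thresholds: [0.5]
parameter_randomization:     # formerly a separate --sampler file
  mass:
    sampler-type: uniform
    min_value: 0.5
    max_value: 10.0
"""
)
assert "behaviors" in merged  # learn.py now requires this section
```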
### Bug Fixes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
## [1.0.0-preview] - 2020-04-30
### Major Changes

com.unity.ml-agents/Editor/BehaviorParametersEditor.cs (4 changes)


using Unity.MLAgents.Sensors;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Sensors;
using UnityEngine;
namespace Unity.MLAgents.Editor

com.unity.ml-agents/Runtime/Agent.cs (2 changes)


using System.Collections.Generic;
using System.Collections.ObjectModel;
using UnityEngine;
using Unity.Barracuda;
using Unity.Barracuda;
using UnityEngine.Serialization;
namespace Unity.MLAgents

com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs (2 changes)


using System;
using System.Collections.Generic;
using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs (4 changes)


using System.Collections.Generic;
using System;
using Unity.MLAgents.Inference.Utils;
using Unity.Barracuda;
using Unity.Barracuda;
using Unity.MLAgents.Inference.Utils;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Inference

com.unity.ml-agents/Runtime/Inference/TensorApplier.cs (2 changes)


using System.Collections.Generic;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs (2 changes)


using System.Collections.Generic;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/TensorProxy.cs (2 changes)


using System;
using System.Collections.Generic;
using Unity.MLAgents.Inference.Utils;
using Unity.MLAgents.Inference.Utils;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs (2 changes)


using System;
using System.Collections.Generic;
using Unity.MLAgents.Inference;
using Unity.MLAgents.Inference;
namespace Unity.MLAgents.Sensors
{

com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs (2 changes)


using System.Collections.Generic;
using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Tests
{

com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs (4 changes)


using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
using System.Linq;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs (4 changes)


using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
using System.Linq;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/Sensor/ObservationWriterTests.cs (2 changes)


using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/TensorUtilsTest.cs (4 changes)


using System;
using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
using NUnit.Framework;
namespace Unity.MLAgents.Tests
{

docs/Getting-Started.md (15 changes)


1. Navigate to the folder where you cloned the `ml-agents` repository. **Note**:
If you followed the default [installation](Installation.md), then you should
be able to run `mlagents-learn` from any directory.
1. Run `mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun`.
- `config/trainer_config.yaml` is the path to a default training
configuration file that we provide. It includes training configurations for
all our example environments, including 3DBall.
1. Run `mlagents-learn config/ppo/3DBall.yaml --run-id=first3DBallRun`.
- `config/ppo/3DBall.yaml` is the path to a default training
configuration file that we provide. The `config/ppo` folder includes training configuration
files for all our example environments, including 3DBall.
- `run-id` is a unique name for this training session.
1. When the message _"Start training by pressing the Play button in the Unity
Editor"_ is displayed on the screen, you can press the **Play** button in

sequence_length: 64
summary_freq: 1000
use_recurrent: False
summary_path: ./summaries/first3DBallRun
model_path: ./models/first3DBallRun/3DBallLearning
output_path: ./results/first3DBallRun/3DBallLearning
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.

run the same command again, appending the `--resume` flag:
```sh
mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun --resume
mlagents-learn config/ppo/3DBall.yaml --run-id=firstRun --resume
```
Your trained model will be at `models/<run-identifier>/<behavior_name>.nn` where
Your trained model will be at `results/<run-identifier>/<behavior_name>.nn` where
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding
to the model. This file corresponds to your model's latest checkpoint. You can
now embed this trained model into your Agents by following the steps below,

docs/Learning-Environment-Create-New.md (43 changes)


and include the following hyperparameter values:
```yml
# Removed (old flat layout):
RollerBall:
    trainer: ppo
    batch_size: 10
    beta: 5.0e-3
    buffer_size: 100
    epsilon: 0.2
    hidden_units: 128
    lambd: 0.95
    learning_rate: 3.0e-4
    learning_rate_schedule: linear
    max_steps: 5.0e4
    normalize: false
    num_epoch: 3
    num_layers: 2
    time_horizon: 64
    summary_freq: 10000
    use_recurrent: false
    reward_signals:
        extrinsic:
            strength: 1.0
            gamma: 0.99

# Added (new layout, nested under a top-level `behaviors` section):
behaviors:
    RollerBall:
        trainer: ppo
        batch_size: 10
        beta: 5.0e-3
        buffer_size: 100
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        learning_rate_schedule: linear
        max_steps: 5.0e4
        normalize: false
        num_epoch: 3
        num_layers: 2
        time_horizon: 64
        summary_freq: 10000
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
```
Since this example creates a very simple training environment with only a few

docs/Learning-Environment-Examples.md (4 changes)


does not train with the provided default training parameters.**
- Float Properties: None
- Benchmark Mean Reward: 0.7
- To speed up training, you can enable curiosity by adding the `curiosity`
reward signal in `config/trainer_config.yaml`
- To train this environment, you can enable curiosity by adding the `curiosity` reward signal
in `config/ppo/Hallway.yaml`
## Bouncer

docs/Learning-Environment-Executable.md (13 changes)


the directory where you installed the ML-Agents Toolkit, run:
```sh
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun
mlagents-learn ../config/ppo/3DBall.yaml --env=3DBall --run-id=firstRun
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run
ml-agents$ mlagents-learn config/ppo/3DBall.yaml --env=3DBall --run-id=first-run
```
▄▄▄▓▓▓▓

```
sequence_length: 64
summary_freq: 1000
use_recurrent: False
summary_path: ./summaries/first-run-0
model_path: ./models/first-run-0/Ball3DLearning
output_path: ./results/first-run-0/Ball3DLearning
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.

INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
```
You can press `Ctrl+C` to stop the training, and your trained model will be at
`models/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
You can press Ctrl+C to stop the training, and your trained model will be at
`results/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
trainer_config.yaml.) You can now embed this trained model into your Agent by
your config YAML.) You can now embed this trained model into your Agent by
following the steps below:
1. Move your model file into

docs/Migrating.md (19 changes)


## Migrating from Release 1 to latest
### Important changes
- Training artifacts (trained models, summaries) are now found under `results/`
instead of `summaries/` and `models/`.
- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
a separate trainer configuration file, under a `behaviors` section. You can move the `default` section too
if it's being used. This file should be specific to your environment, and not contain configurations for
multiple environments (unless they have the same Behavior Names).
- If your training uses [curriculum](Training-Curriculum-Learning.md), move those configurations under
the `Behavior Name` section.
- If your training uses [parameter randomization](Training-Environment-Parameter-Randomization.md), move
the contents of the sampler config to `parameter_randomization` in the main trainer configuration; a sketch of this merge follows this list.
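As a rough illustration of the mechanical part of this migration (the file names are placeholders, and the old curriculum file is assumed to map behavior names to lesson configs, matching `assemble_curriculum_config` later in this diff):

```python
import yaml

def merge_legacy_configs(trainer_cfg_path, curriculum_path=None,
                         sampler_path=None, out_path="MyEnvironment.yaml"):
    """Illustrative only: fold the old three-file layout into the new
    single per-environment file with a top-level `behaviors` section."""
    with open(trainer_cfg_path) as f:
        behaviors = yaml.safe_load(f)      # old Behavior Name sections
    merged = {"behaviors": behaviors}
    if curriculum_path:
        with open(curriculum_path) as f:   # {behavior_name: curriculum}
            for name, curr in yaml.safe_load(f).items():
                merged["behaviors"].setdefault(name, {})["curriculum"] = curr
    if sampler_path:
        with open(sampler_path) as f:      # sampler config moves wholesale
            merged["parameter_randomization"] = yaml.safe_load(f)
    with open(out_path, "w") as f:
        yaml.dump(merged, f)
```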
## Migrating from 0.15 to Release 1

longer takes a file name as input but a fully constructed `UnityEnvironment`
instead.
- Update uses of "camelCase" fields and properties to "PascalCase".
- If you have a custom `ISensor` implementation, you will need to change the
signature of its `Write()` method to use `ObservationWriter` instead of
`WriteAdapter`.
## Migrating from 0.14 to 0.15

- Multiply `max_steps` and `summary_freq` in your `trainer_config.yaml` by the
number of Agents in the scene.
- Combine curriculum configs into a single file. See
[the WallJump curricula](../config/curricula/wall_jump.yaml) for an example of
[the WallJump curricula](https://github.com/Unity-Technologies/ml-agents/blob/0.14.1/config/curricula/wall_jump.yaml) for an example of
the new curriculum config format. A tool like https://www.json2yaml.com may be
useful to help with the conversion.
- If you have a model trained which uses RayPerceptionSensor and has non-1.0

- It is now required to specify the path to the yaml trainer configuration file
when running `mlagents-learn`. For an example trainer configuration file, see
[trainer_config.yaml](../config/trainer_config.yaml). An example of passing a
[trainer_config.yaml](https://github.com/Unity-Technologies/ml-agents/blob/0.5.0a/config/trainer_config.yaml). An example of passing a
trainer configuration to `mlagents-learn` is shown above.
- The environment name is now passed through the `--env` option.
- Curriculum learning has been changed. In summary:

docs/Using-Tensorboard.md (2 changes)


1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=summaries --port=6006`
1. From the command line run: `tensorboard --logdir=results --port=6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).

gym-unity/README.md (3 changes)


We provide results from our PPO implementation and the DQN from Baselines as
reference. Note that all runs used the same greyscale GridWorld as Dopamine. For
PPO, `num_layers` was set to 2, and all other hyperparameters are the default
for GridWorld in `trainer_config.yaml`. For Baselines DQN, the provided
for GridWorld in `config/ppo/GridWorld.yaml`. For Baselines DQN, the provided
![Dopamine on GridWorld](images/dopamine_gridworld_plot.png)

gym-unity/gym_unity/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents-envs/mlagents_envs/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents-envs/mlagents_envs/environment.py (30 changes)


seed: int = 0,
no_graphics: bool = False,
timeout_wait: int = 60,
args: Optional[List[str]] = None,
additional_args: Optional[List[str]] = None,
log_folder: Optional[str] = None,
):
"""
Starts a new unity environment and establishes a connection with the environment.

:int timeout_wait: Time (in seconds) to wait for connection from environment.
:list args: Additional Unity command line arguments
:list side_channels: Additional side channels for non-RL communication with Unity
:str log_folder: Optional folder to write the Unity Player log file into. Requires absolute path.
args = args or []
self.additional_args = additional_args or []
self.no_graphics = no_graphics
# If base port is not specified, use BASE_ENVIRONMENT_PORT if we have
# an environment, otherwise DEFAULT_EDITOR_PORT
if base_port is None:

)
)
self.side_channels[_sc.channel_id] = _sc
self.log_folder = log_folder
# If the environment name is None, a new environment will not be launched
# and the communicator will directly try to connect to an existing unity environment.

"the worker-id must be 0 in order to connect with the Editor."
)
if file_name is not None:
self.executable_launcher(file_name, no_graphics, args)
self.executable_launcher(file_name, no_graphics, additional_args)
else:
logger.info(
f"Listening on port {self.port}. "

launch_string = candidates[0]
return launch_string
def executable_args(self) -> List[str]:
args: List[str] = []
if self.no_graphics:
args += ["-nographics", "-batchmode"]
args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
if self.log_folder:
log_file_path = os.path.join(
self.log_folder, f"Player-{self.worker_id}.log"
)
args += ["-logFile", log_file_path]
# Add in arguments passed explicitly by the user.
args += self.additional_args
return args
def executable_launcher(self, file_name, no_graphics, args):
launch_string = self.validate_environment_path(file_name)
if launch_string is None:

else:
logger.debug("This is the launch string {}".format(launch_string))
# Launch Unity environment
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
subprocess_args += args
subprocess_args = [launch_string] + self.executable_args()
try:
self.proc1 = subprocess.Popen(
subprocess_args,
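Putting the renamed `additional_args` and the new `log_folder` parameter together, a usage sketch (the executable name and log directory are placeholders; the docstring above notes that `log_folder` must be an absolute path):

```python
import os
from mlagents_envs.environment import UnityEnvironment

os.makedirs("logs", exist_ok=True)
env = UnityEnvironment(
    file_name="3DBall",                   # placeholder executable name
    worker_id=0,
    no_graphics=True,                     # adds -nographics -batchmode
    log_folder=os.path.abspath("logs"),   # Player-0.log is written here
    # extra engine flags go in additional_args=[...] (renamed from `args`)
)
try:
    env.reset()
finally:
    env.close()
```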

ml-agents-envs/mlagents_envs/tests/test_envs.py (12 changes)


@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_log_file_path_is_set(mock_communicator, mock_launcher):
mock_communicator.return_value = MockCommunicator()
env = UnityEnvironment(
file_name="myfile", worker_id=0, log_folder="./some-log-folder-path"
)
args = env.executable_args()
log_file_index = args.index("-logFile")
assert args[log_file_index + 1] == "./some-log-folder-path/Player-0.log"
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_reset(mock_communicator, mock_launcher):
mock_communicator.return_value = MockCommunicator(
discrete_action=False, visual_inputs=0

ml-agents/mlagents/trainers/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents/mlagents/trainers/learn.py (169 changes)


# # Unity ML-Agents Toolkit
import argparse
import yaml
import os
import numpy as np

load_config,
TrainerFactory,
handle_existing_directories,
assemble_curriculum_config,
)
from mlagents.trainers.stats import (
TensorboardWriter,

ConsoleWriter,
)
from mlagents.trainers.cli_utils import (
StoreConfigFile,
DetectDefault,
DetectDefaultStoreTrue,
)
from mlagents.trainers.exception import SamplerException
from mlagents.trainers.exception import SamplerException, TrainerConfigError
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel

argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
argparser.add_argument("trainer_config_path")
argparser.add_argument("trainer_config_path", action=StoreConfigFile)
)
argparser.add_argument(
"--curriculum",
default=None,
dest="curriculum_config_path",
help="YAML file for defining the lessons for curriculum training",
action=DetectDefault,
)
argparser.add_argument(
"--lesson",

)
argparser.add_argument(
"--sampler",
default=None,
dest="sampler_file_path",
help="YAML file for defining the sampler for environment parameter randomization",
action=DetectDefault,
)
argparser.add_argument(
"--keep-checkpoints",

"number of steps specified by the save-freq option. Once the maximum number of checkpoints"
"has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
"If set, the training code loads an already trained model to initialize the neural network "
"before resuming training. This option is only valid when the models exist, and have the same "

"--force",
default=False,
dest="force",
action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
"this flag, attempting to train a model with a run-id that has been used before will throw "
"an error.",

"as the saved model itself. If you use TensorBoard to view the training statistics, "
"always set a unique run-id for each training run. (The statistics for all runs with the "
"same id are combined as if they were produced by a the same session.)",
action=DetectDefault,
)
argparser.add_argument(
"--initialize-from",

"This can be used, for instance, to fine-tune an existing model on a new environment. "
"Note that the previously saved models must have the same behavior parameters as your "
"current environment.",
action=DetectDefault,
)
argparser.add_argument(
"--save-freq",

action=DetectDefault,
)
argparser.add_argument(
"--seed",

action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS,
)
argparser.add_argument(

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
"a model trained with an existing run ID.",
)

"will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
"each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
"than an executable, the base port will be ignored.",
action=DetectDefault,
)
argparser.add_argument(
"--num-envs",

"from when training",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
"the graphics driver. Use this only if your agents don't use visual observations.",
)

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to enable debug-level logging for some parts of the code",
)
argparser.add_argument(

"process these as Unity Command Line Arguments. You should choose different argument names if "
"you want to create environment-specific arguments. All arguments after this flag will be "
"passed to the executable.",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)

type=int,
help="The width of the executable window of the environment(s) in pixels "
"(ignored for editor training).",
action=DetectDefault,
)
eng_conf.add_argument(
"--height",

"(ignored for editor training)",
action=DetectDefault,
)
eng_conf.add_argument(
"--quality-level",

"QualitySettings.SetQualityLevel in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--time-scale",

"Time.timeScale in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--target-frame-rate",

"Application.targetFrameRate in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--capture-frame-rate",

"Time.captureFramerate in Unity.",
action=DetectDefault,
)
return argparser

class RunOptions(NamedTuple):
trainer_config: Dict
behaviors: Dict
debug: bool = parser.get_default("debug")
seed: int = parser.get_default("seed")
env_path: Optional[str] = parser.get_default("env_path")

lesson: int = parser.get_default("lesson")
no_graphics: bool = parser.get_default("no_graphics")
multi_gpu: bool = parser.get_default("multi_gpu")
sampler_config: Optional[Dict] = None
parameter_randomization: Optional[Dict] = None
env_args: Optional[List[str]] = parser.get_default("env_args")
cpu: bool = parser.get_default("cpu")
width: int = parser.get_default("width")

configs loaded from files.
"""
argparse_args = vars(args)
trainer_config_path = argparse_args["trainer_config_path"]
curriculum_config_path = argparse_args["curriculum_config_path"]
argparse_args["trainer_config"] = load_config(trainer_config_path)
if curriculum_config_path is not None:
argparse_args["curriculum_config"] = load_config(curriculum_config_path)
if argparse_args["sampler_file_path"] is not None:
argparse_args["sampler_config"] = load_config(
argparse_args["sampler_file_path"]
run_options_dict = {}
run_options_dict.update(argparse_args)
config_path = StoreConfigFile.trainer_config_path
# Load YAML
yaml_config = load_config(config_path)
# This is the only option that is not optional and has no defaults.
if "behaviors" not in yaml_config:
raise TrainerConfigError(
"Trainer configurations not found. Make sure your YAML file has a section for behaviors."
# Use the YAML file values for all values not specified in the CLI.
for key, val in yaml_config.items():
# Detect bad config options
if not hasattr(RunOptions, key):
raise TrainerConfigError(
"The option {} was specified in your YAML file, but is invalid.".format(
key
)
)
if key not in DetectDefault.non_default_args:
run_options_dict[key] = val
argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]
# Since argparse accepts file paths in the config options which don't exist in CommandLineOptions,
# these keys will need to be deleted to use the **/splat operator below.
argparse_args.pop("sampler_file_path")
argparse_args.pop("curriculum_config_path")
argparse_args.pop("trainer_config_path")
return RunOptions(**vars(args))
run_options_dict["resume"] = (
run_options_dict["resume"] or run_options_dict["load_model"]
)
return RunOptions(**run_options_dict)
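The net effect of the block above: explicit CLI flags win over YAML values, which in turn win over argparse defaults. A simplified restatement of that precedence rule (a sketch, not the actual implementation):

```python
def merge_options(argparse_values: dict, yaml_values: dict,
                  cli_specified: set) -> dict:
    """Sketch: YAML fills in every option the user did not pass explicitly
    on the command line (cli_specified mirrors DetectDefault.non_default_args)."""
    merged = dict(argparse_values)
    for key, val in yaml_values.items():
        if key not in cli_specified:
            merged[key] = val
    return merged

# YAML sets keep_checkpoints: 34, but an explicit --keep-checkpoints=42 wins:
opts = merge_options({"keep_checkpoints": 42}, {"keep_checkpoints": 34},
                     cli_specified={"keep_checkpoints"})
assert opts["keep_checkpoints"] == 42
```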
def get_version_string() -> str:

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
model_path = f"./models/{options.run_id}"
base_path = "results"
write_path = os.path.join(base_path, options.run_id)
f"./models/{options.initialize_from}" if options.initialize_from else None
os.path.join(base_path, options.run_id) if options.initialize_from else None
summaries_dir = "./summaries"
run_logs_dir = os.path.join(write_path, "run_logs")
# Check if directory exists
handle_existing_directories(
write_path, options.resume, options.force, maybe_init_path
)
# Make run logs directory
os.makedirs(run_logs_dir, exist_ok=True)
summaries_dir,
write_path,
handle_existing_directories(
model_path, summaries_dir, options.resume, options.force, maybe_init_path
)
tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
tb_writer = TensorboardWriter(write_path, clear_past_data=not options.resume)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
port = UnityEnvironment.DEFAULT_EDITOR_PORT
env_factory = create_environment_factory(
options.env_path, options.no_graphics, run_seed, port, options.env_args
options.env_path,
options.no_graphics,
run_seed,
port,
options.env_args,
os.path.abspath(run_logs_dir), # Unity environment requires absolute path
)
engine_config = EngineConfig(
width=options.width,

capture_frame_rate=options.capture_frame_rate,
)
env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
curriculum_config = assemble_curriculum_config(options.behaviors)
options.curriculum_config, env_manager, options.lesson
curriculum_config, env_manager, options.lesson
options.sampler_config, run_seed
options.parameter_randomization, run_seed
options.trainer_config,
summaries_dir,
options.behaviors,
model_path,
write_path,
options.keep_checkpoints,
not options.inference,
options.resume,

# Create controller and begin training.
tc = TrainerController(
trainer_factory,
model_path,
summaries_dir,
write_path,
options.run_id,
options.save_freq,
maybe_meta_curriculum,

tc.start_learning(env_manager)
finally:
env_manager.close()
write_timing_tree(summaries_dir, options.run_id)
write_run_options(write_path, options)
write_timing_tree(run_logs_dir)
def write_timing_tree(summaries_dir: str, run_id: str) -> None:
timing_path = f"{summaries_dir}/{run_id}_timers.json"
def write_run_options(output_dir: str, run_options: RunOptions) -> None:
run_options_path = os.path.join(output_dir, "configuration.yaml")
try:
with open(run_options_path, "w") as f:
try:
yaml.dump(dict(run_options._asdict()), f, sort_keys=False)
except TypeError: # Older versions of pyyaml don't support sort_keys
yaml.dump(dict(run_options._asdict()), f)
except FileNotFoundError:
logger.warning(
f"Unable to save configuration to {run_options_path}. Make sure the directory exists"
)
def write_timing_tree(output_dir: str) -> None:
timing_path = os.path.join(output_dir, "timers.json")
try:
with open(timing_path, "w") as f:
json.dump(get_timer_tree(), f, indent=4)
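Based on the paths constructed in this file, a run's artifacts now land in a single tree, roughly as follows for `--run-id=first3DBallRun` (the behavior name is illustrative):

```python
# Approximate on-disk layout produced by the code above:
#
# results/
#   first3DBallRun/
#     configuration.yaml   # written by write_run_options(write_path, options)
#     3DBallLearning.nn    # exported model; name comes from the behavior
#     run_logs/
#       timers.json        # written by write_timing_tree(run_logs_dir)
#       Player-0.log       # Unity Player log (run_logs_dir is the log_folder)
```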

def try_create_meta_curriculum(
curriculum_config: Optional[Dict], env: SubprocessEnvManager, lesson: int
) -> Optional[MetaCurriculum]:
if curriculum_config is None:
if curriculum_config is None or len(curriculum_config) <= 0:
return None
else:
meta_curriculum = MetaCurriculum(curriculum_config)

seed: int,
start_port: int,
env_args: Optional[List[str]],
log_folder: str,
) -> Callable[[int, List[SideChannel]], BaseEnv]:
if env_path is not None:
launch_string = UnityEnvironment.validate_environment_path(env_path)

seed=env_seed,
no_graphics=no_graphics,
base_port=start_port,
args=env_args,
additional_args=env_args,
log_folder=log_folder,
)
return create_unity_environment

ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)


from typing import Any, Dict, List, Optional
import abc
import os
import numpy as np
from mlagents.tf_utils import tf
from mlagents import tf_utils

self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.model_path = trainer_parameters["model_path"]
self.model_path = trainer_parameters["output_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()

:return:
"""
with self.graph.as_default():
last_checkpoint = self.model_path + "/model-" + str(steps) + ".ckpt"
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False

ml-agents/mlagents/trainers/ppo/trainer.py (3 changes)


"sequence_length",
"summary_freq",
"use_recurrent",
"summary_path",
"model_path",
"output_path",
"reward_signals",
]
self._check_param_keys()

ml-agents/mlagents/trainers/sac/trainer.py (7 changes)


"summary_freq",
"tau",
"use_recurrent",
"summary_path",
"model_path",
"output_path",
"reward_signals",
]

Save the training buffer's update buffer to a pickle file.
"""
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
)
logger.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:

Loads the last saved replay buffer from a file.
"""
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
)
logger.info("Loading Experience Replay Buffer from {}".format(filename))
with open(filename, "rb+") as file_object:

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (6 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
tf.reset_default_graph()
dummy_config["summary_path"] = str(tmpdir)
dummy_config["model_path"] = os.path.join(tmpdir, "test")
dummy_config["output_path"] = os.path.join(tmpdir, "test")
policy = create_policy_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

ml-agents/mlagents/trainers/tests/test_bcmodule.py (2 changes)


def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
# model_path = env.external_brain_names[0]
trainer_config["model_path"] = "testpath"
trainer_config["output_path"] = "testpath"
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["behavioral_cloning"]["demo_path"] = (

ml-agents/mlagents/trainers/tests/test_ghost.py (9 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

ml-agents/mlagents/trainers/tests/test_learn.py (133 changes)


import pytest
import yaml
from mlagents.trainers.learn import parse_command_line
from mlagents.trainers.learn import parse_command_line, DetectDefault
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.stats import StatsReporter

return parse_command_line(args)
MOCK_YAML = """
behaviors:
{}
"""
MOCK_PARAMETER_YAML = """
behaviors:
{}
env_path: "./oldenvfile"
keep_checkpoints: 34
lesson: 2
run_id: uselessrun
save_freq: 654321
seed: 9870
base_port: 4001
num_envs: 4
debug: false
"""
MOCK_SAMPLER_CURRICULUM_YAML = """
behaviors:
behavior1:
curriculum:
curriculum1
behavior2:
curriculum:
curriculum2
parameter_randomization:
sampler1
"""
@patch("mlagents.trainers.learn.write_timing_tree")
@patch("mlagents.trainers.learn.write_run_options")
@patch("mlagents.trainers.learn.handle_existing_directories")
@patch("mlagents.trainers.learn.TrainerFactory")
@patch("mlagents.trainers.learn.SamplerManager")

sampler_manager_mock,
trainer_factory_mock,
handle_dir_mock,
write_run_options_mock,
write_timing_tree_mock,
trainer_config_mock = MagicMock()
load_config.return_value = trainer_config_mock
load_config.return_value = yaml.safe_load(MOCK_YAML)
learn.run_training(0, basic_options())
options = basic_options()
learn.run_training(0, options)
"./models/ppo",
"./summaries",
"results/ppo",
"ppo",
50000,
None,

None,
)
handle_dir_mock.assert_called_once_with(
"./models/ppo", "./summaries", False, False, None
)
handle_dir_mock.assert_called_once_with("results/ppo", False, False, None)
write_timing_tree_mock.assert_called_once_with("results/ppo/run_logs")
write_run_options_mock.assert_called_once_with("results/ppo", options)
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py

seed=None,
start_port=8000,
env_args=None,
log_folder="results/log_folder",
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
assert opt.trainer_config == {}
assert opt.behaviors == {}
assert opt.curriculum_config is None
assert opt.sampler_config is None
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.resume is False

full_args = [
"mytrainerpath",
"--env=./myenvfile",
"--curriculum=./mycurriculum",
"--sampler=./mysample",
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",
"--train",
"--base-port=4004",
"--num-envs=2",
"--no-graphics",
"--debug",
]
opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.debug is True
assert opt.inference is True
assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_PARAMETER_YAML)
def test_yaml_args(mock_file):
# Test with opts loaded from YAML
DetectDefault.non_default_args.clear()
opt = parse_command_line(["mytrainerpath"])
assert opt.behaviors == {}
assert opt.env_path == "./oldenvfile"
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 34
assert opt.lesson == 2
assert opt.run_id == "uselessrun"
assert opt.save_freq == 654321
assert opt.seed == 9870
assert opt.base_port == 4001
assert opt.num_envs == 4
assert opt.no_graphics is False
assert opt.debug is False
assert opt.env_args is None
# Test that CLI overrides YAML
full_args = [
"mytrainerpath",
"--env=./myenvfile",
"--keep-checkpoints=42",
"--lesson=3",
"--resume",

]
opt = parse_command_line(full_args)
assert opt.trainer_config == {}
assert opt.behaviors == {}
assert opt.curriculum_config == {}
assert opt.sampler_config == {}
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"

assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_SAMPLER_CURRICULUM_YAML)
def test_sampler_configs(mock_file):
opt = parse_command_line(["mytrainerpath"])
assert opt.parameter_randomization == "sampler1"
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
def test_env_args(mock_file):
full_args = [
"mytrainerpath",

ml-agents/mlagents/trainers/tests/test_nn_policy.py (8 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

path1 = os.path.join(tmp_path, "runid1")
path2 = os.path.join(tmp_path, "runid2")
trainer_params = dummy_config
trainer_params["model_path"] = path1
trainer_params["output_path"] = path1
policy = create_policy_mock(trainer_params)
policy.initialize_or_load()
policy.save_model(2000)

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
time_horizon = 6
trajectory = make_fake_trajectory(

ml-agents/mlagents/trainers/tests/test_policy.py (2 changes)


def basic_params():
return {"use_recurrent": False, "model_path": "my/path"}
return {"use_recurrent": False, "output_path": "my/path"}
class FakePolicy(TFPolicy):

ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)


memory_size: 10
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)


)
trainer_parameters = trainer_config
model_path = "testpath"
trainer_parameters["model_path"] = model_path
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)


def dummy_config():
return yaml.safe_load(
"""
summary_path: "test/"
output_path: "test/"
summary_freq: 1000
max_steps: 100
reward_signals:

ml-agents/mlagents/trainers/tests/test_sac.py (14 changes)


trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["model_path"] = model_path
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_params = dummy_config
trainer_params["summary_path"] = str(tmpdir)
trainer_params["model_path"] = str(tmpdir)
trainer_params["output_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)

mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
dummy_config["steps_per_update"] = 20
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)

dummy_config["sequence_length"] = 64
dummy_config["batch_size"] = 32
dummy_config["use_recurrent"] = True
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
with pytest.raises(UnityTrainerException):
_ = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")

ml-agents/mlagents/trainers/tests/test_simple_rl.py (6 changes)


env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
summaries_dir=dir,
model_path=dir,
output_path=dir,
keep_checkpoints=1,
train_model=True,
load_model=False,

tc = TrainerController(
trainer_factory=trainer_factory,
summaries_dir=dir,
model_path=dir,
output_path=dir,
run_id=run_id,
meta_curriculum=meta_curriculum,
train=True,

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (6 changes)


trainer_factory_mock.ghost_controller = GhostController()
return TrainerController(
trainer_factory=trainer_factory_mock,
model_path="test_model_path",
summaries_dir="test_summaries_dir",
output_path="test_model_path",
run_id="test_run_id",
save_freq=100,
meta_curriculum=None,

trainer_factory_mock.ghost_controller = GhostController()
TrainerController(
trainer_factory=trainer_factory_mock,
model_path="",
summaries_dir="",
output_path="",
run_id="1",
save_freq=1,
meta_curriculum=None,

ml-agents/mlagents/trainers/tests/test_trainer_util.py (101 changes)


from unittest.mock import patch
from mlagents.trainers import trainer_util
from mlagents.trainers.trainer_util import load_config, _load_config
from mlagents.trainers.trainer_util import (
load_config,
_load_config,
assemble_curriculum_config,
)
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException
from mlagents.trainers.brain import BrainParameters

def test_initialize_trainer_parameters_override_defaults(
BrainParametersMock, dummy_config_with_override
):
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "model_dir"
keep_checkpoints = 1
train_model = True
load_model = False

base_config = dummy_config_with_override
expected_config = base_config["default"]
expected_config["summary_path"] = f"{run_id}_testbrain"
expected_config["model_path"] = model_path + "/testbrain"
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
# Override value from specific brain config

with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

brain_params_mock = BrainParametersMock()
BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": BrainParametersMock()}
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "results_dir"
keep_checkpoints = 1
train_model = True
load_model = False

base_config = dummy_config
expected_config = base_config["default"]
expected_config["summary_path"] = f"{run_id}_testbrain"
expected_config["model_path"] = model_path + "/testbrain"
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
def mock_constructor(

with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

def test_initialize_invalid_trainer_raises_exception(
BrainParametersMock, dummy_bad_config
):
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "results_dir"
keep_checkpoints = 1
train_model = True
load_model = False

with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

with pytest.raises(UnityTrainerException):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

trainer_factory = trainer_util.TrainerFactory(
trainer_config=no_default_config,
summaries_dir="test_dir",
model_path="model_dir",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,

trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir="test_dir",
model_path="model_dir",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,

_load_config(fp)
def test_assemble_curriculum_config():
file_contents = """
behavior1:
curriculum:
foo: 5
behavior2:
curriculum:
foo: 6
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {"behavior1": {"foo": 5}, "behavior2": {"foo": 6}}
# Check that nothing is returned if no curriculum.
file_contents = """
behavior1:
foo: 3
behavior2:
foo: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
# Check that method doesn't break if 1st level entity isn't a dict.
# Note: this is a malformed configuration.
file_contents = """
behavior1: 3
behavior2: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
model_path = os.path.join(tmp_path, "runid")
# Unused summary path
summary_path = os.path.join(tmp_path, "runid")
output_path = os.path.join(tmp_path, "runid")
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
trainer_util.handle_existing_directories(output_path, False, False)
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
trainer_util.handle_existing_directories(output_path, True, False)
os.mkdir(model_path)
os.mkdir(output_path)
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
trainer_util.handle_existing_directories(output_path, False, False)
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
trainer_util.handle_existing_directories(output_path, True, False)
trainer_util.handle_existing_directories(model_path, summary_path, False, True)
trainer_util.handle_existing_directories(output_path, False, True)
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)
trainer_util.handle_existing_directories(output_path, False, True, init_path)
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)
trainer_util.handle_existing_directories(output_path, False, True, init_path)

ml-agents/mlagents/trainers/trainer/trainer.py (3 changes)


self.brain_name = brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters["summary_path"]
self._stats_reporter = StatsReporter(self.summary_path)
self._stats_reporter = StatsReporter(brain_name)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[Policy]] = []

ml-agents/mlagents/trainers/trainer_controller.py (20 changes)


def __init__(
self,
trainer_factory: TrainerFactory,
model_path: str,
summaries_dir: str,
output_path: str,
run_id: str,
save_freq: int,
meta_curriculum: Optional[MetaCurriculum],

resampling_interval: Optional[int],
):
"""
:param model_path: Path to save the model.
:param output_path: Path to save the model.
:param summaries_dir: Folder to save training summaries.
:param run_id: The sub-directory name for model and summary statistics
:param save_freq: Frequency at which to save model

self.trainers: Dict[str, Trainer] = {}
self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set)
self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir
self.output_path = output_path
self.logger = get_logger(__name__)
self.run_id = run_id
self.save_freq = save_freq

self.trainers[brain_name].export_model(name_behavior_id)
@staticmethod
def _create_model_path(model_path):
def _create_output_path(output_path):
if not os.path.exists(model_path):
os.makedirs(model_path)
if not os.path.exists(output_path):
os.makedirs(output_path)
"The folder {} containing the "
f"The folder {output_path} containing the "
"permissions are set correctly.".format(model_path)
"permissions are set correctly."
)
@timed

@timed
def start_learning(self, env_manager: EnvManager) -> None:
self._create_model_path(self.model_path)
self._create_output_path(self.output_path)
tf.reset_default_graph()
global_step = 0
last_brain_behavior_ids: Set[str] = set()

ml-agents/mlagents/trainers/trainer_util.py (55 changes)


def __init__(
    self,
    trainer_config: Any,
-   summaries_dir: str,
-   model_path: str,
+   output_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,

    multi_gpu: bool = False,
):
    self.trainer_config = trainer_config
-   self.summaries_dir = summaries_dir
-   self.model_path = model_path
+   self.output_path = output_path
+   self.init_path = init_path
    self.keep_checkpoints = keep_checkpoints
    self.train_model = train_model

    return initialize_trainer(
        self.trainer_config,
        brain_name,
-       self.summaries_dir,
-       self.model_path,
+       self.output_path,
        self.keep_checkpoints,
        self.train_model,
        self.load_model,

def initialize_trainer(
    trainer_config: Any,
    brain_name: str,
-   summaries_dir: str,
-   model_path: str,
+   output_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_name: Name of the brain to be associated with trainer
-   :param summaries_dir: Directory to store trainer summary statistics
-   :param model_path: Path to save the model
+   :param output_path: Path to save the model and summary statistics
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize

    """
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
-           f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
-           "See config/trainer_config.yaml for an example."
+           f'Trainer config must have either a "default" section, or a section for the brain name {brain_name}. '
+           "See the config/ directory for examples."
-   trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
-   trainer_parameters["model_path"] = "{basedir}/{name}".format(
-       basedir=model_path, name=brain_name
-   )
+   trainer_parameters["output_path"] = os.path.join(output_path, brain_name)
-   trainer_parameters["init_path"] = "{basedir}/{name}".format(
-       basedir=init_path, name=brain_name
-   )
+   trainer_parameters["init_path"] = os.path.join(init_path, brain_name)
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name

    if init_path is not None:
-       trainer_parameters["init_path"] = "{basedir}/{name}".format(
-           basedir=init_path, name=brain_name
-       )
    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curricula:

    ) from e
def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]:
"""
Assembles a curriculum config Dict from a trainer config. The resulting
dictionary is a mapping of {brain_name: config}, where config is that brain's
curriculum configuration Dict.
:param trainer_config: Dict of trainer configurations (keys are brain_names).
:return: Dict of curriculum configurations. Returns empty dict if none are found.
"""
curriculum_config: Dict[str, Any] = {}
for behavior_name, behavior_config in trainer_config.items():
# Don't try to iterate non-Dicts. This probably means your config is malformed.
if isinstance(behavior_config, dict) and "curriculum" in behavior_config:
curriculum_config[behavior_name] = behavior_config["curriculum"]
return curriculum_config
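A worked example of what this function extracts (the behavior names are illustrative):

trainer_config = {
    "BigWallJump": {
        "trainer": "ppo",
        "curriculum": {"measure": "progress", "thresholds": [0.1, 0.3, 0.5]},
    },
    "SmallWallJump": {"trainer": "ppo"},  # no curriculum section
}
assert assemble_curriculum_config(trainer_config) == {
    "BigWallJump": {"measure": "progress", "thresholds": [0.1, 0.3, 0.5]}
}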
-   model_path: str, summary_path: str, resume: bool, force: bool, init_path: str = None
+   output_path: str, resume: bool, force: bool, init_path: str = None
) -> None:
    """
    Validates that if the run_id model exists, we do not overwrite it unless --force is specified.

    :param force: Whether or not the --force flag was passed.
    """
-   model_path_exists = os.path.isdir(model_path)
+   output_path_exists = os.path.isdir(output_path)
-   if model_path_exists:
+   if output_path_exists:
        if not resume and not force:
            raise UnityTrainerException(
                "Previous data from this run ID was found. "

17
ml-agents/tests/yamato/scripts/run_llapi.py


    file_name=env_name,
    side_channels=[engine_configuration_channel],
    no_graphics=True,
-   args=["-logFile", "-"],
+   additional_args=["-logFile", "-"],
)
try:

    """
    try:
        env1 = UnityEnvironment(
-           file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5006,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
-           file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5006,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
-           file_name=env_name, base_port=5007, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5007,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
        )
        env2.reset()
    finally:

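The rename from args to additional_args applies wherever a UnityEnvironment is constructed; a minimal usage sketch (the executable name is hypothetical):

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(
    file_name="3DBall",  # hypothetical executable name
    base_port=5006,
    no_graphics=True,
    additional_args=["-logFile", "-"],  # forwarded to the Unity player
)
try:
    env.reset()
finally:
    env.close()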
4
ml-agents/tests/yamato/training_int_tests.py


print(
    f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
-nn_file_expected = f"./models/{run_id}/3DBall.nn"
+nn_file_expected = f"./results/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
    # Should never happen - make sure nothing leftover from an old test.
    print("Artifacts from previous build found!")

# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.
override_config_file(
-   "config/trainer_config.yaml",
+   "config/ppo/3DBall.yaml",
    "override.yaml",
    max_steps=100,
    batch_size=10,
3
ml-agents/tests/yamato/yamato_utils.py


"""
with open(src_path) as f:
configs = yaml.safe_load(f)
behavior_configs = configs["behaviors"]
for config in configs.values():
for config in behavior_configs.values():
config.update(**kwargs)
with open(dest_path, "w") as f:

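Completed, override_config_file plausibly reads as follows; the final yaml.dump call is an assumption about the unchanged tail of the function:

import yaml

def override_config_file(src_path, dest_path, **kwargs):
    # Configs now nest per-behavior settings under a top-level "behaviors"
    # key, so the overrides (e.g. max_steps=100) go into each behavior section.
    with open(src_path) as f:
        configs = yaml.safe_load(f)
    behavior_configs = configs["behaviors"]
    for config in behavior_configs.values():
        config.update(**kwargs)
    with open(dest_path, "w") as f:
        yaml.dump(configs, f)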
41
ml-agents/mlagents/trainers/cli_utils.py


from typing import Set
import argparse


class DetectDefault(argparse.Action):
    """
    Internal custom Action to help detect arguments that aren't default.
    """

    non_default_args: Set[str] = set()

    def __call__(self, arg_parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values)
        DetectDefault.non_default_args.add(self.dest)


class DetectDefaultStoreTrue(DetectDefault):
    """
    Internal class to help detect arguments that aren't default.
    Used for store_true arguments.
    """

    def __init__(self, nargs=0, **kwargs):
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, arg_parser, namespace, values, option_string=None):
        super().__call__(arg_parser, namespace, True, option_string)


class StoreConfigFile(argparse.Action):
    """
    Custom Action to store the config file location not as part of the CLI args.
    This is because we want to maintain an equivalence between the config file's
    contents and the args themselves.
    """

    trainer_config_path: str

    def __call__(self, arg_parser, namespace, values, option_string=None):
        delattr(namespace, self.dest)
        StoreConfigFile.trainer_config_path = values

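A minimal sketch of wiring these Actions into a parser (the option names below are illustrative, not the full ML-Agents CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("trainer_config_path", action=StoreConfigFile, nargs="?", default=None)
parser.add_argument("--keep-checkpoints", default=5, type=int, action=DetectDefault)
parser.add_argument("--force", default=False, action=DetectDefaultStoreTrue)

args = parser.parse_args(["config/ppo/3DBall.yaml", "--force"])
print(StoreConfigFile.trainer_config_path)   # config/ppo/3DBall.yaml
print(DetectDefault.non_default_args)        # {'force'}
print(hasattr(args, "trainer_config_path"))  # False - removed from the namespace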
29
config/imitation/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1e7
memory_size: 256
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
reward_signals:
gail:
strength: 1.0
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
strength: 0.5
steps: 50000
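Each of these files is passed directly to the trainer CLI, for example (run from the repo root, assuming the referenced .demo file exists):

mlagents-learn config/imitation/CrawlerStatic.yaml --run-id=CrawlerStaticIL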

29
config/imitation/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 64
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 2.0e6
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
use_recurrent: false
reward_signals:
gail:
strength: 0.1
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
strength: 1.0
steps: 0

28
config/imitation/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
gail:
strength: 0.1
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo

25
config/imitation/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.5e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
reward_signals:
gail:
strength: 1.0
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo

36
config/imitation/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256
gail:
strength: 0.01
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 0.5
steps: 150000

25
config/ppo/3DBall.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: ppo
batch_size: 1200
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

40
config/ppo/3DBall_randomize.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
parameter_randomization:
resampling-interval: 500
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
gravity:
sampler-type: "uniform"
min_value: 7
max_value: 12
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
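With these settings, a new value for each parameter is drawn every resampling-interval steps; a hedged sketch of what the uniform samplers produce:

import random

def resample_parameters():
    # Ranges mirror the YAML above; real sampling is driven by the trainer.
    return {
        "mass": random.uniform(0.5, 10),
        "gravity": random.uniform(7, 12),
        "scale": random.uniform(0.75, 3),
    }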

25
config/ppo/Basic.yaml


behaviors:
Basic:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 20
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 3
sequence_length: 64
summary_freq: 2000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/ppo/Bouncer.yaml


behaviors:
Bouncer:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 64
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 4.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/GridWorld.yaml


behaviors:
GridWorld:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 500000
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/ppo/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

29
config/ppo/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256

25
config/ppo/Reacher.yaml


behaviors:
Reacher:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

38
config/ppo/SoccerTwos.yaml


behaviors:
SoccerTwos:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 200000
curriculum:
measure: progress
thresholds: [0.05, 0.1]
min_lesson_length: 100
signal_smoothing: true
parameters:
ball_touch: [1.0, 0.5, 0.0]
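Reading the curriculum block: with measure: progress and thresholds [0.05, 0.1], ball_touch starts at 1.0, drops to 0.5 once roughly 5% of max_steps has elapsed, and reaches 0.0 after 10%. A simplified sketch of the lesson lookup (ignoring min_lesson_length and signal smoothing):

def current_value(progress, thresholds, values):
    # Lesson i stays active until training progress passes thresholds[i].
    lesson = sum(1 for t in thresholds if progress > t)
    return values[lesson]

assert current_value(0.03, [0.05, 0.1], [1.0, 0.5, 0.0]) == 1.0
assert current_value(0.20, [0.05, 0.1], [1.0, 0.5, 0.0]) == 0.0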

62
config/ppo/StrikersVsGoalie.yaml


behaviors:
Goalie:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 25000
team_change: 200000
Striker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 100000
team_change: 200000

31
config/ppo/Tennis.yaml


behaviors:
Tennis:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 100000

25
config/ppo/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

29
config/ppo/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 2024
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.01
gamma: 0.99
encoding_size: 256

25
config/ppo/Walker.yaml


behaviors:
Walker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

50
config/ppo/WallJump.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

65
config/ppo/WallJump_curriculum.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]

25
config/ppo/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/WormStatic.yaml


behaviors:
WormStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/3DBall.yaml


behaviors:
3DBall:
trainer: sac
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: sac
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/Basic.yaml


behaviors:
Basic:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 20
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 10
sequence_length: 64
summary_freq: 2000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/Bouncer.yaml


behaviors:
Bouncer:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e6
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/GridWorld.yaml


behaviors:
GridWorld:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
hidden_units: 128
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 500000
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/sac/Hallway.yaml


behaviors:
Hallway:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/PushBlock.yaml


behaviors:
PushBlock:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 100000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

31
config/sac/Pyramids.yaml


behaviors:
Pyramids:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 16
summary_freq: 30000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 2.0
gamma: 0.99
gail:
strength: 0.02
gamma: 0.99
encoding_size: 128
use_actions: true
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo

25
config/sac/Reacher.yaml


behaviors:
Reacher:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

30
config/sac/Tennis.yaml


behaviors:
Tennis:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_current_self_ratio: 0.5
save_steps: 50000
swap_steps: 50000

26
config/sac/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

26
config/sac/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: sac
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

Some files were not shown because too many files changed in this diff.
