浏览代码

[refactor] Structure configuration files into classes (#3936)

/test-sampler
GitHub 5 年前
当前提交
e92b4f88
共有 98 个文件被更改,包括 3088 次插入和 3176 次删除
  1. 2
      com.unity.ml-agents/CHANGELOG.md
  2. 46
      config/imitation/CrawlerStatic.yaml
  3. 46
      config/imitation/FoodCollector.yaml
  4. 48
      config/imitation/Hallway.yaml
  5. 43
      config/imitation/PushBlock.yaml
  6. 30
      config/imitation/Pyramids.yaml
  7. 42
      config/ppo/3DBall.yaml
  8. 42
      config/ppo/3DBallHard.yaml
  9. 76
      config/ppo/3DBall_randomize.yaml
  10. 42
      config/ppo/Basic.yaml
  11. 42
      config/ppo/Bouncer.yaml
  12. 42
      config/ppo/CrawlerDynamic.yaml
  13. 42
      config/ppo/CrawlerStatic.yaml
  14. 42
      config/ppo/FoodCollector.yaml
  15. 42
      config/ppo/GridWorld.yaml
  16. 45
      config/ppo/Hallway.yaml
  17. 42
      config/ppo/PushBlock.yaml
  18. 45
      config/ppo/Pyramids.yaml
  19. 42
      config/ppo/Reacher.yaml
  20. 56
      config/ppo/SoccerTwos.yaml
  21. 99
      config/ppo/StrikersVsGoalie.yaml
  22. 49
      config/ppo/Tennis.yaml
  23. 45
      config/ppo/VisualHallway.yaml
  24. 45
      config/ppo/VisualPushBlock.yaml
  25. 45
      config/ppo/VisualPyramids.yaml
  26. 42
      config/ppo/Walker.yaml
  27. 83
      config/ppo/WallJump.yaml
  28. 115
      config/ppo/WallJump_curriculum.yaml
  29. 42
      config/ppo/WormDynamic.yaml
  30. 42
      config/ppo/WormStatic.yaml
  31. 44
      config/sac/3DBall.yaml
  32. 44
      config/sac/3DBallHard.yaml
  33. 44
      config/sac/Basic.yaml
  34. 44
      config/sac/Bouncer.yaml
  35. 44
      config/sac/CrawlerDynamic.yaml
  36. 44
      config/sac/CrawlerStatic.yaml
  37. 44
      config/sac/FoodCollector.yaml
  38. 44
      config/sac/GridWorld.yaml
  39. 47
      config/sac/Hallway.yaml
  40. 44
      config/sac/PushBlock.yaml
  41. 48
      config/sac/Pyramids.yaml
  42. 44
      config/sac/Reacher.yaml
  43. 50
      config/sac/Tennis.yaml
  44. 48
      config/sac/VisualHallway.yaml
  45. 48
      config/sac/VisualPushBlock.yaml
  46. 48
      config/sac/VisualPyramids.yaml
  47. 44
      config/sac/Walker.yaml
  48. 87
      config/sac/WallJump.yaml
  49. 44
      config/sac/WormDynamic.yaml
  50. 44
      config/sac/WormStatic.yaml
  51. 13
      docs/Migrating.md
  52. 113
      docs/Training-Configuration-File.md
  53. 141
      docs/Training-ML-Agents.md
  54. 233
      ml-agents/mlagents/trainers/cli_utils.py
  55. 39
      ml-agents/mlagents/trainers/components/bc/module.py
  56. 29
      ml-agents/mlagents/trainers/components/reward_signals/__init__.py
  57. 36
      ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
  58. 12
      ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
  59. 47
      ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
  60. 22
      ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py
  61. 66
      ml-agents/mlagents/trainers/curriculum.py
  62. 22
      ml-agents/mlagents/trainers/ghost/trainer.py
  63. 377
      ml-agents/mlagents/trainers/learn.py
  64. 7
      ml-agents/mlagents/trainers/meta_curriculum.py
  65. 29
      ml-agents/mlagents/trainers/optimizer/tf_optimizer.py
  66. 11
      ml-agents/mlagents/trainers/policy/nn_policy.py
  67. 41
      ml-agents/mlagents/trainers/policy/tf_policy.py
  68. 33
      ml-agents/mlagents/trainers/ppo/optimizer.py
  69. 42
      ml-agents/mlagents/trainers/ppo/trainer.py
  70. 6
      ml-agents/mlagents/trainers/run_experiment.py
  71. 38
      ml-agents/mlagents/trainers/sac/optimizer.py
  72. 100
      ml-agents/mlagents/trainers/sac/trainer.py
  73. 39
      ml-agents/mlagents/trainers/tests/test_barracuda_converter.py
  74. 100
      ml-agents/mlagents/trainers/tests/test_bcmodule.py
  75. 94
      ml-agents/mlagents/trainers/tests/test_curriculum.py
  76. 36
      ml-agents/mlagents/trainers/tests/test_distributions.py
  77. 40
      ml-agents/mlagents/trainers/tests/test_ghost.py
  78. 133
      ml-agents/mlagents/trainers/tests/test_learn.py
  79. 55
      ml-agents/mlagents/trainers/tests/test_meta_curriculum.py
  80. 72
      ml-agents/mlagents/trainers/tests/test_nn_policy.py
  81. 11
      ml-agents/mlagents/trainers/tests/test_policy.py
  82. 99
      ml-agents/mlagents/trainers/tests/test_ppo.py
  83. 141
      ml-agents/mlagents/trainers/tests/test_reward_signals.py
  84. 18
      ml-agents/mlagents/trainers/tests/test_rl_trainer.py
  85. 83
      ml-agents/mlagents/trainers/tests/test_sac.py
  86. 388
      ml-agents/mlagents/trainers/tests/test_simple_rl.py
  87. 4
      ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py
  88. 274
      ml-agents/mlagents/trainers/tests/test_trainer_util.py
  89. 10
      ml-agents/mlagents/trainers/trainer/rl_trainer.py
  90. 28
      ml-agents/mlagents/trainers/trainer/trainer.py
  91. 8
      ml-agents/mlagents/trainers/trainer_controller.py
  92. 106
      ml-agents/mlagents/trainers/trainer_util.py
  93. 2
      ml-agents/setup.py
  94. 8
      ml-agents/tests/yamato/training_int_tests.py
  95. 12
      ml-agents/tests/yamato/yamato_utils.py
  96. 110
      config/upgrade_config.py
  97. 373
      ml-agents/mlagents/trainers/settings.py
  98. 151
      ml-agents/mlagents/trainers/tests/test_settings.py

2
com.unity.ml-agents/CHANGELOG.md


- Curriculum and Parameter Randomization configurations have been merged
into the main training configuration file. Note that this means training
configuration files are now environment-specific. (#3791)
- The format for trainer configuration has changed, and the "default" behavior has been deprecated.
See the [Migration Guide](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Migrating.md) for more details. (#3936)
- Training artifacts (trained models, summaries) are now found in the `results/`
directory. (#3829)
- Unity Player logs are now written out to the results directory. (#3877)

46
config/imitation/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1e7
memory_size: 256
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
strength: 1.0
strength: 1.0
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true
steps: 50000
steps: 50000
samples_per_update: 0

46
config/imitation/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 64
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 2.0e6
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
strength: 0.1
strength: 0.1
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true
steps: 0
steps: 0
samples_per_update: 0

48
config/imitation/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 256
strength: 1.0
strength: 1.0
gamma: 0.99
gamma: 0.99
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

43
config/imitation/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.5e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 15000000
time_horizon: 64
summary_freq: 60000
threaded: true

30
config/imitation/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
trainer_type: ppo
sequence_length: 64
summary_freq: 30000
use_recurrent: false
max_steps: 1.0e7
hyperparameters:
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
lambd: 0.95
learning_rate: 0.0003
num_epoch: 3
network_settings:
num_layers: 2
normalize: false
hidden_units: 512
reward_signals:
extrinsic:
strength: 1.0

42
config/ppo/3DBall.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

42
config/ppo/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: ppo
batch_size: 1200
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1200
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 12000
threaded: true

76
config/ppo/3DBall_randomize.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
3DBall:
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true
resampling-interval: 5000
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
gravity:
sampler-type: "uniform"
min_value: 7
max_value: 12
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
resampling-interval: 5000
mass:
sampler-type: uniform
min_value: 0.5
max_value: 10
gravity:
sampler-type: uniform
min_value: 7
max_value: 12
scale:
sampler-type: uniform
min_value: 0.75
max_value: 3

42
config/ppo/Basic.yaml


behaviors:
Basic:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 20
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 3
sequence_length: 64
summary_freq: 2000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 32
buffer_size: 256
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 20
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 3
summary_freq: 2000
threaded: true

42
config/ppo/Bouncer.yaml


behaviors:
Bouncer:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 64
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 4.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 4000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/GridWorld.yaml


behaviors:
GridWorld:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 500000
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 32
buffer_size: 256
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 5
summary_freq: 20000
threaded: true

45
config/ppo/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 60000
threaded: true

45
config/ppo/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
strength: 1.0
strength: 1.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 30000
threaded: true

42
config/ppo/Reacher.yaml


behaviors:
Reacher:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 60000
threaded: true

56
config/ppo/SoccerTwos.yaml


behaviors:
SoccerTwos:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
strength: 1.0
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
swap_steps: 50000
curriculum:
measure: progress
thresholds: [0.05, 0.1]
min_lesson_length: 100
signal_smoothing: true
parameters:
ball_touch: [1.0, 0.5, 0.0]
swap_steps: 50000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

99
config/ppo/StrikersVsGoalie.yaml


behaviors:
Goalie:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
swap_steps: 25000
swap_steps: 25000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
team_change: 200000
team_change: 200000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

49
config/ppo/Tennis.yaml


behaviors:
Tennis:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: true
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
team_change: 100000
team_change: 100000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

45
config/ppo/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

45
config/ppo/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 64
summary_freq: 60000
threaded: true

45
config/ppo/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 2024
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 2024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
strength: 1.0
strength: 1.0
strength: 0.01
strength: 0.01
learning_rate: 0.0003
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 10000
threaded: true

42
config/ppo/Walker.yaml


behaviors:
Walker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 30000
threaded: true

83
config/ppo/WallJump.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
strength: 1.0
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true

115
config/ppo/WallJump_curriculum.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true
curriculum:
BigWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]

42
config/ppo/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3500000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/WormStatic.yaml


behaviors:
WormStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3500000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/3DBall.yaml


behaviors:
3DBall:
trainer: sac
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

44
config/sac/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: sac
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

44
config/sac/Basic.yaml


behaviors:
Basic:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 20
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 10
sequence_length: 64
summary_freq: 2000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 20
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 10
summary_freq: 2000
threaded: true

44
config/sac/Bouncer.yaml


behaviors:
Bouncer:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e6
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 1000000
time_horizon: 64
summary_freq: 20000
threaded: true

44
config/sac/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.05
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true

44
config/sac/GridWorld.yaml


behaviors:
GridWorld:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
hidden_units: 128
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 500000
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 5
summary_freq: 20000
threaded: true

47
config/sac/Hallway.yaml


behaviors:
Hallway:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 64
summary_freq: 10000
threaded: true

44
config/sac/PushBlock.yaml


behaviors:
PushBlock:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 100000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.05
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 100000
threaded: true

48
config/sac/Pyramids.yaml


behaviors:
Pyramids:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 16
summary_freq: 30000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
tau: 0.01
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
strength: 2.0
strength: 2.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 30000
threaded: true

44
config/sac/Reacher.yaml


behaviors:
Reacher:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 60000
threaded: true

50
config/sac/Tennis.yaml


behaviors:
Tennis:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 64
summary_freq: 10000
threaded: true
window: 10
play_against_current_self_ratio: 0.5
team_change: 250000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

48
config/sac/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
strength: 1.0
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

48
config/sac/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: sac
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
strength: 1.0
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 64
summary_freq: 60000
threaded: true

48
config/sac/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: sac
batch_size: 64
buffer_size: 500000
buffer_init_steps: 1000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 500000
buffer_init_steps: 1000
tau: 0.01
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
strength: 2.0
strength: 2.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 10000
threaded: true

44
config/sac/Walker.yaml


behaviors:
Walker:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 30
num_layers: 4
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 30.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 30.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 4
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 30000
threaded: true

87
config/sac/WallJump.yaml


behaviors:
BigWallJump:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
SmallWallJump:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
tau: 0.005
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true

44
config/sac/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/WormStatic.yaml


behaviors:
WormStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 1000
summary_freq: 30000
threaded: true

13
docs/Migrating.md


instead of `summaries/` and `models/`.
- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- Trainer configuration format has changed, and using a "default" behavior name has
been deprecated. (#3936)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- On the UnityEnvironment API, `get_behavior_names()` and `get_behavior_specs()` methods were combined into the property `behavior_specs` that contains a mapping from behavior names to behavior spec.
- `use_visual` and `allow_multiple_visual_obs` in the `UnityToGymWrapper` constructor

### Steps to Migrate
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
a separate trainer configuration file, under a `behaviors` section. You can move the `default` section too
if it's being used. This file should be specific to your environment, and not contain configurations for
multiple environments (unless they have the same Behavior Names).
- To upgrade your configuration files, an upgrade script has been provided. Run `python config/update_config.py
-h` to see the script usage.
To do it manually, copy your `<BehaviorName>` sections from `trainer_config.yaml` into a separate trainer configuration file, under a `behaviors` section.
The `default` section is no longer needed. This new file should be specific to your environment, and not contain
configurations for multiple environments (unless they have the same Behavior Names).
- You will need to reformat your trainer settings as per the [example](Training-ML-Agents.md).
- If your training uses [curriculum](Training-ML-Agents.md#curriculum-learning), move those configurations under
the `Behavior Name` section.
- If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization), move

113
docs/Training-Configuration-File.md


| **Setting** | **Description** |
| :----------------------- | :----------------------- |
| `trainer` | The type of trainer to use: `ppo` or `sac` |
| `summary_freq` | Number of experiences that need to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. |
| `batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. |
| `buffer_size` | Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` |
| `hidden_units` | Number of units in the hidden layers of the neural network. Correspond to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger. <br><br> Typical range: `32` - `512` |
| `learning_rate` | Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |
| `learning_rate_schedule` | (Optional, default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally. <br><br>`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. |
| `max_steps` | Total number of experience points that must be collected from the simulation before ending the training process. <br><br>Typical range: `5e5` - `1e7` |
| `normalize` | Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. |
| `num_layers` | The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems. <br><br> Typical range: `1` - `3` |
| `time_horizon` | How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `vis_encode_type` | (Optional, default = `simple`) Encoder type for encoding visual observations. <br><br> `simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. |
| `init_path` | (Optional, default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (Optional, default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `trainer_type` | (default = `ppo`) The type of trainer to use: `ppo` or `sac` |
| `summary_freq` | (default = `50000`) Number of experiences that need to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. |
| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count. <br><br>Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the save-freq option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |
| `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. |
| `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC) Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, this is the max size of the experience buffer, which should be on the order of thousands of times longer than your episodes so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` |
| `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally. <br><br>`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. |
| `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger. <br><br> Typical range: `32` - `512` |
| `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems. <br><br> Typical range: `1` - `3` |
| `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. |
| `network_settings -> vis_encoder_type` | (default = `simple`) Encoder type for encoding visual observations. <br><br> `simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. |
## Trainer-specific Configurations

| **Setting** | **Description** |
| :---------- | :-------------- |
| `beta` | Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `epsilon` | Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `lambd` | Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `buffer_init_steps` | Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled. <br><br>Typical range: `1000` - `10000` |
| `init_entcoef` | How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster. <br><br>Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` |
| `save_replay_buffer` | (Optional, default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. |
| `tau` | How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability. <br><br>Typical range: `0.005` - `0.01` |
| `steps_per_update` | Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow. <br><br>Typical range: `1` - `20` |
| `hyperparameters -> buffer_init_steps` | (default = `0`) Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled. <br><br>Typical range: `1000` - `10000` |
| `hyperparameters -> init_entcoef` | (default = `1.0`) How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster. <br><br>Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` |
| `hyperparameters -> save_replay_buffer` | (default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. |
| `hyperparameters -> tau` | (default = `0.005`) How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability. <br><br>Typical range: `0.005` - `0.01` |
| `hyperparameters -> steps_per_update` | (default = `1`) Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow. <br><br>Typical range: `1` - `20` |
| `hyperparameters -> reward_signal_num_update` | (default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et al.](http://arxiv.org/abs/1809.02925), [Blondé et al.](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `hyperparameters` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to `steps_per_update`. |
## Reward Signals

| **Setting** | **Description** |
| :---------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `extrinsic -> strength` | Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal. <br><br>Typical range: `1.00` |
| `extrinsic -> gamma` | Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1. <br><br>Typical range: `0.8` - `0.995` |
| `extrinsic -> strength` | (default = `1.0`) Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal. <br><br>Typical range: `1.00` |
| `extrinsic -> gamma` | (default = `0.99`) Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1. <br><br>Typical range: `0.8` - `0.995` |
### Curiosity Intrinsic Reward

| :--------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `curiosity -> strength` | Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal. <br><br>Typical range: `0.001` - `0.1` |
| `curiosity -> gamma` | Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.995` |
| `curiosity -> encoding_size` | (Optional, default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations. <br><br>Typical range: `64` - `256` |
| `curiosity -> learning_rate` | (Optional, default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable. <br><br>Typical range: `1e-5` - `1e-3` |
| `curiosity -> strength` | (default = `1.0`) Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal. <br><br>Typical range: `0.001` - `0.1` |
| `curiosity -> gamma` | (default = `0.99`) Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.995` |
| `curiosity -> encoding_size` | (default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations. <br><br>Typical range: `64` - `256` |
| `curiosity -> learning_rate` | (default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable. <br><br>Typical range: `1e-5` - `1e-3` |
### GAIL Intrinsic Reward

| **Setting** | **Description** |
| :---------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `gail -> strength` | Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. <br><br>Typical range: `0.01` - `1.0` |
| `gail -> gamma` | Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.9` |
| `gail -> demo_path` | The path to your .demo file or directory of .demo files. |
| `gail -> encoding_size` | (Optional, default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times. <br><br>Typical range: `64` - `256` |
| `gail -> strength` | (default = `1.0`) Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. <br><br>Typical range: `0.01` - `1.0` |
| `gail -> gamma` | (default = `0.99`) Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.9` |
| `gail -> demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. |
| `gail -> encoding_size` | (default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times. <br><br>Typical range: `64` - `256` |
| `gail -> use_actions` | (Optional, default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. |
| `gail -> use_vail` | (Optional, default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. |
### Reward Signal Settings for SAC
All of the reward signals configurations described above apply to both PPO and
SAC. There is one configuration for all reward signals that only applies to SAC.
| `gail -> use_actions` | (default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. |
| `gail -> use_vail` | (default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. |
| **Setting** | **Description** |
| :------------------------------------------- | :-------------- |
| `reward_signals -> reward_signal_num_update` | (Optional, default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et al.](http://arxiv.org/abs/1809.02925), [Blondé et al.](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `reward_signals` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to `steps_per_update`. |
## Behavioral Cloning

| **Setting** | **Description** |
| :------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `demo_path` | The path to your .demo file or directory of .demo files. |
| `strength` | Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy. <br><br>Typical range: `0.1` - `0.5` |
| `steps` | During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. |
| `batch_size` | Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size`. <br><br>Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` |
| `num_epoch` | Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO. <br><br>Typical range: `3` - `10` |
| `samples_per_update` | (Optional, default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step. <br><br>Typical range: `buffer_size` |
| `init_path` | Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. |
| `strength` | (default = `1.0`) Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy. <br><br>Typical range: `0.1` - `0.5` |
| `steps` | (default = `0`) During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. |
| `batch_size` | (default = `batch_size` of trainer) Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size` of the trainer. <br><br>Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` |
| `num_epoch` | (default = `num_epoch` of trainer) Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO. <br><br>Typical range: `3` - `10` |
| `samples_per_update` | (default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step. <br><br>Typical range: `buffer_size` |
You can enable your agents to use memory, by setting `use_recurrent` to `true`
You can enable your agents to use memory by adding a `memory` section under `network_settings`,
| `use_recurrent` | Whether to enable this option or not. |
| `memory_size` | Size of the memory an agent must keep. In order to use an LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task. <br><br>Typical range: `32` - `256` |
| `sequence_length` | Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train. <br><br>Typical range: `4` - `128` |
| `network_settings -> memory -> memory_size` | (default = `128`) Size of the memory an agent must keep. In order to use an LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task. <br><br>Typical range: `32` - `256` |
| `network_settings -> memory -> sequence_length` | (default = `64`) Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train. <br><br>Typical range: `4` - `128` |
A few considerations when deciding to use memory:

| **Setting** | **Description** |
| :-------------------------------- | :-------------- |
| `save_steps` | Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13. <br><br>A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent. <br><br> Typical range: `10000` - `100000` |
| `team_change` | Number of _trainer steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch. <br><br>A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies and so the agent may fail against the next batch of opponents. <br><br> The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously. <br><br> Typical range: 4x-10x where x=`save_steps` |
| `swap_steps` | Number of _ghost steps_ (not trainer steps) between swapping the opponents policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team-change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)` <br><br> Typical range: `10000` - `100000` |
| `play_against_latest_model_ratio` | Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration. <br><br> A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating it's policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of more increasingly challenging situations which may lead to a stronger final policy. <br><br> Typical range: `0.0` - `1.0` |
| `window` | Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. <br><br> Typical range: `5` - `30` |
| `save_steps` | (default = `20000`) Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13. <br><br>A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent. <br><br> Typical range: `10000` - `100000` |
| `team_change` | (default = `5 * save_steps`) Number of _trainer steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch. <br><br>A larger value of `team_change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies and so the agent may fail against the next batch of opponents. <br><br> The value of `team_change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously. <br><br> Typical range: 4x-10x where x=`save_steps` |
| `swap_steps` | (default = `10000`) Number of _ghost steps_ (not trainer steps) between swapping the opponent's policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team_change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)` <br><br> Typical range: `10000` - `100000` |
| `play_against_latest_model_ratio` | (default = `0.5`) Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration. <br><br> A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy. <br><br> Typical range: `0.0` - `1.0` |
| `window` | (default = `10`) Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. <br><br> Typical range: `5` - `30` |
### Note on Reward Signals

141
docs/Training-ML-Agents.md


The rest of this guide breaks down the different sub-sections of the trainer config file
and explains the possible settings for each.
**NOTE:** The configuration file format changed as of version 0.17.0. To convert
an old set of configuration files (trainer config, curriculum, and sampler files) to the new
format, a script has been provided. Run `python config/upgrade_config.py -h` in your console
to see the script's usage.
### Behavior Configurations
The primary section of the trainer config file is a

```yaml
behaviors:
BehaviorPPO:
trainer: ppo
trainer_type: ppo
hyperparameters:
# Hyperparameters common to PPO and SAC
batch_size: 1024
buffer_size: 10240
learning_rate: 3.0e-4
learning_rate_schedule: linear
# Trainer configs common to PPO/SAC (excluding reward signals)
batch_size: 1024
buffer_size: 10240
hidden_units: 128
learning_rate: 3.0e-4
learning_rate_schedule: linear
# PPO-specific hyperparameters
# Replaces the "PPO-specific hyperparameters" section above
beta: 5.0e-3
epsilon: 0.2
lambd: 0.95
num_epoch: 3
# Configuration of the neural network (common to PPO/SAC)
network_settings:
vis_encoder_type: simple
normalize: false
hidden_units: 128
num_layers: 2
# memory
memory:
sequence_length: 64
memory_size: 256
# Trainer configurations common to all trainers
normalize: false
num_layers: 2
vis_encoder_type: simple
init_path: null
# PPO-specific configs
beta: 5.0e-3
epsilon: 0.2
lambd: 0.95
num_epoch: 3
keep_checkpoints: 5
# memory
use_recurrent: true
sequence_length: 64
memory_size: 256
init_path: null
# behavior cloning
behavioral_cloning:

samples_per_update: 0
reward_signals:
# environment reward
# environment reward (default)
extrinsic:
strength: 1.0
gamma: 0.99

```yaml
behaviors:
BehaviorSAC:
trainer: sac
trainer_type: sac
# SAC-specific configs (replaces the "PPO-specific configs" section above)
buffer_init_steps: 0
tau: 0.005
steps_per_update: 1
train_interval: 1
init_entcoef: 1.0
save_replay_buffer: false
# SAC-specific configs (replaces the hyperparameters section above)
hyperparameters:
# Hyperparameters common to PPO and SAC
# Same as PPO config
# SAC-specific hyperparameters
# Replaces the "PPO-specific hyperparameters" section above
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
# memory
# same as PPO config
# Configuration of the neural network (common to PPO/SAC)
network_settings:
# Same as PPO config
# Trainer configurations common to all trainers
# <Same as PPO config>
# pre-training using behavior cloning
behavioral_cloning:

reward_signal_num_update: 1 # only applies to SAC
# environment reward
extrinsic:
# same as PPO config

We now break apart the components of the configuration file and describe what
each of these parameters mean and provide guidelines on how to set them. See
[Training Configuration File](Training-Configuration-File.md) for a detailed
description of all the configurations listed above.
description of all the configurations listed above, along with their defaults.
Unless otherwise specified, omitting a configuration will revert it to its default.
To enable curriculum learning, you need to add a sub-section to the corresponding
`behaviors` entry in the trainer config YAML file that defines the curriculum for that
behavior. Here is one example:
To enable curriculum learning, you need to add a `curriculum` sub-section to the trainer
configuration YAML file. Within this sub-section, add an entry for each behavior that defines
the curriculum for that behavior. Here is one example:
```yml
behaviors:

# Add this section
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
wall_height: [1.5, 2.0, 2.5, 4.0]
# Add this section
curriculum:
BehaviorY:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
wall_height: [1.5, 2.0, 2.5, 4.0]
```
Each group of Agents under the same `Behavior Name` in an environment can have a

behaviors:
BigWallJump:
# < Trainer parameters for BigWallJump >
# Curriculum configuration
curriculum:
SmallWallJump:
# < Trainer parameters for SmallWallJump >
curriculum:
BigWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100

big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
# < Trainer parameters for BigWallJump >
# Curriculum configuration
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
```
The curriculum for each Behavior has the following parameters:

#### Training with a Curriculum
Once we have specified our metacurriculum and curricula, we can launch
`mlagents-learn` using the config file for
`mlagents-learn` to point to the config file containing
our curricula and PPO will train using Curriculum Learning. For example, to
train agents in the Wall Jump environment with curriculum learning, we can run:

233
ml-agents/mlagents/trainers/cli_utils.py


from typing import Set
from typing import Set, Dict, Any, TextIO
import os
import yaml
from mlagents.trainers.exception import TrainerConfigError
from mlagents_envs.environment import UnityEnvironment
import argparse

def __call__(self, arg_parser, namespace, values, option_string=None):
    """
    Store the parsed value on the class rather than on the namespace.

    The attribute named by this action's ``dest`` is deleted from
    ``namespace``, and ``values`` is recorded in
    ``StoreConfigFile.trainer_config_path``.
    """
    dest_name = self.dest
    delattr(namespace, dest_name)
    StoreConfigFile.trainer_config_path = values
def _create_parser() -> argparse.ArgumentParser:
    """
    Build the command-line argument parser.

    Registers the optional positional ``trainer_config_path``, the general
    run options, and an "Engine Configuration" argument group.

    :return: The fully configured ``argparse.ArgumentParser``.
    """
    cli_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Positional config path; handled by a dedicated action.
    cli_parser.add_argument(
        "trainer_config_path",
        action=StoreConfigFile,
        nargs="?",
        default=None,
    )

    # General run options.
    cli_parser.add_argument(
        "--env",
        action=DetectDefault,
        dest="env_path",
        default=None,
        help="Path to the Unity executable to train",
    )
    cli_parser.add_argument(
        "--lesson",
        action=DetectDefault,
        default=0,
        type=int,
        help="The lesson to start with when performing curriculum training",
    )
    cli_parser.add_argument(
        "--load",
        action=DetectDefaultStoreTrue,
        dest="load_model",
        default=False,
        help=argparse.SUPPRESS,  # Deprecated but still usable for now.
    )
    cli_parser.add_argument(
        "--resume",
        action=DetectDefaultStoreTrue,
        dest="resume",
        default=False,
        help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
        "If set, the training code loads an already trained model to initialize the neural network "
        "before resuming training. This option is only valid when the models exist, and have the same "
        "behavior names as the current agents in your scene.",
    )
    cli_parser.add_argument(
        "--force",
        action=DetectDefaultStoreTrue,
        dest="force",
        default=False,
        help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
        "this flag, attempting to train a model with a run-id that has been used before will throw "
        "an error.",
    )
    cli_parser.add_argument(
        "--run-id",
        action=DetectDefault,
        default="ppo",
        help="The identifier for the training run. This identifier is used to name the "
        "subdirectories in which the trained model and summary statistics are saved as well "
        "as the saved model itself. If you use TensorBoard to view the training statistics, "
        "always set a unique run-id for each training run. (The statistics for all runs with the "
        "same id are combined as if they were produced by a the same session.)",
    )
    cli_parser.add_argument(
        "--initialize-from",
        action=DetectDefault,
        metavar="RUN_ID",
        default=None,
        help="Specify a previously saved run ID from which to initialize the model from. "
        "This can be used, for instance, to fine-tune an existing model on a new environment. "
        "Note that the previously saved models must have the same behavior parameters as your "
        "current environment.",
    )
    cli_parser.add_argument(
        "--save-freq",
        action=DetectDefault,
        default=50000,
        type=int,
        help="How often (in steps) to save the model during training",
    )
    cli_parser.add_argument(
        "--seed",
        action=DetectDefault,
        default=-1,
        type=int,
        help="A number to use as a seed for the random number generator used by the training code",
    )
    cli_parser.add_argument(
        "--train",
        action=DetectDefaultStoreTrue,
        dest="train_model",
        default=False,
        help=argparse.SUPPRESS,
    )
    cli_parser.add_argument(
        "--inference",
        action=DetectDefaultStoreTrue,
        dest="inference",
        default=False,
        help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
        "a model trained with an existing run ID.",
    )
    cli_parser.add_argument(
        "--base-port",
        action=DetectDefault,
        default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
        type=int,
        help="The starting port for environment communication. Each concurrent Unity environment "
        "instance will get assigned a port sequentially, starting from the base-port. Each instance "
        "will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
        "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
        "than an executable, the base port will be ignored.",
    )
    cli_parser.add_argument(
        "--num-envs",
        action=DetectDefault,
        default=1,
        type=int,
        help="The number of concurrent Unity environment instances to collect experiences "
        "from when training",
    )
    cli_parser.add_argument(
        "--debug",
        action=DetectDefaultStoreTrue,
        default=False,
        help="Whether to enable debug-level logging for some parts of the code",
    )
    cli_parser.add_argument(
        "--env-args",
        action=DetectDefault,
        default=None,
        nargs=argparse.REMAINDER,
        help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
        "process these as Unity Command Line Arguments. You should choose different argument names if "
        "you want to create environment-specific arguments. All arguments after this flag will be "
        "passed to the executable.",
    )
    cli_parser.add_argument(
        "--cpu",
        action=DetectDefaultStoreTrue,
        default=False,
        help="Forces training using CPU only",
    )

    # Options forwarded to the Unity engine, grouped under their own heading.
    engine_group = cli_parser.add_argument_group(title="Engine Configuration")
    engine_group.add_argument(
        "--width",
        action=DetectDefault,
        default=84,
        type=int,
        help="The width of the executable window of the environment(s) in pixels "
        "(ignored for editor training).",
    )
    engine_group.add_argument(
        "--height",
        action=DetectDefault,
        default=84,
        type=int,
        help="The height of the executable window of the environment(s) in pixels "
        "(ignored for editor training)",
    )
    engine_group.add_argument(
        "--quality-level",
        action=DetectDefault,
        default=5,
        type=int,
        help="The quality level of the environment(s). Equivalent to calling "
        "QualitySettings.SetQualityLevel in Unity.",
    )
    engine_group.add_argument(
        "--time-scale",
        action=DetectDefault,
        default=20,
        type=float,
        help="The time scale of the Unity environment(s). Equivalent to setting "
        "Time.timeScale in Unity.",
    )
    engine_group.add_argument(
        "--target-frame-rate",
        action=DetectDefault,
        default=-1,
        type=int,
        help="The target frame rate of the Unity environment(s). Equivalent to setting "
        "Application.targetFrameRate in Unity.",
    )
    engine_group.add_argument(
        "--capture-frame-rate",
        action=DetectDefault,
        default=60,
        type=int,
        help="The capture frame rate of the Unity environment(s). Equivalent to setting "
        "Time.captureFramerate in Unity.",
    )
    engine_group.add_argument(
        "--no-graphics",
        action=DetectDefaultStoreTrue,
        default=False,
        help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
        "the graphics driver. Use this only if your agents don't use visual observations.",
    )
    return cli_parser
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Open the configuration file at ``config_path`` and parse it.

    :param config_path: Path of the configuration file to read.
    :return: The value produced by ``_load_config`` for the opened file.
    :raises TrainerConfigError: If the file cannot be opened, or if its
        contents cannot be decoded as UTF-8.
    """
    try:
        # Decode explicitly as UTF-8. Without an explicit encoding, open()
        # falls back to the platform's locale encoding, which contradicts the
        # error message below that asks users to save the file as UTF-8.
        with open(config_path, encoding="utf-8") as data_file:
            return _load_config(data_file)
    except IOError as err:
        abs_path = os.path.abspath(config_path)
        # Chain the original error so the underlying cause stays visible.
        raise TrainerConfigError(
            f"Config file could not be found at {abs_path}."
        ) from err
    except UnicodeDecodeError as err:
        raise TrainerConfigError(
            f"There was an error decoding Config file from {config_path}. "
            f"Make sure your file is save using UTF-8"
        ) from err
def _load_config(fp: TextIO) -> Dict[str, Any]:
    """
    Parse YAML from an open text stream with ``yaml.safe_load``.

    :param fp: File-like object to read the YAML document from.
    :return: The object produced by ``yaml.safe_load``.
    :raises TrainerConfigError: If the YAML parser reports a ``ParserError``.
    """
    try:
        parsed = yaml.safe_load(fp)
    except yaml.parser.ParserError as parse_error:
        message = (
            "Error parsing yaml file. Please check for formatting errors. "
            "A tool such as http://www.yamllint.com/ can be helpful with this."
        )
        raise TrainerConfigError(message) from parse_error
    return parsed
# Parser instance built by _create_parser().
# NOTE(review): presumably a module-level statement, so it runs once at import — confirm.
parser = _create_parser()

39
ml-agents/mlagents/trainers/components/bc/module.py


from mlagents.trainers.policy.tf_policy import TFPolicy
from .model import BCModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import BehavioralCloningSettings
class BCModule:

settings: BehavioralCloningSettings,
strength: float,
demo_path: str,
steps: int,
batch_size: int = None,
num_epoch: int = None,
samples_per_update: int = 0,
):
"""
A BC trainer that can be used inline with RL.

:param samples_per_update: Maximum number of samples to train on during each BC update.
"""
self.policy = policy
self.current_lr = policy_learning_rate * strength
self.model = BCModel(policy, self.current_lr, steps)
self.current_lr = policy_learning_rate * settings.strength
self.model = BCModel(policy, self.current_lr, settings.steps)
demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.brain
self.batch_size = batch_size if batch_size else default_batch_size
self.num_epoch = num_epoch if num_epoch else default_num_epoch
self.batch_size = (
settings.batch_size if settings.batch_size else default_batch_size
)
self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
self.n_sequences = max(
min(self.batch_size, self.demonstration_buffer.num_experiences)
// policy.sequence_length,

self.has_updated = False
self.use_recurrent = self.policy.use_recurrent
self.samples_per_update = samples_per_update
self.samples_per_update = settings.samples_per_update
@staticmethod
def check_config(config_dict: Dict[str, Any]) -> None:
    """
    Verify that the behavioral_cloning config defines every required key.
    :param config_dict: Pretraining section of trainer_config
    :raises UnityTrainerException: For the first of "strength", "demo_path"
        or "steps" that is missing from config_dict.
    """
    for required_key in ("strength", "demo_path", "steps"):
        if required_key in config_dict:
            continue
        raise UnityTrainerException(
            "The required pre-training hyper-parameter {0} was not defined. "
            "Please check your trainer YAML file.".format(required_key)
        )
def update(self) -> Dict[str, Any]:
"""

29
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


from typing import Any, Dict, List
from typing import Any, Dict
from collections import namedtuple
import numpy as np
import abc

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import RewardSignalSettings
logger = get_logger(__name__)

class RewardSignal(abc.ABC):
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
def __init__(self, policy: TFPolicy, settings: RewardSignalSettings):
:param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
:param gamma: The time discounting factor used for this reward.
:param settings: Settings parameters for this Reward Signal, including gamma and strength.
:return: A RewardSignal object.
"""
class_name = self.__class__.__name__

# no natural end, e.g. GAIL or Curiosity
self.use_terminal_states = True
self.update_dict: Dict[str, tf.Tensor] = {}
self.gamma = gamma
self.gamma = settings.gamma
self.strength = strength
self.strength = settings.strength
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:

:return: A dict that corresponds to the feed_dict needed for the update.
"""
return {}
@classmethod
def check_config(
    cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
    """
    Ensure every key listed in param_keys is present in config_dict.
    :param config_dict: Configuration entries to validate.
    :param param_keys: Keys that must be present; None means no keys are required.
    :raises UnityTrainerException: For the first required key that is missing.
    """
    for required_key in param_keys or []:
        if required_key in config_dict:
            continue
        raise UnityTrainerException(
            f"The hyper-parameter {required_key} could not be found for {cls.__name__}."
        )

36
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


from typing import Any, Dict, List
from typing import Any, Dict
import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import CuriositySettings
def __init__(
self,
policy: TFPolicy,
strength: float,
gamma: float,
encoding_size: int = 128,
learning_rate: float = 3e-4,
):
def __init__(self, policy: TFPolicy, settings: CuriositySettings):
:param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
reward multiplied by the strength parameter
:param gamma: The time discounting factor used for this reward.
:param encoding_size: The size of the hidden encoding layer for the ICM
:param learning_rate: The learning rate for the ICM.
:param settings: CuriositySettings object that contains the parameters
(including encoding size and learning rate) for this CuriosityRewardSignal.
super().__init__(policy, strength, gamma)
super().__init__(policy, settings)
policy, encoding_size=encoding_size, learning_rate=learning_rate
policy,
encoding_size=settings.encoding_size,
learning_rate=settings.learning_rate,
)
self.use_terminal_states = False
self.update_dict = {

unscaled_reward * float(self.has_updated) * self.strength, 0, 1
)
return RewardSignalResult(scaled_reward, unscaled_reward)
@classmethod
def check_config(
    cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
    """
    Validate the Curiosity config by delegating to the parent class check.
    The keys "strength", "gamma" and "encoding_size" are always the ones required;
    any param_keys value passed in is replaced by that fixed list.
    """
    required_keys = ["strength", "gamma", "encoding_size"]
    super().check_config(config_dict, required_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int

12
ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


from typing import Any, Dict, List
import numpy as np
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

class ExtrinsicRewardSignal(RewardSignal):
    """Reward signal built from the "environment_rewards" entries of a batch."""

    @classmethod
    def check_config(
        cls, config_dict: Dict[str, Any], param_keys: List[str] = None
    ) -> None:
        """
        Validate the Extrinsic config by delegating to the parent class check.
        The keys "strength" and "gamma" are always the ones required; any
        param_keys value passed in is replaced by that fixed list.
        """
        required_keys = ["strength", "gamma"]
        super().check_config(config_dict, required_keys)

    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        """
        Build the reward result for a batch from its "environment_rewards" entry.
        :param mini_batch: Batch whose "environment_rewards" entry is read.
        :return: A RewardSignalResult holding the rewards multiplied by
            self.strength, followed by the unmodified float32 rewards.
        """
        raw_rewards = np.array(mini_batch["environment_rewards"], dtype=np.float32)
        scaled_rewards = self.strength * raw_rewards
        return RewardSignalResult(scaled_rewards, raw_rewards)

47
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from typing import Any, Dict, List
from typing import Any, Dict
import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import GAILSettings
def __init__(
self,
policy: TFPolicy,
strength: float,
gamma: float,
demo_path: str,
encoding_size: int = 64,
learning_rate: float = 3e-4,
use_actions: bool = False,
use_vail: bool = False,
):
def __init__(self, policy: TFPolicy, settings: GAILSettings):
:param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
reward multiplied by the strength parameter
:param gamma: The time discounting factor used for this reward.
:param demo_path: The path to the demonstration file
:param num_epoch: The number of epochs to train over the training buffer for the discriminator.
:param encoding_size: The size of the the hidden layers of the discriminator
:param learning_rate: The Learning Rate used during GAIL updates.
:param use_actions: Whether or not to use the actions for the discriminator.
:param use_vail: Whether or not to use a variational bottleneck for the discriminator.
:param settings: The settings for this GAILRewardSignal.
super().__init__(policy, strength, gamma)
super().__init__(policy, settings)
policy, 128, learning_rate, encoding_size, use_actions, use_vail
policy,
128,
settings.learning_rate,
settings.encoding_size,
settings.use_actions,
settings.use_vail,
demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.brain
)
self.has_updated = False
self.update_dict: Dict[str, tf.Tensor] = {

)
scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
return RewardSignalResult(scaled_reward, unscaled_reward)
@classmethod
def check_config(
    cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
    """
    Validate the GAIL config by delegating to the parent class check.
    The keys "strength", "gamma" and "demo_path" are always the ones required;
    any param_keys value passed in is replaced by that fixed list.
    """
    required_keys = ["strength", "gamma", "demo_path"]
    super().check_config(config_dict, required_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int

22
ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py


from typing import Any, Dict, Type
from typing import Dict, Type
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignal
from mlagents.trainers.components.reward_signals.extrinsic.signal import (

CuriosityRewardSignal,
)
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
NAME_TO_CLASS: Dict[str, Type[RewardSignal]] = {
"extrinsic": ExtrinsicRewardSignal,
"curiosity": CuriosityRewardSignal,
"gail": GAILRewardSignal,
NAME_TO_CLASS: Dict[RewardSignalType, Type[RewardSignal]] = {
RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal,
RewardSignalType.CURIOSITY: CuriosityRewardSignal,
RewardSignalType.GAIL: GAILRewardSignal,
policy: TFPolicy, name: str, config_entry: Dict[str, Any]
policy: TFPolicy, name: RewardSignalType, settings: RewardSignalSettings
) -> RewardSignal:
"""
Creates a reward signal class based on the name and config entry provided as a dict.

rcls = NAME_TO_CLASS.get(name)
if not rcls:
raise UnityTrainerException("Unknown reward signal type {0}".format(name))
rcls.check_config(config_entry)
try:
class_inst = rcls(policy, **config_entry)
except TypeError:
raise UnityTrainerException(
"Unknown parameters given for reward signal {0}".format(name)
)
class_inst = rcls(policy, settings)
return class_inst

66
ml-agents/mlagents/trainers/curriculum.py


import json
from typing import Dict, Any, TextIO
from typing import Dict, Any
from .exception import CurriculumConfigError, CurriculumLoadingError
from mlagents.trainers.exception import CurriculumConfigError
from mlagents.trainers.settings import CurriculumSettings
def __init__(self, brain_name: str, config: Dict):
def __init__(self, brain_name: str, settings: CurriculumSettings):
"""
Initializes a Curriculum object.
:param brain_name: Name of the brain this Curriculum is associated with

self.measure = None
self._lesson_num = 0
self.brain_name = brain_name
self.config = config
self.settings = settings
for key in [
"parameters",
"measure",
"thresholds",
"min_lesson_length",
"signal_smoothing",
]:
if key not in self.config:
raise CurriculumConfigError(
f"{brain_name} curriculum config does not contain a {key} field."
)
self.smoothing_value = 0
self.measure = self.config["measure"]
self.min_lesson_length = self.config["min_lesson_length"]
self.max_lesson_num = len(self.config["thresholds"])
self.measure = self.settings.measure
self.min_lesson_length = self.settings.min_lesson_length
self.max_lesson_num = len(self.settings.thresholds)
parameters = self.config["parameters"]
parameters = self.settings.parameters
for key in parameters:
if len(parameters[key]) != self.max_lesson_num + 1:
raise CurriculumConfigError(

steps completed).
:return Whether the lesson was incremented.
"""
if not self.config or not measure_val or math.isnan(measure_val):
if not self.settings or not measure_val or math.isnan(measure_val):
if self.config["signal_smoothing"]:
if self.settings.signal_smoothing:
if measure_val > self.config["thresholds"][self.lesson_num]:
if measure_val > self.settings.thresholds[self.lesson_num]:
parameters = self.config["parameters"]
parameters = self.settings.parameters
for key in parameters:
config[key] = parameters[key][self.lesson_num]
logger.info(

current lesson is returned.
:return: The configuration of the reset parameters.
"""
if not self.config:
if not self.settings:
parameters = self.config["parameters"]
parameters = self.settings.parameters
@staticmethod
def load_curriculum_file(config_path: str) -> Dict:
    """
    Open the curriculum file at config_path and parse it.
    :param config_path: Path of the curriculum file to read.
    :return: The value produced by Curriculum._load_curriculum for the opened file.
    :raises CurriculumLoadingError: If opening the file raises IOError or
        reading it raises UnicodeDecodeError.
    """
    try:
        with open(config_path) as curriculum_file:
            return Curriculum._load_curriculum(curriculum_file)
    except IOError:
        raise CurriculumLoadingError(f"The file {config_path} could not be found.")
    except UnicodeDecodeError:
        raise CurriculumLoadingError(f"There was an error decoding {config_path}")
@staticmethod
def _load_curriculum(fp: TextIO) -> Dict:
    """
    Decode the JSON document held by an already-open text stream.
    :param fp: Readable text stream positioned at the start of the JSON content.
    :return: The object produced by json.load for that stream.
    :raises CurriculumLoadingError: If the stream does not contain valid JSON; the
        original JSONDecodeError is attached as the cause.
    """
    try:
        curriculum = json.load(fp)
    except json.decoder.JSONDecodeError as e:
        message = (
            "Error parsing JSON file. Please check for formatting errors. "
            "A tool such as https://jsonlint.com/ can be helpful with this."
        )
        raise CurriculumLoadingError(message) from e
    return curriculum

22
ml-agents/mlagents/trainers/ghost/trainer.py


brain_name,
controller,
reward_buff_cap,
trainer_parameters,
trainer_settings,
training,
run_id,
):

:param brain_name: The name of the brain associated with trainer config
:param controller: GhostController that coordinates all ghost trainers and calculates ELO
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
)
self.trainer = trainer

# Set the logging to print ELO in the console
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True)
self_play_parameters = trainer_parameters["self_play"]
self.window = self_play_parameters.get("window", 10)
self.play_against_latest_model_ratio = self_play_parameters.get(
"play_against_latest_model_ratio", 0.5
self_play_parameters = trainer_settings.self_play
self.window = self_play_parameters.window
self.play_against_latest_model_ratio = (
self_play_parameters.play_against_latest_model_ratio
)
if (
self.play_against_latest_model_ratio > 1.0

"The play_against_latest_model_ratio is not between 0 and 1."
)
self.steps_between_save = self_play_parameters.get("save_steps", 20000)
self.steps_between_swap = self_play_parameters.get("swap_steps", 20000)
self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
self.steps_between_save = self_play_parameters.save_steps
self.steps_between_swap = self_play_parameters.swap_steps
self.steps_to_train_team = self_play_parameters.team_change
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \

self.last_team_change: int = 0
# Chosen because it is the initial ELO in Chess
self.initial_elo: float = self_play_parameters.get("initial_elo", 1200.0)
self.initial_elo: float = self_play_parameters.initial_elo
self.policy_elos: List[float] = [self.initial_elo] * (
self.window + 1
) # for learning policy

377
ml-agents/mlagents/trainers/learn.py


# # Unity ML-Agents Toolkit
import argparse
import yaml
import os

from typing import Callable, Optional, List, NamedTuple, Dict
from typing import Callable, Optional, List, Dict
import mlagents.trainers
import mlagents_envs

from mlagents.trainers.trainer_util import (
load_config,
TrainerFactory,
handle_existing_directories,
assemble_curriculum_config,
)
from mlagents.trainers.trainer_util import TrainerFactory, handle_existing_directories
from mlagents.trainers.stats import (
TensorboardWriter,
CSVWriter,

)
from mlagents.trainers.cli_utils import (
StoreConfigFile,
DetectDefault,
DetectDefaultStoreTrue,
)
from mlagents.trainers.cli_utils import parser
from mlagents.trainers.exception import SamplerException, TrainerConfigError
from mlagents.trainers.exception import SamplerException
from mlagents.trainers.settings import RunOptions
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel

logger = logging_util.get_logger(__name__)
def _create_parser():
    """
    Build the argparse parser for the training command line.

    The positional trainer config path is handled by the StoreConfigFile action.
    Value options use the DetectDefault action and boolean flags use
    DetectDefaultStoreTrue. Engine-related options (window size, quality, time
    scale, frame rates) are registered in a separate "Engine Configuration"
    argument group.
    :return: The configured argparse.ArgumentParser.
    """
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    argparser.add_argument("trainer_config_path", action=StoreConfigFile)
    argparser.add_argument(
        "--env",
        default=None,
        dest="env_path",
        help="Path to the Unity executable to train",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--lesson",
        default=0,
        type=int,
        help="The lesson to start with when performing curriculum training",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--keep-checkpoints",
        default=5,
        type=int,
        # Adjacent literals are concatenated verbatim, so each piece needs its
        # own trailing space to keep words from running together.
        help="The maximum number of model checkpoints to keep. Checkpoints are saved after the "
        "number of steps specified by the save-freq option. Once the maximum number of checkpoints "
        "has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--load",
        default=False,
        dest="load_model",
        action=DetectDefaultStoreTrue,
        help=argparse.SUPPRESS,  # Deprecated but still usable for now.
    )
    argparser.add_argument(
        "--resume",
        default=False,
        dest="resume",
        action=DetectDefaultStoreTrue,
        help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
        "If set, the training code loads an already trained model to initialize the neural network "
        "before resuming training. This option is only valid when the models exist, and have the same "
        "behavior names as the current agents in your scene.",
    )
    argparser.add_argument(
        "--force",
        default=False,
        dest="force",
        action=DetectDefaultStoreTrue,
        help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
        "this flag, attempting to train a model with a run-id that has been used before will throw "
        "an error.)",
    )
    argparser.add_argument(
        "--run-id",
        default="ppo",
        help="The identifier for the training run. This identifier is used to name the "
        "subdirectories in which the trained model and summary statistics are saved as well "
        "as the saved model itself. If you use TensorBoard to view the training statistics, "
        "always set a unique run-id for each training run. (The statistics for all runs with the "
        "same id are combined as if they were produced by the same session.)",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--initialize-from",
        metavar="RUN_ID",
        default=None,
        help="Specify a previously saved run ID from which to initialize the model from. "
        "This can be used, for instance, to fine-tune an existing model on a new environment. "
        "Note that the previously saved models must have the same behavior parameters as your "
        "current environment.",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--save-freq",
        default=50000,
        type=int,
        help="How often (in steps) to save the model during training",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--seed",
        default=-1,
        type=int,
        help="A number to use as a seed for the random number generator used by the training code",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--train",
        default=False,
        dest="train_model",
        action=DetectDefaultStoreTrue,
        help=argparse.SUPPRESS,
    )
    argparser.add_argument(
        "--inference",
        default=False,
        dest="inference",
        action=DetectDefaultStoreTrue,
        help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
        "a model trained with an existing run ID.",
    )
    argparser.add_argument(
        "--base-port",
        default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
        type=int,
        help="The starting port for environment communication. Each concurrent Unity environment "
        "instance will get assigned a port sequentially, starting from the base-port. Each instance "
        "will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
        "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
        "than an executable, the base port will be ignored.",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--num-envs",
        default=1,
        type=int,
        help="The number of concurrent Unity environment instances to collect experiences "
        "from when training",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--no-graphics",
        default=False,
        action=DetectDefaultStoreTrue,
        help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
        "the graphics driver). Use this only if your agents don't use visual observations.",
    )
    argparser.add_argument(
        "--debug",
        default=False,
        action=DetectDefaultStoreTrue,
        help="Whether to enable debug-level logging for some parts of the code",
    )
    argparser.add_argument(
        "--env-args",
        default=None,
        # REMAINDER: everything after this flag is collected for the executable.
        nargs=argparse.REMAINDER,
        help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
        "process these as Unity Command Line Arguments. You should choose different argument names if "
        "you want to create environment-specific arguments. All arguments after this flag will be "
        "passed to the executable.",
        action=DetectDefault,
    )
    argparser.add_argument(
        "--cpu",
        default=False,
        action=DetectDefaultStoreTrue,
        help="Forces training using CPU only",
    )
    argparser.add_argument("--version", action="version", version="")

    eng_conf = argparser.add_argument_group(title="Engine Configuration")
    eng_conf.add_argument(
        "--width",
        default=None,
        type=int,
        help="The width of the executable window of the environment(s) in pixels "
        "(ignored for editor training).",
        action=DetectDefault,
    )
    eng_conf.add_argument(
        "--height",
        default=None,
        type=int,
        help="The height of the executable window of the environment(s) in pixels "
        "(ignored for editor training)",
        action=DetectDefault,
    )
    eng_conf.add_argument(
        "--quality-level",
        default=5,
        type=int,
        help="The quality level of the environment(s). Equivalent to calling "
        "QualitySettings.SetQualityLevel in Unity.",
        action=DetectDefault,
    )
    eng_conf.add_argument(
        "--time-scale",
        default=20,
        type=float,
        help="The time scale of the Unity environment(s). Equivalent to setting "
        "Time.timeScale in Unity.",
        action=DetectDefault,
    )
    eng_conf.add_argument(
        "--target-frame-rate",
        default=-1,
        type=int,
        help="The target frame rate of the Unity environment(s). Equivalent to setting "
        "Application.targetFrameRate in Unity.",
        action=DetectDefault,
    )
    eng_conf.add_argument(
        "--capture-frame-rate",
        default=60,
        type=int,
        help="The capture frame rate of the Unity environment(s). Equivalent to setting "
        "Time.captureFramerate in Unity.",
        action=DetectDefault,
    )
    return argparser
# Parser instance whose registered defaults RunOptions reads via parser.get_default().
parser = _create_parser()
class RunOptions(NamedTuple):
    """
    Immutable record of the options for one training run.

    Only `behaviors` is required. Most other fields take their default from the
    module-level argparse `parser`, so the command-line defaults and the
    defaults of this class stay in sync.
    """

    behaviors: Dict
    debug: bool = parser.get_default("debug")
    seed: int = parser.get_default("seed")
    env_path: Optional[str] = parser.get_default("env_path")
    run_id: str = parser.get_default("run_id")
    initialize_from: str = parser.get_default("initialize_from")
    load_model: bool = parser.get_default("load_model")
    resume: bool = parser.get_default("resume")
    force: bool = parser.get_default("force")
    train_model: bool = parser.get_default("train_model")
    inference: bool = parser.get_default("inference")
    save_freq: int = parser.get_default("save_freq")
    keep_checkpoints: int = parser.get_default("keep_checkpoints")
    base_port: int = parser.get_default("base_port")
    num_envs: int = parser.get_default("num_envs")
    curriculum_config: Optional[Dict] = None
    lesson: int = parser.get_default("lesson")
    no_graphics: bool = parser.get_default("no_graphics")
    # NOTE(review): _create_parser registers no --multi-gpu argument, so this
    # get_default call yields None rather than a bool -- confirm intent.
    multi_gpu: bool = parser.get_default("multi_gpu")
    parameter_randomization: Optional[Dict] = None
    env_args: Optional[List[str]] = parser.get_default("env_args")
    cpu: bool = parser.get_default("cpu")
    width: int = parser.get_default("width")
    height: int = parser.get_default("height")
    quality_level: int = parser.get_default("quality_level")
    time_scale: float = parser.get_default("time_scale")
    target_frame_rate: int = parser.get_default("target_frame_rate")
    capture_frame_rate: int = parser.get_default("capture_frame_rate")

    @staticmethod
    def from_argparse(args: argparse.Namespace) -> "RunOptions":
        """
        Takes an argparse.Namespace as specified in `parse_command_line`, loads the trainer
        configuration YAML from StoreConfigFile.trainer_config_path, and converts both to a
        RunOptions instance. YAML values are used for every option whose name is not listed in
        DetectDefault.non_default_args; for the names listed there the command-line value is kept.
        :param args: collection of command-line parameters passed to mlagents-learn
        :return: RunOptions representing the passed in arguments merged with the YAML file.
        :raises TrainerConfigError: if the YAML file has no "behaviors" section, or contains
            a key that is not a RunOptions field.
        """
        run_options_dict = dict(vars(args))
        config_path = StoreConfigFile.trainer_config_path

        # Load YAML
        yaml_config = load_config(config_path)
        # This is the only option that is not optional and has no defaults.
        if "behaviors" not in yaml_config:
            raise TrainerConfigError(
                "Trainer configurations not found. Make sure your YAML file has a section for behaviors."
            )
        # Use the YAML file values for all values not specified in the CLI.
        for key, val in yaml_config.items():
            # Detect bad config options. Check against the declared fields rather than
            # hasattr(): the class also exposes methods (count, index, _asdict,
            # from_argparse, ...) that are not valid constructor keywords.
            if key not in RunOptions._fields:
                raise TrainerConfigError(
                    "The option {} was specified in your YAML file, but is invalid.".format(
                        key
                    )
                )
            if key not in DetectDefault.non_default_args:
                run_options_dict[key] = val

        # Keep deprecated --load working, TODO: remove
        run_options_dict["resume"] = (
            run_options_dict["resume"] or run_options_dict["load_model"]
        )

        return RunOptions(**run_options_dict)
def get_version_string() -> str:
# pylint: disable=no-member
return f""" Version information:

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
checkpoint_settings = options.checkpoint_settings
env_settings = options.env_settings
engine_settings = options.engine_settings
write_path = os.path.join(base_path, options.run_id)
write_path = os.path.join(base_path, checkpoint_settings.run_id)
os.path.join(base_path, options.run_id) if options.initialize_from else None
os.path.join(base_path, checkpoint_settings.run_id)
if checkpoint_settings.initialize_from
else None
port: Optional[int] = options.base_port
port: Optional[int] = env_settings.base_port
write_path, options.resume, options.force, maybe_init_path
write_path,
checkpoint_settings.resume,
checkpoint_settings.force,
maybe_init_path,
)
# Make run logs directory
os.makedirs(run_logs_dir, exist_ok=True)

"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(write_path, clear_past_data=not options.resume)
tb_writer = TensorboardWriter(
write_path, clear_past_data=not checkpoint_settings.resume
)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
if env_settings.env_path is None:
options.env_path,
options.no_graphics,
env_settings.env_path,
engine_settings.no_graphics,
options.env_args,
env_settings.env_args,
width=options.width,
height=options.height,
quality_level=options.quality_level,
time_scale=options.time_scale,
target_frame_rate=options.target_frame_rate,
capture_frame_rate=options.capture_frame_rate,
width=engine_settings.width,
height=engine_settings.height,
quality_level=engine_settings.quality_level,
time_scale=engine_settings.time_scale,
target_frame_rate=engine_settings.target_frame_rate,
capture_frame_rate=engine_settings.capture_frame_rate,
env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
curriculum_config = assemble_curriculum_config(options.behaviors)
env_manager = SubprocessEnvManager(
env_factory, engine_config, env_settings.num_envs
)
curriculum_config, env_manager, options.lesson
options.curriculum, env_manager, checkpoint_settings.lesson
)
sampler_manager, resampling_interval = create_sampler_manager(
options.parameter_randomization, run_seed

options.run_id,
checkpoint_settings.run_id,
options.keep_checkpoints,
not options.inference,
options.resume,
not checkpoint_settings.inference,
checkpoint_settings.resume,
options.multi_gpu,
False,
options.run_id,
options.save_freq,
checkpoint_settings.run_id,
checkpoint_settings.save_freq,
not options.inference,
not checkpoint_settings.inference,
run_seed,
sampler_manager,
resampling_interval,

try:
with open(run_options_path, "w") as f:
try:
yaml.dump(dict(run_options._asdict()), f, sort_keys=False)
yaml.dump(run_options.as_dict(), f, sort_keys=False)
yaml.dump(dict(run_options._asdict()), f)
yaml.dump(run_options.as_dict(), f)
except FileNotFoundError:
logger.warning(
f"Unable to save configuration to {run_options_path}. Make sure the directory exists"

logging_util.set_log_level(log_level)
logger.debug("Configuration for this run:")
logger.debug(json.dumps(options._asdict(), indent=4))
logger.debug(json.dumps(options.as_dict(), indent=4))
if options.load_model:
if options.checkpoint_settings.load_model:
if options.train_model:
if options.checkpoint_settings.train_model:
run_seed = options.seed
if options.cpu:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
run_seed = options.env_settings.seed
# Add some timer metadata
add_timer_metadata("mlagents_version", mlagents.trainers.__version__)

if options.seed == -1:
if options.env_settings.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(run_seed, options)

7
ml-agents/mlagents/trainers/meta_curriculum.py


from typing import Dict, Set
from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.settings import CurriculumSettings
from mlagents_envs.logging_util import get_logger

particular brain in the environment.
"""
def __init__(self, curriculum_configs: Dict[str, Dict]):
def __init__(self, curriculum_configs: Dict[str, CurriculumSettings]):
"""Initializes a MetaCurriculum object.
:param curriculum_folder: Dictionary of brain_name to the

used_reset_parameters: Set[str] = set()
for brain_name, curriculum_config in curriculum_configs.items():
for brain_name, curriculum_settings in curriculum_configs.items():
brain_name, curriculum_config
brain_name, curriculum_settings
)
config_keys: Set[str] = set(
self._brains_to_curricula[brain_name].get_config().keys()

29
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
from mlagents.trainers.settings import TrainerSettings, RewardSignalType
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
self.create_reward_signals(trainer_params["reward_signals"])
self.create_reward_signals(trainer_params.reward_signals)
if "behavioral_cloning" in trainer_params:
BCModule.check_config(trainer_params["behavioral_cloning"])
if trainer_params.behavioral_cloning is not None:
policy_learning_rate=trainer_params["learning_rate"],
default_batch_size=trainer_params["batch_size"],
trainer_params.behavioral_cloning,
policy_learning_rate=trainer_params.hyperparameters.learning_rate,
default_batch_size=trainer_params.hyperparameters.batch_size,
**trainer_params["behavioral_cloning"],
)
def get_trajectory_value_estimates(

return value_estimates
def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
def create_reward_signals(
self, reward_signal_configs: Dict[RewardSignalType, Any]
) -> None:
"""
Create reward signals
:param reward_signal_configs: Reward signal config.

for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self.policy, reward_signal, config
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later
self.reward_signals[reward_signal.value] = create_reward_signal(
self.policy, reward_signal, settings
)
self.update_dict.update(
self.reward_signals[reward_signal.value].update_dict
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
def create_optimizer_op(
self, learning_rate: tf.Tensor, name: str = "Adam"

11
ml-agents/mlagents/trainers/policy/nn_policy.py


from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,

self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
trainer_params: TrainerSettings,
is_training: bool,
load: bool,
tanh_squash: bool = False,

super().__init__(seed, brain, trainer_params, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = trainer_params["num_layers"]
self.h_size = trainer_params["hidden_units"]
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
self.vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs

41
ml-agents/mlagents/trainers/policy/tf_policy.py


from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.brain import BrainParameters
logger = get_logger(__name__)

functions to save/load models and create the input placeholders.
"""
def __init__(self, seed, brain, trainer_parameters, load=False):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_settings: TrainerSettings,
load: bool = False,
):
:param trainer_parameters: The trainer parameters.
:param trainer_settings: The trainer parameters.
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.assign_phs = []
self.assign_ops = []
self.assign_phs: List[tf.Tensor] = []
self.assign_ops: List[tf.Operation] = []
self.inference_dict = {}
self.update_dict = {}
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.brain = brain

self.vis_obs_size = brain.number_visual_observations
self.use_recurrent = trainer_parameters["use_recurrent"]
self.use_recurrent = self.network_settings.memory is not None
self.normalize = trainer_parameters.get("normalize", False)
self.normalize = self.network_settings.normalize
self.model_path = trainer_parameters["output_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.model_path = self.trainer_settings.output_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints
self.saver = None
self.saver: Optional[tf.Operation] = None
if self.use_recurrent:
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "

33
ml-agents/mlagents/trainers/ppo/optimizer.py


from typing import Optional, Any, Dict
from typing import Optional, Any, Dict, cast
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.settings import TrainerSettings, PPOSettings
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.

with policy.graph.as_default():
with tf.variable_scope("optimizer/"):
super().__init__(policy, trainer_params)
hyperparameters: PPOSettings = cast(
PPOSettings, trainer_params.hyperparameters
)
lr = float(hyperparameters.learning_rate)
self._schedule = hyperparameters.learning_rate_schedule
epsilon = float(hyperparameters.epsilon)
beta = float(hyperparameters.beta)
max_step = float(trainer_params.max_steps)
lr = float(trainer_params["learning_rate"])
self._schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "linear")
)
h_size = int(trainer_params["hidden_units"])
epsilon = float(trainer_params["epsilon"])
beta = float(trainer_params["beta"])
max_step = float(trainer_params["max_steps"])
num_layers = int(trainer_params["num_layers"])
vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
policy_network_settings = policy.network_settings
h_size = int(policy_network_settings.hidden_units)
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.stream_names = list(self.reward_signals.keys())

42
ml-agents/mlagents/trainers/ppo/trainer.py


# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
from typing import cast
import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, PPOSettings
logger = get_logger(__name__)

self,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,

Responsible for collecting experiences and training PPO model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with

brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",

"output_path",
"reward_signals",
]
self._check_param_keys()
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
def _check_param_keys(self):
    """
    Run the parent class's key check, then reject a recurrent configuration whose
    sequence_length is larger than its batch_size.
    :raises UnityTrainerException: If use_recurrent is truthy and sequence_length
        exceeds batch_size.
    """
    super()._check_param_keys()
    params = self.trainer_parameters
    # Compare first and consult use_recurrent second, so the lookups happen in
    # the same order as before.
    sequence_exceeds_batch = params["sequence_length"] > params["batch_size"]
    if sequence_exceeds_batch and params["use_recurrent"]:
        raise UnityTrainerException(
            "batch_size must be greater than or equal to sequence_length when use_recurrent is True."
        )
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Takes a trajectory and processes it, putting it into the update buffer.

value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
lambd=self.hyperparameters.lambd,
)
local_return = local_advantage + local_value_estimates
# This is later used as the target for the different value estimates

:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.trainer_parameters["buffer_size"]
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""

# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.trainer_parameters["batch_size"]
- self.trainer_parameters["batch_size"] % self.policy.sequence_length
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()

num_epoch = self.trainer_parameters["num_epoch"]
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)

policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.trainer_settings,
self.is_training,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO

if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

6
ml-agents/mlagents/trainers/run_experiment.py


import argparse
from typing import Optional, List
from mlagents.trainers.learn import RunOptions, run_cli, load_config
from mlagents.trainers.learn import run_cli
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.cli_utils import load_config
def parse_command_line(argv: Optional[List[str]] = None) -> argparse.Namespace:

"""
args = parse_command_line()
expt_config = load_config(args.experiment_config_path)
run_cli(RunOptions(**expt_config))
run_cli(RunOptions.from_dict(expt_config))
if __name__ == "__main__":

38
ml-agents/mlagents/trainers/sac/optimizer.py


import numpy as np
from typing import Dict, List, Optional, Any, Mapping
from typing import Dict, List, Optional, Any, Mapping, cast
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, SACSettings
EPSILON = 1e-6 # Small value to avoid divide by zero

class SACOptimizer(TFOptimizer):
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate SAC agent model for the environment.

with policy.graph.as_default():
with tf.variable_scope(""):
super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "constant")
hyperparameters: SACSettings = cast(
SACSettings, trainer_params.hyperparameters
lr = hyperparameters.learning_rate
lr_schedule = hyperparameters.learning_rate_schedule
max_step = trainer_params.max_steps
self.tau = hyperparameters.tau
self.init_entcoef = hyperparameters.init_entcoef
self.act_size = self.policy.act_size
h_size = int(trainer_params["hidden_units"])
max_step = float(trainer_params["max_steps"])
num_layers = int(trainer_params["num_layers"])
vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.tau = trainer_params.get("tau", 0.005)
self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
self.act_size = policy.act_size
policy_network_settings = policy.network_settings
h_size = policy_network_settings.hidden_units
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.tau = hyperparameters.tau
self.burn_in_ratio = 0.0
# Non-exposed SAC parameters
self.discrete_target_entropy_scale = (

self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
_val["gamma"] for _val in trainer_params["reward_signals"].values()
_val.gamma for _val in trainer_params.reward_signals.values()
]
self.use_dones_in_backup = {
name: tf.Variable(1.0) for name in stream_names

100
ml-agents/mlagents/trainers/sac/trainer.py


# and implemented in https://github.com/hill-a/stable-baselines
from collections import defaultdict
from typing import Dict
from typing import Dict, cast
import os
import numpy as np

from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
DEFAULT_STEPS_PER_UPDATE = 1
class SACTrainer(RLTrainer):

self,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,

Responsible for collecting experiences and training SAC model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with

brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
self.param_keys = [
"batch_size",
"buffer_size",
"buffer_init_steps",
"hidden_units",
"learning_rate",
"init_entcoef",
"max_steps",
"normalize",
"num_layers",
"time_horizon",
"steps_per_update",
"sequence_length",
"summary_freq",
"tau",
"use_recurrent",
"memory_size",
"output_path",
"reward_signals",
]
self._check_param_keys()
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)
self.update_steps = max(1, self.trainer_parameters["buffer_init_steps"])
self.reward_signal_update_steps = max(
1, self.trainer_parameters["buffer_init_steps"]
)
self.update_steps = max(1, self.hyperparameters.buffer_init_steps)
self.reward_signal_update_steps = max(1, self.hyperparameters.buffer_init_steps)
self.steps_per_update = (
trainer_parameters["steps_per_update"]
if "steps_per_update" in trainer_parameters
else DEFAULT_STEPS_PER_UPDATE
)
self.steps_per_update = self.hyperparameters.steps_per_update
trainer_parameters["reward_signals"]["reward_signal_steps_per_update"]
if "reward_signal_steps_per_update" in trainer_parameters["reward_signals"]
else self.steps_per_update
self.hyperparameters.reward_signal_steps_per_update
self.checkpoint_replay_buffer = (
trainer_parameters["save_replay_buffer"]
if "save_replay_buffer" in trainer_parameters
else False
)
def _check_param_keys(self):
super()._check_param_keys()
# Check that batch size is greater than sequence length. Else, throw
# an exception.
if (
self.trainer_parameters["sequence_length"]
> self.trainer_parameters["batch_size"]
and self.trainer_parameters["use_recurrent"]
):
raise UnityTrainerException(
"batch_size must be greater than or equal to sequence_length when use_recurrent is True."
)
self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
def save_model(self, name_behavior_id: str) -> None:
"""

Save the training buffer's update buffer to a pickle file.
"""
filename = os.path.join(
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
self.trainer_settings.output_path, "last_replay_buffer.hdf5"
)
logger.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:

Loads the last saved replay buffer from a file.
"""
filename = os.path.join(
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
self.trainer_settings.output_path, "last_replay_buffer.hdf5"
)
logger.info("Loading Experience Replay Buffer from {}".format(filename))
with open(filename, "rb+") as file_object:

:return: A boolean corresponding to whether or not _update_policy() can be run
"""
return (
self.update_buffer.num_experiences >= self.trainer_parameters["batch_size"]
and self.step >= self.trainer_parameters["buffer_init_steps"]
self.update_buffer.num_experiences >= self.hyperparameters.batch_size
and self.step >= self.hyperparameters.buffer_init_steps
)
@timed

policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.trainer_settings,
self.is_training,
self.load,
tanh_squash=True,

has_updated = False
self.cumulative_returns_since_policy_update.clear()
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
batch_update_stats: Dict[str, list] = defaultdict(list)

if (
self.update_buffer.num_experiences
>= self.trainer_parameters["batch_size"]
):
if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
self.trainer_parameters["batch_size"],
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
# Get rewards for each reward

# Truncate update buffer if necessary. Truncate more than we need to, to avoid truncating
# a large buffer at each update.
if self.update_buffer.num_experiences > self.trainer_parameters["buffer_size"]:
if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
int(self.trainer_parameters["buffer_size"] * BUFFER_TRUNCATE_PERCENT)
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
)
return has_updated

"""
buffer = self.update_buffer
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
batch_update_stats: Dict[str, list] = defaultdict(list)
while (

# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(
self.trainer_parameters["batch_size"],
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
update_stats = self.optimizer.update_reward_signals(

if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.policy = policy
self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

39
ml-agents/mlagents/trainers/tests/test_barracuda_converter.py


import os
import tempfile
import pytest
import yaml
from mlagents.trainers.settings import TrainerSettings
from mlagents.tf_utils import tf
from mlagents.model_serialization import SerializationSettings, export_policy_model

os.remove(tmpfile)
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
def test_policy_conversion(tmpdir, rnn, visual, discrete):
dummy_config["output_path"] = os.path.join(tmpdir, "test")
dummy_config = TrainerSettings(output_path=os.path.join(tmpdir, "test"))
policy = create_policy_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

100
ml-agents/mlagents/trainers/tests/test_bcmodule.py


import mlagents.trainers.tests.mock_brain as mb
import numpy as np
import yaml
def ppo_dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
behavioral_cloning:
demo_path: ./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 1.0
steps: 10000000
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
from mlagents.trainers.settings import (
TrainerSettings,
BehavioralCloningSettings,
NetworkSettings,
)
def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
def create_bc_module(mock_brain, bc_settings, use_rnn, tanhresample):
trainer_config["output_path"] = "testpath"
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["behavioral_cloning"]["demo_path"] = (
os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
trainer_config = TrainerSettings()
trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
policy = NNPolicy(
0, mock_brain, trainer_config, False, False, tanhresample, tanhresample
)

policy_learning_rate=trainer_config["learning_rate"],
default_batch_size=trainer_config["batch_size"],
policy_learning_rate=trainer_config.hyperparameters.learning_rate,
default_batch_size=trainer_config.hyperparameters.batch_size,
**trainer_config["behavioral_cloning"],
settings=bc_settings,
)
policy.initialize_or_load() # Normally the optimizer calls this after the BCModule is created
return bc_module

def test_bcmodule_defaults():
# See if default values match
mock_brain = mb.create_mock_3dball_brain()
trainer_config = ppo_dummy_config()
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
)
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
assert bc_module.batch_size == trainer_config["batch_size"]
assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
trainer_config["behavioral_cloning"]["num_epoch"] = 100
trainer_config["behavioral_cloning"]["batch_size"] = 10000
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
num_epoch=100,
batch_size=10000,
)
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
assert bc_module.num_epoch == 100
assert bc_module.batch_size == 10000

def test_bcmodule_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), False, "test.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
trainer_config = ppo_dummy_config()
trainer_config["behavioral_cloning"]["steps"] = 0
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", is_sac)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
steps=0,
)
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), True, "test.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), False, "testdcvis.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), True, "testdcvis.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

94
ml-agents/mlagents/trainers/tests/test_curriculum.py


import io
import json
from unittest.mock import patch, mock_open
from mlagents.trainers.exception import CurriculumConfigError, CurriculumLoadingError
from mlagents.trainers.exception import CurriculumConfigError
from mlagents.trainers.settings import CurriculumSettings
dummy_curriculum_json_str = """
{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : true,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20, 15],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}
"""
dummy_curriculum_config = json.loads(dummy_curriculum_json_str)
bad_curriculum_json_str = """
{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : false,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}
"""
dummy_curriculum_config = CurriculumSettings(
measure="reward",
thresholds=[10, 20, 50],
min_lesson_length=3,
signal_smoothing=True,
parameters={
"param1": [0.7, 0.5, 0.3, 0.1],
"param2": [100, 50, 20, 15],
"param3": [0.2, 0.3, 0.7, 0.9],
},
)
dummy_curriculum_config_path = "TestBrain.json"
bad_curriculum_config = CurriculumSettings(
measure="reward",
thresholds=[10, 20, 50],
min_lesson_length=3,
signal_smoothing=False,
parameters={
"param1": [0.7, 0.5, 0.3, 0.1],
"param2": [100, 50, 20],
"param3": [0.2, 0.3, 0.7, 0.9],
},
)
@pytest.fixture

assert curriculum.brain_name == "TestBrain"
assert curriculum.lesson_num == 0
assert curriculum.measure == "reward"
@patch("builtins.open", new_callable=mock_open, read_data=bad_curriculum_json_str)
def test_load_bad_curriculum_file_raises_error(mock_file):
with pytest.raises(CurriculumConfigError):
Curriculum(
"TestBrain", Curriculum.load_curriculum_file(dummy_curriculum_config_path)
)
def test_increment_lesson():

assert curriculum.get_config(0) == {"param1": 0.7, "param2": 100, "param3": 0.2}
# Test JSON loading and error handling. These examples don't need to be valid config files.
def test_curriculum_load_good():
expected = {"x": 1}
value = json.dumps(expected)
fp = io.StringIO(value)
assert expected == Curriculum._load_curriculum(fp)
def test_curriculum_load_missing_file():
with pytest.raises(CurriculumLoadingError):
Curriculum.load_curriculum_file("notAValidFile.json")
def test_curriculum_load_invalid_json():
# This isn't valid json because of the trailing comma
contents = """
{
"x": [1, 2, 3,]
}
"""
fp = io.StringIO(contents)
with pytest.raises(CurriculumLoadingError):
Curriculum._load_curriculum(fp)
def test_load_bad_curriculum_file_raises_error():
with pytest.raises(CurriculumConfigError):
Curriculum("TestBrain", bad_curriculum_config)

36
ml-agents/mlagents/trainers/tests/test_distributions.py


from mlagents.tf_utils import tf
import yaml
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
VECTOR_ACTION_SPACE = [2]

40
ml-agents/mlagents/trainers/tests/test_ghost.py


import numpy as np
import yaml
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 5
play_against_current_self_ratio: 0.5
save_steps: 1000
swap_steps: 1000
"""
)
return TrainerSettings(self_play=SelfPlaySettings())
VECTOR_ACTION_SPACE = [1]

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

133
ml-agents/mlagents/trainers/tests/test_learn.py


from unittest.mock import MagicMock, patch, mock_open
from mlagents.trainers import learn
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.learn import parse_command_line, DetectDefault
from mlagents.trainers.learn import parse_command_line
from mlagents.trainers.cli_utils import DetectDefault
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.stats import StatsReporter

MOCK_PARAMETER_YAML = """
behaviors:
{}
env_path: "./oldenvfile"
keep_checkpoints: 34
lesson: 2
run_id: uselessrun
save_freq: 654321
seed: 9870
base_port: 4001
num_envs: 4
env_settings:
env_path: "./oldenvfile"
num_envs: 4
base_port: 4001
seed: 9870
checkpoint_settings:
lesson: 2
run_id: uselessrun
save_freq: 654321
behaviors:
parameter_randomization:
sampler1: foo
curriculum:
curriculum:
curriculum1
parameters:
foo: [0.2, 0.5]
curriculum:
curriculum2
parameter_randomization:
sampler1
parameters:
foo: [0.2, 0.5]
"""

@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")
@patch("mlagents.trainers.learn.create_environment_factory")
@patch("mlagents.trainers.learn.load_config")
@patch("mlagents.trainers.settings.load_config")
def test_run_training(
load_config,
create_environment_factory,

@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
def test_commandline_args(mock_file):
# No args raises
with pytest.raises(SystemExit):
parse_command_line([])
# with pytest.raises(SystemExit):
# parse_command_line([])
assert opt.env_path is None
assert opt.env_settings.env_path is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.resume is False
assert opt.inference is False
assert opt.run_id == "ppo"
assert opt.save_freq == 50000
assert opt.seed == -1
assert opt.base_port == 5005
assert opt.num_envs == 1
assert opt.no_graphics is False
assert opt.checkpoint_settings.lesson == 0
assert opt.checkpoint_settings.resume is False
assert opt.checkpoint_settings.inference is False
assert opt.checkpoint_settings.run_id == "ppo"
assert opt.checkpoint_settings.save_freq == 50000
assert opt.env_settings.seed == -1
assert opt.env_settings.base_port == 5005
assert opt.env_settings.num_envs == 1
assert opt.engine_settings.no_graphics is False
assert opt.env_args is None
assert opt.env_settings.env_args is None
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",

opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.env_settings.env_path == "./myenvfile"
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.checkpoint_settings.lesson == 3
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2
assert opt.engine_settings.no_graphics is True
assert opt.inference is True
assert opt.resume is True
assert opt.checkpoint_settings.inference is True
assert opt.checkpoint_settings.resume is True
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_PARAMETER_YAML)

opt = parse_command_line(["mytrainerpath"])
assert opt.behaviors == {}
assert opt.env_path == "./oldenvfile"
assert opt.env_settings.env_path == "./oldenvfile"
assert opt.keep_checkpoints == 34
assert opt.lesson == 2
assert opt.run_id == "uselessrun"
assert opt.save_freq == 654321
assert opt.seed == 9870
assert opt.base_port == 4001
assert opt.num_envs == 4
assert opt.no_graphics is False
assert opt.checkpoint_settings.lesson == 2
assert opt.checkpoint_settings.run_id == "uselessrun"
assert opt.checkpoint_settings.save_freq == 654321
assert opt.env_settings.seed == 9870
assert opt.env_settings.base_port == 4001
assert opt.env_settings.num_envs == 4
assert opt.engine_settings.no_graphics is False
assert opt.env_args is None
assert opt.env_settings.env_args is None
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",

opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.env_settings.env_path == "./myenvfile"
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.checkpoint_settings.lesson == 3
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2
assert opt.engine_settings.no_graphics is True
assert opt.inference is True
assert opt.resume is True
assert opt.checkpoint_settings.inference is True
assert opt.checkpoint_settings.resume is True
assert opt.parameter_randomization == "sampler1"
assert opt.parameter_randomization == {"sampler1": "foo"}
assert len(opt.curriculum.keys()) == 2
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)

]
opt = parse_command_line(full_args)
assert opt.env_args == ["--foo=bar", "--blah", "baz", "100"]
assert opt.env_settings.env_args == ["--foo=bar", "--blah", "baz", "100"]

55
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


from unittest.mock import patch, Mock
from mlagents.trainers.meta_curriculum import MetaCurriculum
import json
import yaml
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str
from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
BRAIN_NAME,
PPO_CONFIG,
)
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_config
from mlagents.trainers.settings import CurriculumSettings
@pytest.fixture

def test_curriculum_config(param_name="test_param1", min_lesson_length=100):
return {
"measure": "progress",
"thresholds": [0.1, 0.3, 0.5],
"min_lesson_length": min_lesson_length,
"signal_smoothing": True,
"parameters": {f"{param_name}": [0.0, 4.0, 6.0, 8.0]},
}
return CurriculumSettings(
thresholds=[0.1, 0.3, 0.5],
min_lesson_length=min_lesson_length,
parameters={f"{param_name}": [0.0, 4.0, 6.0, 8.0]},
)
test_meta_curriculum_config = {

assert meta_curriculum.get_config() == {"test_param1": 0.0, "test_param2": 0.0}
TRAINER_CONFIG = """
default:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 5.0e-3
max_steps: 100
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 50
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
curriculum_config = json.loads(dummy_curriculum_json_str)
mc = MetaCurriculum({curriculum_brain_name: curriculum_config})
trainer_config = yaml.safe_load(TRAINER_CONFIG)
mc = MetaCurriculum({curriculum_brain_name: dummy_curriculum_config})
env, trainer_config, meta_curriculum=mc, success_threshold=None
env, {BRAIN_NAME: PPO_CONFIG}, meta_curriculum=mc, success_threshold=None
)

72
ml-agents/mlagents/trainers/tests/test_nn_policy.py


import pytest
import os
from typing import Dict, Any
import yaml
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, ModelUtils

from mlagents.trainers.settings import TrainerSettings, NetworkSettings
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
VECTOR_ACTION_SPACE = [2]

def create_policy_mock(
dummy_config: Dict[str, Any],
dummy_config: TrainerSettings,
use_rnn: bool = False,
use_discrete: bool = True,
use_visual: bool = False,

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(seed, mock_brain, trainer_parameters, False, load)
trainer_settings = dummy_config
trainer_settings.keep_checkpoints = 3
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(seed, mock_brain, trainer_settings, False, load)
def test_load_save(dummy_config, tmp_path):
def test_load_save(tmp_path):
trainer_params = dummy_config
trainer_params["output_path"] = path1
trainer_params = TrainerSettings(output_path=path1)
policy = create_policy_mock(trainer_params)
policy.initialize_or_load()
policy._set_step(2000)

assert policy2.get_current_step() == 2000
# Try initialize from path 1
trainer_params["model_path"] = path2
trainer_params["init_path"] = path1
trainer_params.output_path = path2
trainer_params.init_path = path1
policy3 = create_policy_mock(trainer_params, load=False, seed=2)
policy3.initialize_or_load()

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_policy_evaluate(dummy_config, rnn, visual, discrete):
def test_policy_evaluate(rnn, visual, discrete):
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
decision_step, terminal_step = mb.create_steps_from_brainparams(
policy.brain, num_agents=NUM_AGENTS

assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
def test_normalization(dummy_config):
def test_normalization():
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,

vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
time_horizon = 6
trajectory = make_fake_trajectory(

# Change half of the obs to 0
for i in range(3):
trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
policy = policy = NNPolicy(0, brain_params, dummy_config, False, False)
policy = NNPolicy(
0,
brain_params,
TrainerSettings(network_settings=NetworkSettings(normalize=True)),
False,
False,
)
trajectory_buffer = trajectory.to_agentbuffer()
policy.update_normalization(trajectory_buffer["vector_obs"])

11
ml-agents/mlagents/trainers/tests/test_policy.py


from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.action_info import ActionInfo
from unittest.mock import MagicMock
from mlagents.trainers.settings import TrainerSettings
import numpy as np

return mock_brain
def basic_params():
return {"use_recurrent": False, "output_path": "my/path"}
class FakePolicy(TFPolicy):
def create_tf_graph(self):
pass

def test_take_action_returns_empty_with_no_agents():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
# Doesn't really matter what this is
dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
no_agent_step = DecisionSteps.empty(dummy_groupspec)

def test_take_action_returns_nones_on_missing_values():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
step_with_agents = DecisionSteps(

def test_take_action_returns_action_info_when_available():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),

99
ml-agents/mlagents/trainers/tests/test_ppo.py


import numpy as np
from mlagents.tf_utils import tf
import yaml
import copy
import attr
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import NetworkSettings, TrainerSettings, PPOSettings
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,

@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 16
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 10
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(PPO_CONFIG)
VECTOR_ACTION_SPACE = [2]

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = attr.evolve(dummy_config)
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811
dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811
dummy_config["reward_signals"].update(curiosity_dummy_config)
dummy_config.reward_signals = curiosity_dummy_config
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(gail_dummy_config)
dummy_config.reward_signals = gail_dummy_config
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)

@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_trainer_increment_step(ppo_optimizer, dummy_config):
trainer_params = dummy_config
def test_trainer_increment_step(ppo_optimizer):
trainer_params = PPO_CONFIG
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer

policy_mock = mock.Mock(spec=NNPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (
5
) # 10 hacked because this function is no longer called through trainer
5 # 10 hacked because this function is no longer called through trainer
)
policy_mock.increment_step = mock.Mock(return_value=step_count)
trainer.add_policy("testbehavior", policy_mock)

@pytest.mark.parametrize("use_discrete", [True, False])
def test_trainer_update_policy(dummy_config, use_discrete):
def test_trainer_update_policy(
dummy_config, curiosity_dummy_config, use_discrete # noqa: F811
):
mock_brain = mb.setup_mock_brain(
use_discrete,
False,

)
trainer_params = dummy_config
trainer_params["use_recurrent"] = True
trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
memory_size=10, sequence_length=16
)
trainer_params["reward_signals"]["curiosity"] = {}
trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer_params.reward_signals = curiosity_dummy_config
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

trainer.add_policy(brain_params, policy)
def test_bad_config(dummy_config):
# TODO: Move this to test_settings.py
def test_bad_config():
dummy_config["sequence_length"] = 64
dummy_config["batch_size"] = 32
dummy_config["use_recurrent"] = True
with pytest.raises(UnityTrainerException):
with pytest.raises(TrainerConfigError):
TrainerSettings(
network_settings=NetworkSettings(
memory=NetworkSettings.MemorySettings(sequence_length=64)
),
hyperparameters=PPOSettings(batch_size=32),
)
_ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")

141
ml-agents/mlagents/trainers/tests/test_reward_signals.py


import pytest
import yaml
import copy
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
from mlagents.trainers.settings import (
GAILSettings,
CuriositySettings,
RewardSignalSettings,
BehavioralCloningSettings,
NetworkSettings,
TrainerType,
RewardSignalType,
)
CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"

return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(PPO_CONFIG)
return yaml.safe_load(
"""
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 3.0e-4
max_steps: 5.0e4
memory_size: 256
normalize: false
steps_per_update: 1
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(SAC_CONFIG)
return {
"gail": {
"strength": 0.1,
"gamma": 0.9,
"encoding_size": 128,
"use_vail": True,
"demo_path": CONTINUOUS_PATH,
}
}
return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)}
return {"curiosity": {"strength": 0.1, "gamma": 0.9, "encoding_size": 128}}
return {RewardSignalType.CURIOSITY: CuriositySettings()}
@pytest.fixture
def extrinsic_dummy_config():
    """Provide a reward-signal mapping that contains only the extrinsic signal.

    The single entry is keyed by ``RewardSignalType.EXTRINSIC`` and holds a
    ``RewardSignalSettings`` instance constructed with no arguments.
    """
    extrinsic_settings = RewardSignalSettings()
    return {RewardSignalType.EXTRINSIC: extrinsic_settings}
VECTOR_ACTION_SPACE = [2]

vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = trainer_config
model_path = "testpath"
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = trainer_config
trainer_settings.reward_signals = reward_signal_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
if trainer_parameters["trainer"] == "sac":
optimizer = SACOptimizer(policy, trainer_parameters)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": CONTINUOUS_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
trainer_config.behavioral_cloning = BehavioralCloningSettings(
demo_path=CONTINUOUS_PATH
)
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config, False, False, False

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config):
gail_dummy_config["gail"]["demo_path"] = DISCRETE_PATH
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": DISCRETE_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
)
gail_dummy_config_discrete = {
RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH)
}
trainer_config, gail_dummy_config, False, True, True
trainer_config, gail_dummy_config_discrete, False, True, True
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config):
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": CONTINUOUS_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
)
policy = create_optimizer_mock(
trainer_config, gail_dummy_config, True, False, False
)

@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, curiosity_dummy_config):
def test_extrinsic(trainer_config, extrinsic_dummy_config):
trainer_config, curiosity_dummy_config, False, False, False
trainer_config, extrinsic_dummy_config, False, False, False
)
reward_signal_eval(policy, "extrinsic")
reward_signal_update(policy, "extrinsic")

18
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


import yaml
from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb

def dummy_config():
return yaml.safe_load(
"""
output_path: "test/"
summary_freq: 1000
max_steps: 100
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
from mlagents.trainers.settings import TrainerSettings
def create_mock_brain():

def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = FakeTrainer(mock_brainparams, dummy_config(), True, 0)
trainer = FakeTrainer(mock_brainparams, TrainerSettings(max_steps=100), True, 0)
trainer.set_is_policy_updating(True)
return trainer

83
ml-agents/mlagents/trainers/tests/test_sac.py


import pytest
from unittest import mock
import yaml
import copy
from mlagents.tf_utils import tf

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.tests.test_simple_rl import SAC_CONFIG
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
)
return yaml.safe_load(
"""
trainer: sac
batch_size: 8
buffer_size: 10240
buffer_init_steps: 0
hidden_units: 32
init_entcoef: 0.1
learning_rate: 3.0e-4
max_steps: 1024
memory_size: 10
normalize: true
steps_per_update: 1
num_layers: 1
time_horizon: 64
sequence_length: 16
summary_freq: 1000
tau: 0.005
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(SAC_CONFIG)
VECTOR_ACTION_SPACE = [2]

vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = dummy_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
optimizer = SACOptimizer(policy, trainer_parameters)
optimizer = SACOptimizer(policy, trainer_settings)
return optimizer

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
def test_sac_update_reward_signals(dummy_config, discrete):
def test_sac_update_reward_signals(
dummy_config, curiosity_dummy_config, discrete # noqa: F811
):
dummy_config["reward_signals"]["curiosity"] = {}
dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
dummy_config.reward_signals = curiosity_dummy_config
optimizer = create_sac_optimizer_mock(
dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
)

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_params = dummy_config
trainer_params["output_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer_params.hyperparameters.save_replay_buffer = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)

mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
dummy_config["steps_per_update"] = 20
dummy_config.hyperparameters.steps_per_update = 20
dummy_config.hyperparameters.buffer_init_steps = 0
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

trainer.advance()
with pytest.raises(AgentManagerQueue.Empty):
policy_queue.get_nowait()
def test_bad_config(dummy_config):
    """Check that SACTrainer construction raises UnityTrainerException.

    The config is made recurrent with a sequence length (64) larger than the
    batch size (32) before the trainer is constructed.
    """
    # Test that we throw an error if we have sequence length greater than batch size
    invalid_overrides = (
        ("sequence_length", 64),
        ("batch_size", 32),
        ("use_recurrent", True),
        ("output_path", "./results/test_trainer_models/TestModel"),
    )
    for option, value in invalid_overrides:
        dummy_config[option] = value

    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    with pytest.raises(UnityTrainerException):
        SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
if __name__ == "__main__":

388
ml-agents/mlagents/trainers/tests/test_simple_rl.py


import math
import tempfile
import pytest
import yaml
from typing import Dict, Any
import attr
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,

from mlagents.trainers.sampler_class import SamplerManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
SACSettings,
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
TrainerType,
RewardSignalType,
)
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)

BRAIN_NAME = "1D"
PPO_CONFIG = f"""
{BRAIN_NAME}:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 32
lambd: 0.95
learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 3000
memory_size: 16
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 500
use_recurrent: false
threaded: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
SAC_CONFIG = f"""
{BRAIN_NAME}:
trainer: sac
batch_size: 8
buffer_size: 5000
buffer_init_steps: 100
hidden_units: 16
init_entcoef: 0.01
learning_rate: 5.0e-3
max_steps: 1000
memory_size: 16
normalize: false
steps_per_update: 1
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 100
tau: 0.01
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple
threaded: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
# Baseline PPO TrainerSettings for the tests in this module. Tests below derive
# their variants from it with attr.evolve(...) instead of editing it in place.
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
def generate_config(
    config: str, override_vals: Dict[str, Any] = None
) -> Dict[str, Any]:
    """Parse a YAML trainer config and optionally patch its BRAIN_NAME section.

    :param config: YAML text, parsed with ``yaml.safe_load``.
    :param override_vals: Values merged into the ``BRAIN_NAME`` entry of the
        parsed config with ``dict.update``; ``None`` skips the merge.
    :return: The parsed (and possibly patched) configuration.
    """
    parsed = yaml.safe_load(config)
    if override_vals is None:
        return parsed
    parsed[BRAIN_NAME].update(override_vals)
    return parsed
# Baseline SAC TrainerSettings for the tests in this module. Tests below derive
# their variants from it with attr.evolve(...) instead of editing it in place.
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
# The reward processor is passed as an argument to _check_environment_trains.

StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
# Make sure threading is turned off for determinism
trainer_config["threading"] = False
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(

keep_checkpoints=1,
train_model=True,
load_model=False,
seed=seed,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

num_vector=0,
step_size=0.2,
)
override_vals = {"learning_rate": 3.0e-4}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"max_steps": 500,
"summary_freq": 100,
}
config = generate_config(PPO_CONFIG, override_vals)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=500,
summary_freq=100,
)
_check_environment_trains(env, config, success_threshold=0.5)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
override_vals = {
"max_steps": 5000,
"batch_size": 64,
"buffer_size": 128,
"learning_rate": 1e-3,
"use_recurrent": True,
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
new_network_settings = attr.evolve(
PPO_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
config = generate_config(SAC_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(SAC_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
override_vals = {"buffer_init_steps": 2000, "max_steps": 10000}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.8)
new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])

num_vector=0,
step_size=0.2,
)
override_vals = {"batch_size": 16, "learning_rate": 3e-4}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"batch_size": 16,
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"buffer_init_steps": 0,
"max_steps": 100,
}
config = generate_config(SAC_CONFIG, override_vals)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=16,
learning_rate=3e-4,
buffer_init_steps=0,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
)
_check_environment_trains(env, config, success_threshold=0.5)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
override_vals = {
"batch_size": 64,
"use_recurrent": True,
"max_steps": 5000,
"learning_rate": 1e-3,
"buffer_init_steps": 500,
"steps_per_update": 2,
}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=32),
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=64,
learning_rate=1e-3,
buffer_init_steps=500,
steps_per_update=2,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=None)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]

env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
override_vals = {
"max_steps": 4000,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 10000,
"swap_steps": 10000,
"team_change": 4000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,
save_steps=10000,
swap_steps=10000,
team_change=400,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 0.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config, success_threshold=None)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=0.0,
save_steps=5000,
swap_steps=5000,
team_change=2000,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2000)
_check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]

def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
override_vals = {
"max_steps": 500,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(trainer_config, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
config = attr.evolve(
trainer_config,
reward_signals=reward_signals,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

use_discrete=use_discrete,
step_size=0.2,
)
override_vals = {
"max_steps": 1000,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4)
config = attr.evolve(
PPO_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=1000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

use_discrete=use_discrete,
step_size=0.2,
)
override_vals = {
"max_steps": 500,
"batch_size": 16,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
)
config = attr.evolve(
SAC_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

4
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
PPO_CONFIG,
generate_config,
DebugWriter,
)

env_manager = SubprocessEnvManager(
simple_env_factory, EngineConfig.default_config(), num_envs
)
trainer_config = generate_config(PPO_CONFIG, override_vals={"max_steps": 5000})
trainer_config,
{"1D": PPO_CONFIG},
env_manager=env_manager,
success_threshold=None,
)

274
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import pytest
import yaml
from mlagents.trainers.trainer_util import (
load_config,
_load_config,
assemble_curriculum_config,
)
from mlagents.trainers.cli_utils import load_config, _load_config
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
return yaml.safe_load(
"""
default:
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
use_curiosity: false
curiosity_strength: 0.0
curiosity_enc_size: 1
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
@pytest.fixture
def dummy_config_with_override(dummy_config):
    """Add a ``testbrain`` section to ``dummy_config`` that sets ``normalize`` to False.

    The incoming config is modified in place and the same object is returned.
    """
    dummy_config["testbrain"] = {"normalize": False}
    return dummy_config
@pytest.fixture
def dummy_bad_config():
    """Return a parsed config whose ``trainer`` entry is ``incorrect_trainer``.

    The YAML text is parsed with ``yaml.safe_load`` on every fixture request,
    so each test receives a fresh object.
    """
    bad_yaml = """
default:
trainer: incorrect_trainer
brain_to_imitate: ExpertBrain
batches_per_epoch: 16
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
"""
    return yaml.safe_load(bad_yaml)
@patch("mlagents.trainers.brain.BrainParameters")
def test_initialize_trainer_parameters_override_defaults(
    BrainParametersMock, dummy_config_with_override
):
    """Check that TrainerFactory hands PPOTrainer the expected arguments.

    ``PPOTrainer.__init__`` is replaced by a stand-in that asserts on every
    argument it receives, including the ``normalize`` value overridden by the
    ``testbrain`` section of the config.
    """
    run_id = "testrun"
    output_path = "model_dir"
    keep_checkpoints = 1
    train_model = True
    load_model = False
    seed = 11
    expected_reward_buff_cap = 1

    base_config = dummy_config_with_override
    # NOTE(review): expected_config is the same dict object as
    # base_config["default"], so the edits below also change the config that
    # is passed to the factory -- confirm this aliasing is intended.
    expected_config = base_config["default"]
    expected_config["output_path"] = output_path + "/testbrain"
    expected_config["keep_checkpoints"] = keep_checkpoints

    # Override value from specific brain config
    expected_config["normalize"] = False

    brain_params_mock = BrainParametersMock()
    BrainParametersMock.return_value.brain_name = "testbrain"
    external_brains = {"testbrain": brain_params_mock}

    # mock_constructor's parameters shadow the outer ``seed`` and ``run_id``,
    # so the expected values are captured under separate names; comparing the
    # parameters with themselves could never fail.
    expected_seed = seed
    expected_run_id = run_id

    def mock_constructor(
        self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
    ):
        assert brain == brain_params_mock.brain_name
        assert trainer_parameters == expected_config
        assert reward_buff_cap == expected_reward_buff_cap
        assert training == train_model
        assert load == load_model
        assert seed == expected_seed
        assert run_id == expected_run_id

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainer_factory = trainer_util.TrainerFactory(
            trainer_config=base_config,
            run_id=run_id,
            output_path=output_path,
            keep_checkpoints=keep_checkpoints,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
        )
        trainers = {}
        for brain_name, brain_parameters in external_brains.items():
            trainers[brain_name] = trainer_factory.generate(
                brain_parameters.brain_name
            )
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], PPOTrainer)
return RunOptions(behaviors={"testbrain": PPO_CONFIG})
@patch("mlagents.trainers.brain.BrainParameters")

external_brains = {"testbrain": BrainParametersMock()}
run_id = "testrun"
output_path = "results_dir"
keep_checkpoints = 1
base_config = dummy_config
expected_config = base_config["default"]
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
base_config = dummy_config.behaviors
expected_config = PPO_CONFIG
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
self, brain, reward_buff_cap, trainer_settings, training, load, seed, run_id
assert trainer_parameters == expected_config
assert trainer_settings == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model
assert load == load_model

trainer_config=base_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,

@patch("mlagents.trainers.brain.BrainParameters")
def test_initialize_invalid_trainer_raises_exception(
    BrainParametersMock, dummy_bad_config
):
    """Check that building trainers from a bad config raises.

    Three variants of the config are tried in turn: the unknown trainer name
    from the fixture, no ``trainer`` key at all, and ``offline_bc``.
    """
    bad_config = dummy_bad_config
    BrainParametersMock.return_value.brain_name = "testbrain"
    external_brains = {"testbrain": BrainParametersMock()}

    def build_all_trainers():
        # Create a factory from the current state of bad_config and ask it
        # for a trainer for every registered brain.
        factory = trainer_util.TrainerFactory(
            trainer_config=bad_config,
            run_id="testrun",
            output_path="results_dir",
            keep_checkpoints=1,
            train_model=True,
            load_model=False,
            seed=11,
        )
        return {
            name: factory.generate(params.brain_name)
            for name, params in external_brains.items()
        }

    with pytest.raises(TrainerConfigError):
        build_all_trainers()

    # Test no trainer specified
    del bad_config["default"]["trainer"]
    with pytest.raises(TrainerConfigError):
        build_all_trainers()

    # Test BC trainer specified
    bad_config["default"]["trainer"] = "offline_bc"
    with pytest.raises(UnityTrainerException):
        build_all_trainers()
def test_handles_no_default_section(dummy_config):
def test_handles_no_config_provided(BrainParametersMock):
Make sure the trainer setup handles a missing "default" in the config.
Make sure the trainer setup handles no configs provided at all.
no_default_config = {brain_name: dummy_config["default"]}
no_default_config = RunOptions().behaviors
brain_parameters = BrainParameters(
brain_name=brain_name,
vector_observation_space_size=1,

trainer_config=no_default_config,
run_id="testrun",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,
seed=42,

def test_raise_if_no_config_for_brain(dummy_config):
    """
    A config that has neither a "default" section nor a section for the
    requested brain must produce a TrainerConfigError when a trainer is generated.
    """
    brain_name = "testbrain"
    # Only an unrelated brain is configured; "default" is deliberately absent.
    bad_config = {"some_other_brain": dummy_config["default"]}
    brain_parameters = BrainParameters(
        brain_name=brain_name,
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    factory_kwargs = dict(
        trainer_config=bad_config,
        run_id="testrun",
        output_path="output_path",
        keep_checkpoints=1,
        train_model=True,
        load_model=False,
        seed=42,
    )
    trainer_factory = trainer_util.TrainerFactory(**factory_kwargs)
    with pytest.raises(TrainerConfigError):
        trainer_factory.generate(brain_parameters)
def test_load_config_missing_file():
    """load_config must raise TrainerConfigError for a path that does not exist."""
    missing_path = "thisFileDefinitelyDoesNotExist.yaml"
    with pytest.raises(TrainerConfigError):
        load_config(missing_path)

with pytest.raises(TrainerConfigError):
fp = io.StringIO(file_contents)
_load_config(fp)
def test_assemble_curriculum_config():
    """
    Exercise assemble_curriculum_config on three configs: one with a curriculum
    per behavior, one without any curriculum, and a malformed one whose
    first-level values are not dicts.
    """
    # The YAML nesting below is significant: "curriculum" must be a child of
    # each behavior for the expected result to hold.
    file_contents = """
behavior1:
    curriculum:
        foo: 5
behavior2:
    curriculum:
        foo: 6
    """
    trainer_config = _load_config(file_contents)
    curriculum_config = assemble_curriculum_config(trainer_config)
    assert curriculum_config == {"behavior1": {"foo": 5}, "behavior2": {"foo": 6}}

    # Check that nothing is returned if no curriculum.
    file_contents = """
behavior1:
    foo: 3
behavior2:
    foo: 4
    """
    trainer_config = _load_config(file_contents)
    curriculum_config = assemble_curriculum_config(trainer_config)
    assert curriculum_config == {}

    # Check that method doesn't break if 1st level entity isn't a dict.
    # Note: this is a malformed configuration.
    file_contents = """
behavior1: 3
behavior2: 4
    """
    trainer_config = _load_config(file_contents)
    curriculum_config = assemble_curriculum_config(trainer_config)
    assert curriculum_config == {}
def test_existing_directories(tmp_path):

10
ml-agents/mlagents/trainers/trainer/rl_trainer.py


from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult
from mlagents_envs.timers import hierarchical_timer
from mlagents.trainers.agent_processor import AgentManagerQueue

def __init__(self, *args, **kwargs):
super(RLTrainer, self).__init__(*args, **kwargs)
# Make sure we have at least one reward_signal
if not self.trainer_parameters["reward_signals"]:
raise UnityTrainerException(
"No reward signals were defined. At least one must be used with {}.".format(
self.__class__.__name__
)
)
# collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
# used for reporting only. We always want to report the environment reward to Tensorboard, regardless
# of what reward signals are actually present.

}
self.update_buffer: AgentBuffer = AgentBuffer()
self._stats_reporter.add_property(
StatsPropertyType.HYPERPARAMETERS, self.trainer_parameters
StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
)
def end_episode(self) -> None:

28
ml-agents/mlagents/trainers/trainer/trainer.py


# # Unity ML-Agents Toolkit
from typing import Dict, List, Deque, Any
from typing import List, Deque
import abc
from collections import deque

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings
logger = get_logger(__name__)

def __init__(
self,
brain_name: str,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
run_id: str,
reward_buff_cap: int = 1,

:BrainParameters brain: Brain to be trained.
:dict trainer_parameters: The parameters for the trainer (dictionary).
:dict trainer_settings: The parameters for the trainer (dictionary).
:bool training: Whether the trainer is set for training.
:str run_id: The identifier of the current run
:int reward_buff_cap:

self.run_id = run_id
self.trainer_parameters = trainer_parameters
self._threaded = trainer_parameters.get("threaded", True)
self.trainer_settings = trainer_settings
self._threaded = trainer_settings.threaded
self._stats_reporter = StatsReporter(brain_name)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)

self.summary_freq = self.trainer_parameters["summary_freq"]
self.summary_freq = self.trainer_settings.summary_freq
self.next_summary_step = self.summary_freq
@property

"""
return self._stats_reporter
def _check_param_keys(self):
    """
    Verify that every key listed in self.param_keys is present in
    self.trainer_parameters.

    :raises UnityTrainerException: for the first listed key that is missing.
    """
    for required_key in self.param_keys:
        if required_key in self.trainer_parameters:
            continue
        raise UnityTrainerException(
            "The hyper-parameter {0} could not be found for the {1} trainer of "
            "brain {2}.".format(required_key, self.__class__, self.brain_name)
        )
def parameters(self) -> Dict[str, Any]:
def parameters(self) -> TrainerSettings:
return self.trainer_parameters
return self.trainer_settings
@property
def get_max_steps(self) -> int:

"""
return int(float(self.trainer_parameters["max_steps"]))
return self.trainer_settings.max_steps
@property
def get_step(self) -> int:

8
ml-agents/mlagents/trainers/trainer_controller.py


"""Launches trainers for each External Brains in a Unity Environment."""
import os
import sys
import threading
from typing import Dict, Optional, Set, List
from collections import defaultdict

from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManager
from mlagents.trainers.settings import CurriculumSettings
class TrainerController(object):

# Skip brains that are in the metacurriculum but no trainer yet.
if brain_name not in self.trainers:
continue
if curriculum.measure == "progress":
if curriculum.measure == CurriculumSettings.MeasureType.PROGRESS:
elif curriculum.measure == "reward":
elif curriculum.measure == CurriculumSettings.MeasureType.REWARD:
measure_val = np.mean(self.trainers[brain_name].reward_buffer)
brain_names_to_measure_vals[brain_name] = measure_val
else:

policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
trainer.parameters.time_horizon,
threaded=trainer.threaded,
)
env_manager.set_agent_manager(name_behavior_id, agent_manager)

106
ml-agents/mlagents/trainers/trainer_util.py


import os
import yaml
from typing import Any, Dict, TextIO
from typing import Dict
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.meta_curriculum import MetaCurriculum

from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.settings import TrainerSettings, TrainerType
logger = get_logger(__name__)

def __init__(
self,
trainer_config: Any,
trainer_config: Dict[str, TrainerSettings],
keep_checkpoints: int,
train_model: bool,
load_model: bool,
seed: int,

self.run_id = run_id
self.output_path = output_path
self.init_path = init_path
self.keep_checkpoints = keep_checkpoints
self.train_model = train_model
self.load_model = load_model
self.seed = seed

def generate(self, brain_name: str) -> Trainer:
return initialize_trainer(
self.trainer_config,
self.trainer_config[brain_name],
self.keep_checkpoints,
self.train_model,
self.load_model,
self.ghost_controller,

def initialize_trainer(
trainer_config: Any,
trainer_settings: TrainerSettings,
keep_checkpoints: int,
train_model: bool,
load_model: bool,
ghost_controller: GhostController,

Initializes a trainer given a provided trainer configuration and brain parameters, as well as
some general training session options.
:param trainer_config: Original trainer configuration loaded from YAML
:param trainer_settings: Original trainer configuration loaded from YAML
:param brain_name: Name of the brain to be associated with trainer
:param run_id: Run ID to associate with this training run
:param output_path: Path to save the model and summary statistics

:param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
:return:
"""
if "default" not in trainer_config and brain_name not in trainer_config:
raise TrainerConfigError(
f'Trainer config must have either a "default" section, or a section for the brain name {brain_name}. '
"See the config/ directory for examples."
)
trainer_parameters = trainer_config.get("default", {}).copy()
trainer_parameters["output_path"] = os.path.join(output_path, brain_name)
trainer_settings.output_path = os.path.join(output_path, brain_name)
trainer_parameters["init_path"] = os.path.join(init_path, brain_name)
trainer_parameters["keep_checkpoints"] = keep_checkpoints
if brain_name in trainer_config:
_brain_key: Any = brain_name
while not isinstance(trainer_config[_brain_key], dict):
_brain_key = trainer_config[_brain_key]
trainer_parameters.update(trainer_config[_brain_key])
if init_path is not None:
trainer_parameters["init_path"] = "{basedir}/{name}".format(
basedir=init_path, name=brain_name
)
trainer_settings.init_path = os.path.join(init_path, brain_name)
min_lesson_length = 1
if meta_curriculum:

)
trainer: Trainer = None # type: ignore # will be set to one of these, or raise
if "trainer" not in trainer_parameters:
raise TrainerConfigError(
f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
)
trainer_type = trainer_parameters["trainer"]
trainer_type = trainer_settings.trainer_type
if trainer_type == "offline_bc":
raise UnityTrainerException(
"The offline_bc trainer has been removed. To train with demonstrations, "
"please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
"Behavioral Cloning feature enabled."
)
elif trainer_type == "ppo":
if trainer_type == TrainerType.PPO:
trainer_parameters,
trainer_settings,
elif trainer_type == "sac":
elif trainer_type == TrainerType.SAC:
trainer_parameters,
trainer_settings,
if "self_play" in trainer_parameters:
if trainer_settings.self_play is not None:
trainer_parameters,
trainer_settings,
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Open the YAML file at config_path and return its parsed contents.

    :param config_path: Path to the YAML configuration file.
    :return: The parsed configuration as returned by _load_config.
    :raises TrainerConfigError: if the file cannot be opened or cannot be decoded.
    """
    try:
        with open(config_path) as data_file:
            return _load_config(data_file)
    except IOError as e:
        abs_path = os.path.abspath(config_path)
        # Chain the original error explicitly so the underlying OS reason
        # (missing file, permissions, ...) stays visible in the traceback.
        raise TrainerConfigError(
            f"Config file could not be found at {abs_path}."
        ) from e
    except UnicodeDecodeError as e:
        raise TrainerConfigError(
            f"There was an error decoding Config file from {config_path}. "
            f"Make sure your file is saved using UTF-8"
        ) from e
def _load_config(fp: TextIO) -> Dict[str, Any]:
    """
    Parse YAML from the given file-like object with yaml.safe_load.

    :param fp: Source handed directly to yaml.safe_load.
    :return: The parsed document.
    :raises TrainerConfigError: if the YAML parser reports a ParserError.
    """
    try:
        parsed = yaml.safe_load(fp)
    except yaml.parser.ParserError as e:
        raise TrainerConfigError(
            "Error parsing yaml file. Please check for formatting errors. "
            "A tool such as http://www.yamllint.com/ can be helpful with this."
        ) from e
    return parsed
def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Collect the "curriculum" section of every behavior in a trainer config.
    The result maps {brain_name: config}, where config is whatever value that
    behavior stored under its "curriculum" key.
    :param trainer_config: Dict of trainer configurations (keys are brain_names).
    :return: Dict of curriculum configurations. Returns empty dict if none are found.
    """
    # Entries whose value is not a dict are skipped rather than inspected;
    # that usually means the config is malformed.
    return {
        behavior_name: behavior_config["curriculum"]
        for behavior_name, behavior_config in trainer_config.items()
        if isinstance(behavior_config, dict) and "curriculum" in behavior_config
    }
def handle_existing_directories(

2
ml-agents/setup.py


"protobuf>=3.6",
"pyyaml>=3.1.0",
"tensorflow>=1.7,<3.0",
"cattrs>=1.0.0",
"attrs>=19.3.0",
'pypiwin32==223;platform_system=="Windows"',
# We don't actually need six, but tensorflow does, and pip seems
# to get confused and install the wrong version.

8
ml-agents/tests/yamato/training_int_tests.py


# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.
overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
override_config_file("config/ppo/3DBall.yaml", yaml_out, **overrides)
overrides = {
"hyperparameters": {"batch_size": 10, "buffer_size": 10},
"max_steps": 100,
}
override_config_file("config/ppo/3DBall.yaml", yaml_out, overrides)
mla_learn_cmd = (
f"mlagents-learn {yaml_out} --force --env="

12
ml-agents/tests/yamato/yamato_utils.py


subprocess.check_call("rm -rf Project/Library", shell=True)
def override_config_file(src_path, dest_path, **kwargs):
def override_config_file(src_path, dest_path, overrides):
"""
Override settings in a trainer config file. For example,
override_config_file(src_path, dest_path, max_steps=42)

behavior_configs = configs["behaviors"]
for config in behavior_configs.values():
config.update(**kwargs)
_override_config_dict(config, overrides)
def _override_config_dict(config, overrides):
for key, val in overrides.items():
if isinstance(val, dict):
_override_config_dict(config[key], val)
else:
config[key] = val
def override_legacy_config_file(python_version, src_path, dest_path, **kwargs):

110
config/upgrade_config.py


import attr
import cattr
import yaml
from typing import Dict, Any
import argparse
from mlagents.trainers.settings import TrainerSettings, NetworkSettings, TrainerType
from mlagents.trainers.cli_utils import load_config
# Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format.
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert an old-format trainer config into structured TrainerSettings.

    Every section other than "default" is treated as a behavior. Its values are
    layered on top of a copy of the "default" section and then split into
    hyperparameters, network settings and base trainer settings.

    :param old_trainer_config: Old-format config, keyed by behavior name, with
        an optional "default" section.
    :return: Dict mapping each behavior name to its structured TrainerSettings.
    :raises KeyError: if a behavior (after applying defaults) has no "trainer"
        key, or is marked recurrent without "sequence_length"/"memory_size".
    """
    all_behavior_config_dict = {}
    default_config = old_trainer_config.get("default", {})
    for behavior_name, behavior_overrides in old_trainer_config.items():
        if behavior_name == "default":
            continue
        config = default_config.copy()
        config.update(behavior_overrides)

        # Convert to split TrainerSettings, Hyperparameters, NetworkSettings
        # Set trainer_type and get appropriate hyperparameter settings
        trainer_type = config["trainer"]
        new_config: Dict[str, Any] = {"trainer_type": trainer_type}
        hyperparam_cls = TrainerType(trainer_type).to_settings()
        # Try to absorb as much as possible into the hyperparam_cls
        new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls)

        # Try to absorb as much as possible into the network settings
        new_config["network_settings"] = cattr.structure(config, NetworkSettings)
        # Deal with recurrent. A config that never mentions use_recurrent is
        # treated as non-recurrent instead of aborting with a KeyError.
        if config.get("use_recurrent", False):
            new_config["network_settings"].memory = NetworkSettings.MemorySettings(
                sequence_length=config["sequence_length"],
                memory_size=config["memory_size"],
            )

        # Absorb the rest into the base TrainerSettings
        trainer_fields = attr.fields_dict(TrainerSettings)
        for key, val in config.items():
            if key in trainer_fields:
                new_config[key] = val

        # Structure the whole thing
        all_behavior_config_dict[behavior_name] = cattr.structure(
            new_config, TrainerSettings
        )
    return all_behavior_config_dict
def write_to_yaml_file(config: Dict[str, Any], output_config: str):
    """
    Unstructure config with cattr, drop None-valued entries, and dump the
    result as YAML to the file at output_config (overwriting it).

    :param config: Configuration to serialize.
    :param output_config: Destination file path.
    """
    plain_config = remove_nones(cattr.unstructure(config))
    with open(output_config, "w") as out_file:
        try:
            # Keep the key order produced above when pyyaml allows it.
            yaml.dump(plain_config, out_file, sort_keys=False)
        except TypeError:  # Older versions of pyyaml don't support sort_keys
            yaml.dump(plain_config, out_file)
def remove_nones(config: Dict[Any, Any]):
    """
    Return a copy of config without entries whose value is None.

    Nested dict values are cleaned recursively; a nested dict is kept even if
    it ends up empty. Values of any other type (including lists) are passed
    through untouched.

    :param config: Dict to clean; it is not modified.
    :return: New dict with None values removed.
    """
    return {
        key: remove_nones(val) if isinstance(val, dict) else val
        for key, val in config.items()
        if val is not None
    }
if __name__ == "__main__":
    # Command-line entry point: read an old-format trainer config (plus optional
    # curriculum and sampler files) and write them out as one new-format YAML.
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    argparser.add_argument(
        "trainer_config_path",
        help="Path to old format (<=0.16.X) trainer configuration YAML.",
    )
    argparser.add_argument(
        "--curriculum",
        help="Path to old format (<=0.16.X) curriculum configuration YAML.",
        default=None,
    )
    argparser.add_argument(
        "--sampler",
        help="Path to old format (<=0.16.X) parameter randomization configuration YAML.",
        default=None,
    )
    argparser.add_argument(
        "output_config_path", help="Path to write converted YAML file."
    )
    args = argparser.parse_args()
    print(
        f"Converting {args.trainer_config_path} and saving to {args.output_config_path}."
    )

    old_config = load_config(args.trainer_config_path)
    behavior_config_dict = convert_behaviors(old_config)
    full_config = {"behaviors": behavior_config_dict}

    # Convert curriculum and sampler. note that we don't validate these; if it was correct
    # before it should be correct now.
    if args.curriculum is not None:
        curriculum_config_dict = load_config(args.curriculum)
        full_config["curriculum"] = curriculum_config_dict

    if args.sampler is not None:
        # Load the sampler file itself (not the curriculum file).
        sampler_config_dict = load_config(args.sampler)
        full_config["parameter_randomization"] = sampler_config_dict

    write_to_yaml_file(full_config, args.output_config_path)

373
ml-agents/mlagents/trainers/settings.py


import attr
import cattr
from typing import Dict, Optional, List, Any, DefaultDict, Mapping
from enum import Enum
import collections
import argparse
from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser
from mlagents.trainers.cli_utils import load_config
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.models import ScheduleType, EncoderType
def check_and_structure(key: str, value: Any, class_type: type) -> Any:
    """
    Structure value into the declared type of the attrs field named key.

    :param key: Name of the field as given in the configuration.
    :param value: Raw (unstructured) value for that field.
    :param class_type: attrs class that is expected to declare the field.
    :return: value converted by cattr.structure to the field's annotated type.
    :raises TrainerConfigError: if class_type has no field called key.
    """
    field = attr.fields_dict(class_type).get(key)
    if field is None:
        raise TrainerConfigError(
            f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid."
        )
    # Apply cattr structure to the values
    return cattr.structure(value, field.type)
def strict_to_cls(d: Mapping, t: type) -> Any:
    """
    Build an instance of attrs class t from mapping d, rejecting unknown keys.

    Each value is structured through check_and_structure, so a key that is not
    a field of t raises instead of being silently dropped.

    :param d: Mapping of field names to raw values.
    :param t: attrs class to instantiate.
    :return: t constructed from the structured values.
    :raises TrainerConfigError: if d is not a Mapping or contains an unknown key.
    """
    if isinstance(d, Mapping):
        structured = {
            key: check_and_structure(key, val, t) for key, val in d.items()
        }
        return t(**structured)
    raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.")
def defaultdict_to_dict(d: DefaultDict) -> Dict:
    """
    Convert a defaultdict into a plain dict, unstructuring each value with cattr.
    Keys are kept as they are.
    """
    plain: Dict = {}
    for key, val in d.items():
        plain[key] = cattr.unstructure(val)
    return plain
@attr.s(auto_attribs=True)
class ExportableSettings:
    """Base class for settings objects that can be exported via as_dict()."""

    def as_dict(self):
        """Return this settings object unstructured by cattr."""
        unstructured = cattr.unstructure(self)
        return unstructured
@attr.s(auto_attribs=True)
class NetworkSettings:
    """
    Network-related options for a trainer. All fields have defaults, so the
    class can be constructed with no arguments.
    """

    @attr.s(auto_attribs=True)
    class MemorySettings:
        """Options used when memory is enabled for the network."""

        sequence_length: int = 64
        memory_size: int = 128

    normalize: bool = False
    hidden_units: int = 128
    num_layers: int = 2
    vis_encode_type: EncoderType = EncoderType.SIMPLE
    # None means no memory settings were supplied.
    memory: Optional[MemorySettings] = None
@attr.s(auto_attribs=True)
class BehavioralCloningSettings:
    """
    Options for the behavioral cloning feature. demo_path has no default and
    must always be supplied.
    """

    demo_path: str
    steps: int = 0
    strength: float = 1.0
    samples_per_update: int = 0
    # Setting either of these to None will allow the Optimizer
    # to decide these parameters, based on Trainer hyperparams
    num_epoch: Optional[int] = None
    batch_size: Optional[int] = None
@attr.s(auto_attribs=True)
class HyperparamSettings:
    """
    Hyperparameters shared by the trainer-specific settings classes that
    inherit from this one (PPOSettings, SACSettings).
    """

    batch_size: int = 1024
    buffer_size: int = 10240
    learning_rate: float = 3.0e-4
    learning_rate_schedule: ScheduleType = ScheduleType.CONSTANT
@attr.s(auto_attribs=True)
class PPOSettings(HyperparamSettings):
    """
    Hyperparameters for the PPO trainer. Extends the shared hyperparameters and
    changes the default learning-rate schedule to LINEAR.
    """

    beta: float = 5.0e-3
    epsilon: float = 0.2
    lambd: float = 0.95
    num_epoch: int = 3
    # Overrides the CONSTANT default declared on HyperparamSettings.
    learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
@attr.s(auto_attribs=True)
class SACSettings(HyperparamSettings):
    """
    Hyperparameters for the SAC trainer. Extends the shared hyperparameters and
    overrides the default batch_size and buffer_size.
    """

    batch_size: int = 128
    buffer_size: int = 50000
    buffer_init_steps: int = 0
    tau: float = 0.005
    steps_per_update: float = 1
    save_replay_buffer: bool = False
    init_entcoef: float = 1.0
    reward_signal_steps_per_update: float = attr.ib()

    @reward_signal_steps_per_update.default
    def _reward_signal_steps_per_update_default(self):
        # When not given explicitly, follow whatever steps_per_update is set to.
        return self.steps_per_update
class RewardSignalType(Enum):
    """Names of the supported reward signals as they appear in configuration."""

    EXTRINSIC: str = "extrinsic"
    GAIL: str = "gail"
    CURIOSITY: str = "curiosity"

    def to_settings(self) -> type:
        """Return the settings class that configures this reward signal."""
        settings_by_type = {
            RewardSignalType.EXTRINSIC: RewardSignalSettings,
            RewardSignalType.GAIL: GAILSettings,
            RewardSignalType.CURIOSITY: CuriositySettings,
        }
        return settings_by_type[self]
@attr.s(auto_attribs=True)
class RewardSignalSettings:
    """
    Options common to every reward signal; also the settings class used
    directly for the extrinsic signal (see RewardSignalType.to_settings).
    """

    gamma: float = 0.99
    strength: float = 1.0

    @staticmethod
    def structure(d: Mapping, t: type) -> Any:
        """
        Structure a mapping of reward-signal name -> raw options into a dict of
        RewardSignalType -> settings instance. Intended to be registered with
        cattr.register_structure_hook() and invoked through cattr.structure();
        the hook is required because the settings class depends on the key.

        :param d: Mapping from reward signal name to its options.
        :param t: Target type passed by cattr; not used to pick the classes.
        :raises TrainerConfigError: if d is not a Mapping, or an option is
            invalid for its settings class.
        """
        if not isinstance(d, Mapping):
            raise TrainerConfigError(f"Unsupported reward signal configuration {d}.")
        structured: Dict[RewardSignalType, RewardSignalSettings] = {}
        for signal_name, signal_options in d.items():
            signal_type = RewardSignalType(signal_name)
            settings_cls = signal_type.to_settings()
            structured[signal_type] = strict_to_cls(signal_options, settings_cls)
        return structured
@attr.s(auto_attribs=True)
class GAILSettings(RewardSignalSettings):
    """
    Options for the GAIL reward signal, in addition to the common gamma and
    strength.
    """

    encoding_size: int = 64
    learning_rate: float = 3e-4
    use_actions: bool = False
    use_vail: bool = False
    # No default: must be passed, and only as a keyword argument.
    demo_path: str = attr.ib(kw_only=True)
@attr.s(auto_attribs=True)
class CuriositySettings(RewardSignalSettings):
    """
    Options for the curiosity reward signal, in addition to the common gamma
    and strength.
    """

    encoding_size: int = 64
    learning_rate: float = 3e-4
@attr.s(auto_attribs=True)
class SelfPlaySettings:
    """
    Options for self-play. All fields have defaults; team_change is derived
    from save_steps when it is not given explicitly.
    """

    save_steps: int = 20000
    team_change: int = attr.ib()

    @team_change.default
    def _team_change_default(self):
        # Default team_change to 5x save_steps
        return self.save_steps * 5

    swap_steps: int = 10000
    window: int = 10
    play_against_latest_model_ratio: float = 0.5
    initial_elo: float = 1200.0
class TrainerType(Enum):
    """Names of the supported trainers as they appear in configuration."""

    PPO: str = "ppo"
    SAC: str = "sac"

    def to_settings(self) -> type:
        """Return the hyperparameter settings class for this trainer type."""
        settings_by_type = {
            TrainerType.PPO: PPOSettings,
            TrainerType.SAC: SACSettings,
        }
        return settings_by_type[self]
@attr.s(auto_attribs=True)
class TrainerSettings(ExportableSettings):
    """
    Complete configuration for one trainer/behavior. With no arguments it
    describes a PPO trainer with default hyperparameters, default network
    settings and a single extrinsic reward signal.
    """

    trainer_type: TrainerType = TrainerType.PPO
    hyperparameters: HyperparamSettings = attr.ib()

    @hyperparameters.default
    def _set_default_hyperparameters(self):
        # Instantiate the hyperparameter class matching the chosen trainer_type.
        hyperparam_cls = self.trainer_type.to_settings()
        return hyperparam_cls()

    network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
    reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
        factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
    )
    init_path: Optional[str] = None
    output_path: str = "default"
    keep_checkpoints: int = 5
    max_steps: int = 500000
    time_horizon: int = 64
    summary_freq: int = 50000
    threaded: bool = True
    self_play: Optional[SelfPlaySettings] = None
    behavioral_cloning: Optional[BehavioralCloningSettings] = None

    # Reward signal dicts need a custom hook because the settings class depends
    # on the dict key.
    cattr.register_structure_hook(
        Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure
    )

    @network_settings.validator
    def _check_batch_size_seq_length(self, attribute, value):
        """Reject memory settings whose sequence_length exceeds batch_size."""
        memory = self.network_settings.memory
        if memory is None:
            return
        if memory.sequence_length > self.hyperparameters.batch_size:
            raise TrainerConfigError(
                "When using memory, sequence length must be less than or equal to batch size. "
            )

    @staticmethod
    def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict:
        """
        Structure d as Dict[str, TrainerSettings] and wrap it in a defaultdict
        whose missing keys produce a default TrainerSettings.
        """
        structured = cattr.structure(d, Dict[str, TrainerSettings])
        return collections.defaultdict(TrainerSettings, structured)

    @staticmethod
    def structure(d: Mapping, t: type) -> Any:
        """
        Structure a mapping into an instance of t. Intended to be registered
        with cattr.register_structure_hook() and invoked via cattr.structure().

        :raises TrainerConfigError: if d is not a Mapping, if "hyperparameters"
            is given without "trainer_type", or if a key is not a valid field.
        """
        if not isinstance(d, Mapping):
            raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.")

        settings_kwargs: Dict[str, Any] = dict(d)
        for key, val in d.items():
            if attr.has(type(val)):
                # Don't convert already-converted attrs classes.
                continue
            if key == "hyperparameters":
                # The hyperparameter class can only be chosen once the trainer
                # type is known.
                if "trainer_type" not in settings_kwargs:
                    raise TrainerConfigError(
                        "Hyperparameters were specified but no trainer_type was given."
                    )
                hyperparam_cls = TrainerType(
                    settings_kwargs["trainer_type"]
                ).to_settings()
                settings_kwargs[key] = strict_to_cls(val, hyperparam_cls)
            elif key == "max_steps":
                # In some legacy configs, max steps was specified as a float
                settings_kwargs[key] = int(float(val))
            else:
                settings_kwargs[key] = check_and_structure(key, val, t)
        return t(**settings_kwargs)
@attr.s(auto_attribs=True)
class CurriculumSettings:
    """
    Curriculum options for one behavior. `parameters` has no default and must
    be passed as a keyword argument.
    """

    class MeasureType:
        """String constants accepted for the `measure` field."""

        PROGRESS: str = "progress"
        REWARD: str = "reward"

    measure: str = attr.ib(default=MeasureType.REWARD)
    thresholds: List[int] = attr.ib(factory=list)
    min_lesson_length: int = 0
    signal_smoothing: bool = True
    parameters: Dict[str, List[float]] = attr.ib(kw_only=True)
@attr.s(auto_attribs=True)
class CheckpointSettings:
    """
    Checkpoint-related run options. Every default is read from the
    command-line parser's default for the option of the same name.
    """

    save_freq: int = parser.get_default("save_freq")
    run_id: str = parser.get_default("run_id")
    initialize_from: str = parser.get_default("initialize_from")
    load_model: bool = parser.get_default("load_model")
    resume: bool = parser.get_default("resume")
    force: bool = parser.get_default("force")
    train_model: bool = parser.get_default("train_model")
    inference: bool = parser.get_default("inference")
    lesson: int = parser.get_default("lesson")
@attr.s(auto_attribs=True)
class EnvironmentSettings:
    """
    Environment-related run options. Every default is read from the
    command-line parser's default for the option of the same name.
    """

    env_path: Optional[str] = parser.get_default("env_path")
    env_args: Optional[List[str]] = parser.get_default("env_args")
    base_port: int = parser.get_default("base_port")
    num_envs: int = parser.get_default("num_envs")
    seed: int = parser.get_default("seed")
@attr.s(auto_attribs=True)
class EngineSettings:
    """
    Engine-related run options. Every default is read from the command-line
    parser's default for the option of the same name.
    """

    width: int = parser.get_default("width")
    height: int = parser.get_default("height")
    quality_level: int = parser.get_default("quality_level")
    time_scale: float = parser.get_default("time_scale")
    target_frame_rate: int = parser.get_default("target_frame_rate")
    capture_frame_rate: int = parser.get_default("capture_frame_rate")
    no_graphics: bool = parser.get_default("no_graphics")
@attr.s(auto_attribs=True)
class RunOptions(ExportableSettings):
    """
    Top-level configuration for a run: per-behavior trainer settings plus
    environment, engine and checkpoint settings and optional curriculum /
    parameter randomization sections.
    """

    # Looking up an unknown behavior name yields a default TrainerSettings.
    behaviors: DefaultDict[str, TrainerSettings] = attr.ib(
        factory=lambda: collections.defaultdict(TrainerSettings)
    )
    env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings)
    engine_settings: EngineSettings = attr.ib(factory=EngineSettings)
    parameter_randomization: Optional[Dict] = None
    curriculum: Optional[Dict[str, CurriculumSettings]] = None
    checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings)

    # These are options that are relevant to the run itself, and not the engine or environment.
    # They will be left here.
    debug: bool = parser.get_default("debug")

    # Strict conversion
    cattr.register_structure_hook(EnvironmentSettings, strict_to_cls)
    cattr.register_structure_hook(EngineSettings, strict_to_cls)
    cattr.register_structure_hook(CheckpointSettings, strict_to_cls)
    cattr.register_structure_hook(CurriculumSettings, strict_to_cls)
    cattr.register_structure_hook(TrainerSettings, TrainerSettings.structure)
    cattr.register_structure_hook(
        DefaultDict[str, TrainerSettings], TrainerSettings.dict_to_defaultdict
    )
    cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict)

    @staticmethod
    def from_argparse(args: argparse.Namespace) -> "RunOptions":
        """
        Build a RunOptions from parsed command-line arguments. The YAML file at
        StoreConfigFile.trainer_config_path (when set) provides the base
        values; arguments recorded in DetectDefault.non_default_args then
        override them.
        :param args: collection of command-line parameters passed to mlagents-learn
        :return: RunOptions representing the passed in arguments, with trainer config, curriculum and sampler
          configs loaded from files.
        :raises TrainerConfigError: if the YAML has a top-level key that is not
          a RunOptions field.
        """
        argparse_args = vars(args)
        config_path = StoreConfigFile.trainer_config_path

        # Load YAML
        configured_dict: Dict[str, Any] = {
            "checkpoint_settings": {},
            "env_settings": {},
            "engine_settings": {},
        }
        if config_path is not None:
            configured_dict.update(load_config(config_path))

        # Use the YAML file values for all values not specified in the CLI.
        valid_options = attr.fields_dict(RunOptions)
        for key in configured_dict:
            # Detect bad config options
            if key not in valid_options:
                raise TrainerConfigError(
                    "The option {} was specified in your YAML file, but is invalid.".format(
                        key
                    )
                )

        # Override with CLI args
        # Keep deprecated --load working, TODO: remove
        argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]

        # Sections are checked in this order; the first class declaring the
        # option receives it.
        section_classes = (
            ("checkpoint_settings", CheckpointSettings),
            ("env_settings", EnvironmentSettings),
            ("engine_settings", EngineSettings),
        )
        for key, val in argparse_args.items():
            if key not in DetectDefault.non_default_args:
                continue
            for section, settings_cls in section_classes:
                if key in attr.fields_dict(settings_cls):
                    configured_dict[section][key] = val
                    break
            else:  # Base options
                configured_dict[key] = val
        return RunOptions.from_dict(configured_dict)

    @staticmethod
    def from_dict(options_dict: Dict[str, Any]) -> "RunOptions":
        """Structure a plain dict of options into a RunOptions via cattr."""
        run_options = cattr.structure(options_dict, RunOptions)
        return run_options

151
ml-agents/mlagents/trainers/tests/test_settings.py


import attr
import pytest
from typing import Dict
from mlagents.trainers.settings import (
RunOptions,
TrainerSettings,
PPOSettings,
SACSettings,
RewardSignalType,
RewardSignalSettings,
CuriositySettings,
TrainerType,
strict_to_cls,
)
from mlagents.trainers.exception import TrainerConfigError
def check_if_different(testobj1: object, testobj2: object) -> None:
    """
    Assert that two objects are distinct instances. When both are attrs
    instances, also recurse into matching fields whose value is a dict, a list
    or another attrs instance.

    :param testobj1: First object.
    :param testobj2: Second object, expected to be a different instance.
    """
    assert testobj1 is not testobj2
    if attr.has(testobj1.__class__) and attr.has(testobj2.__class__):
        # Compute the second object's field mapping once, not per field.
        other_fields = attr.asdict(testobj2, recurse=False)
        for key, val in attr.asdict(testobj1, recurse=False).items():
            # attr.has() expects a class, so pass the value's class.
            if isinstance(val, (dict, list)) or attr.has(val.__class__):
                # Note: this check doesn't check the contents of mutables.
                check_if_different(val, other_fields[key])
def test_is_new_instance():
    """
    Two default-constructed RunOptions (and two default-constructed
    TrainerSettings) must not share instances, i.e. mutable defaults must come
    from factories.
    """
    for settings_cls in (RunOptions, TrainerSettings):
        check_if_different(settings_cls(), settings_cls())
def test_no_configuration():
    """
    A RunOptions built with no input must hand out, for any behavior name,
    TrainerSettings with PPO hyperparameters and an extrinsic reward signal.
    """
    blank_runoptions = RunOptions()
    # "test" was never configured, so this exercises the default entry.
    default_behavior = blank_runoptions.behaviors["test"]
    assert isinstance(default_behavior, TrainerSettings)
    assert isinstance(default_behavior.hyperparameters, PPOSettings)
    assert RewardSignalType.EXTRINSIC in default_behavior.reward_signals
def test_strict_to_cls():
    """
    Exercise strict_to_cls with a matching dict, a dict containing an unknown
    field name, and an input that is not a dict at all.
    """

    @attr.s(auto_attribs=True)
    class TestAttrsClass:
        field1: int = 0
        field2: str = "test"

    valid_fields = {"field1": 1, "field2": "test2"}
    structured = strict_to_cls(valid_fields, TestAttrsClass)
    assert structured == TestAttrsClass(**valid_fields)

    # Both an unknown field name and a non-dict input must be rejected.
    unknown_field = {"field3": 1, "field2": "test2"}
    for bad_input in (unknown_field, "non_dict_input"):
        with pytest.raises(TrainerConfigError):
            strict_to_cls(bad_input, TestAttrsClass)
def test_trainersettings_structure():
    """
    Exercise TrainerSettings.structure with one valid configuration and a
    series of inputs that must be rejected.
    """
    sac_config = {
        "trainer_type": "sac",
        "hyperparameters": {"batch_size": 1024},
        "max_steps": 1.0,
        "reward_signals": {"curiosity": {"encoding_size": 64}},
    }
    structured = TrainerSettings.structure(sac_config, TrainerSettings)
    assert isinstance(structured.hyperparameters, SACSettings)
    assert structured.trainer_type == TrainerType.SAC
    assert isinstance(structured.max_steps, int)
    assert RewardSignalType.CURIOSITY in structured.reward_signals

    # Each entry pairs the expected exception with the input that triggers it.
    invalid_cases = [
        # Check invalid trainer type
        (
            ValueError,
            {
                "trainer_type": "puppo",
                "hyperparameters": {"batch_size": 1024},
                "max_steps": 1.0,
            },
        ),
        # Check invalid hyperparameter
        (
            TrainerConfigError,
            {
                "trainer_type": "ppo",
                "hyperparameters": {"notahyperparam": 1024},
                "max_steps": 1.0,
            },
        ),
        # Check non-dict
        (TrainerConfigError, "notadict"),
        # Check hyperparameters specified but trainer type left as default.
        # This shouldn't work as you could specify non-PPO hyperparameters.
        (TrainerConfigError, {"hyperparameters": {"batch_size": 1024}}),
    ]
    for expected_error, bad_config in invalid_cases:
        with pytest.raises(expected_error):
            TrainerSettings.structure(bad_config, TrainerSettings)
def test_reward_signal_structure():
    """
    Exercise RewardSignalSettings.structure. Unlike the other structure
    methods, its target type is a Dict[RewardSignalType, RewardSignalSettings].
    """
    signals_type = Dict[RewardSignalType, RewardSignalSettings]

    structured = RewardSignalSettings.structure(
        {"extrinsic": {"strength": 1.0}, "curiosity": {"strength": 1.0}},
        signals_type,
    )
    assert isinstance(structured[RewardSignalType.EXTRINSIC], RewardSignalSettings)
    assert isinstance(structured[RewardSignalType.CURIOSITY], CuriositySettings)

    # Each entry pairs the expected exception with the input that triggers it.
    invalid_cases = (
        # Check invalid reward signal type
        (ValueError, {"puppo": {"strength": 1.0}}),
        # Check missing GAIL demo path
        (TypeError, {"gail": {"strength": 1.0}}),
        # Check non-Dict input
        (TrainerConfigError, "notadict"),
    )
    for expected_error, bad_input in invalid_cases:
        with pytest.raises(expected_error):
            RewardSignalSettings.structure(bad_input, signals_type)
正在加载...
取消
保存