
[refactor] Structure configuration files into classes (#3936)

/test-sampler
GitHub, 4 years ago
Current commit: e92b4f88
98 files changed, with 3088 insertions and 3176 deletions (changed lines per file shown in parentheses below)
  1. com.unity.ml-agents/CHANGELOG.md (2)
  2. config/imitation/CrawlerStatic.yaml (46)
  3. config/imitation/FoodCollector.yaml (46)
  4. config/imitation/Hallway.yaml (48)
  5. config/imitation/PushBlock.yaml (43)
  6. config/imitation/Pyramids.yaml (30)
  7. config/ppo/3DBall.yaml (42)
  8. config/ppo/3DBallHard.yaml (42)
  9. config/ppo/3DBall_randomize.yaml (76)
  10. config/ppo/Basic.yaml (42)
  11. config/ppo/Bouncer.yaml (42)
  12. config/ppo/CrawlerDynamic.yaml (42)
  13. config/ppo/CrawlerStatic.yaml (42)
  14. config/ppo/FoodCollector.yaml (42)
  15. config/ppo/GridWorld.yaml (42)
  16. config/ppo/Hallway.yaml (45)
  17. config/ppo/PushBlock.yaml (42)
  18. config/ppo/Pyramids.yaml (45)
  19. config/ppo/Reacher.yaml (42)
  20. config/ppo/SoccerTwos.yaml (56)
  21. config/ppo/StrikersVsGoalie.yaml (99)
  22. config/ppo/Tennis.yaml (49)
  23. config/ppo/VisualHallway.yaml (45)
  24. config/ppo/VisualPushBlock.yaml (45)
  25. config/ppo/VisualPyramids.yaml (45)
  26. config/ppo/Walker.yaml (42)
  27. config/ppo/WallJump.yaml (83)
  28. config/ppo/WallJump_curriculum.yaml (115)
  29. config/ppo/WormDynamic.yaml (42)
  30. config/ppo/WormStatic.yaml (42)
  31. config/sac/3DBall.yaml (44)
  32. config/sac/3DBallHard.yaml (44)
  33. config/sac/Basic.yaml (44)
  34. config/sac/Bouncer.yaml (44)
  35. config/sac/CrawlerDynamic.yaml (44)
  36. config/sac/CrawlerStatic.yaml (44)
  37. config/sac/FoodCollector.yaml (44)
  38. config/sac/GridWorld.yaml (44)
  39. config/sac/Hallway.yaml (47)
  40. config/sac/PushBlock.yaml (44)
  41. config/sac/Pyramids.yaml (48)
  42. config/sac/Reacher.yaml (44)
  43. config/sac/Tennis.yaml (50)
  44. config/sac/VisualHallway.yaml (48)
  45. config/sac/VisualPushBlock.yaml (48)
  46. config/sac/VisualPyramids.yaml (48)
  47. config/sac/Walker.yaml (44)
  48. config/sac/WallJump.yaml (87)
  49. config/sac/WormDynamic.yaml (44)
  50. config/sac/WormStatic.yaml (44)
  51. docs/Migrating.md (13)
  52. docs/Training-Configuration-File.md (113)
  53. docs/Training-ML-Agents.md (141)
  54. ml-agents/mlagents/trainers/cli_utils.py (233)
  55. ml-agents/mlagents/trainers/components/bc/module.py (39)
  56. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (29)
  57. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (36)
  58. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (12)
  59. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (47)
  60. ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py (22)
  61. ml-agents/mlagents/trainers/curriculum.py (66)
  62. ml-agents/mlagents/trainers/ghost/trainer.py (22)
  63. ml-agents/mlagents/trainers/learn.py (377)
  64. ml-agents/mlagents/trainers/meta_curriculum.py (7)
  65. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (29)
  66. ml-agents/mlagents/trainers/policy/nn_policy.py (11)
  67. ml-agents/mlagents/trainers/policy/tf_policy.py (41)
  68. ml-agents/mlagents/trainers/ppo/optimizer.py (33)
  69. ml-agents/mlagents/trainers/ppo/trainer.py (42)
  70. ml-agents/mlagents/trainers/run_experiment.py (6)
  71. ml-agents/mlagents/trainers/sac/optimizer.py (38)
  72. ml-agents/mlagents/trainers/sac/trainer.py (100)
  73. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (39)
  74. ml-agents/mlagents/trainers/tests/test_bcmodule.py (100)
  75. ml-agents/mlagents/trainers/tests/test_curriculum.py (94)
  76. ml-agents/mlagents/trainers/tests/test_distributions.py (36)
  77. ml-agents/mlagents/trainers/tests/test_ghost.py (40)
  78. ml-agents/mlagents/trainers/tests/test_learn.py (133)
  79. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (55)
  80. ml-agents/mlagents/trainers/tests/test_nn_policy.py (72)
  81. ml-agents/mlagents/trainers/tests/test_policy.py (11)
  82. ml-agents/mlagents/trainers/tests/test_ppo.py (99)
  83. ml-agents/mlagents/trainers/tests/test_reward_signals.py (141)
  84. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (18)
  85. ml-agents/mlagents/trainers/tests/test_sac.py (83)
  86. ml-agents/mlagents/trainers/tests/test_simple_rl.py (388)
  87. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (4)
  88. ml-agents/mlagents/trainers/tests/test_trainer_util.py (274)
  89. ml-agents/mlagents/trainers/trainer/rl_trainer.py (10)
  90. ml-agents/mlagents/trainers/trainer/trainer.py (28)
  91. ml-agents/mlagents/trainers/trainer_controller.py (8)
  92. ml-agents/mlagents/trainers/trainer_util.py (106)
  93. ml-agents/setup.py (2)
  94. ml-agents/tests/yamato/training_int_tests.py (8)
  95. ml-agents/tests/yamato/yamato_utils.py (12)
  96. config/upgrade_config.py (110)
  97. ml-agents/mlagents/trainers/settings.py (373)
  98. ml-agents/mlagents/trainers/tests/test_settings.py (151)

2
com.unity.ml-agents/CHANGELOG.md


- Curriculum and Parameter Randomization configurations have been merged
into the main training configuration file. Note that this means training
configuration files are now environment-specific. (#3791)
- The format for trainer configuration has changed, and the "default" behavior has been deprecated.
See the [Migration Guide](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Migrating.md) for more details. (#3936)
- Training artifacts (trained models, summaries) are now found in the `results/`
directory. (#3829)
- Unity Player logs are now written out to the results directory. (#3877)
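
To make the shape of this change concrete, here is a rough before/after sketch for a single behavior, assembled from the 3DBall and 3DBall_randomize diffs later on this page. The diff view strips indentation, so the nesting below is reconstructed; treat it as an illustration rather than a verbatim excerpt.

```yaml
# Old flat format: every trainer option sits directly under the behavior name.
behaviors:
  3DBall:
    trainer: ppo
    batch_size: 64
    buffer_size: 12000
    learning_rate: 0.0003
    normalize: true
    hidden_units: 128
    num_layers: 2
    max_steps: 5.0e5
    time_horizon: 1000
    summary_freq: 12000
    # ...
---
# New structured format: hyperparameters, network settings and reward signals
# are grouped into nested sections (cf. the new settings.py added in this commit).
behaviors:
  3DBall:
    trainer_type: ppo
    hyperparameters:
      batch_size: 64
      buffer_size: 12000
      learning_rate: 0.0003
      beta: 0.001
      epsilon: 0.2
      lambd: 0.99
      num_epoch: 3
      learning_rate_schedule: linear
    network_settings:
      normalize: true
      hidden_units: 128
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 500000
    time_horizon: 1000
    summary_freq: 12000
    threaded: true
```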

46
config/imitation/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1e7
memory_size: 256
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
strength: 1.0
strength: 1.0
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true
steps: 50000
steps: 50000
samples_per_update: 0

46
config/imitation/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 64
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 2.0e6
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
strength: 0.1
strength: 0.1
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true
steps: 0
steps: 0
samples_per_update: 0

48
config/imitation/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 256
strength: 1.0
strength: 1.0
gamma: 0.99
gamma: 0.99
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

43
config/imitation/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.5e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
learning_rate: 0.0003
use_actions: false
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 15000000
time_horizon: 64
summary_freq: 60000
threaded: true

30
config/imitation/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
trainer_type: ppo
sequence_length: 64
summary_freq: 30000
use_recurrent: false
max_steps: 1.0e7
hyperparameters:
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
lambd: 0.95
learning_rate: 0.0003
num_epoch: 3
network_settings:
num_layers: 2
normalize: false
hidden_units: 512
reward_signals:
extrinsic:
strength: 1.0

42
config/ppo/3DBall.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

42
config/ppo/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: ppo
batch_size: 1200
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1200
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 12000
threaded: true

76
config/ppo/3DBall_randomize.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
3DBall:
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true
resampling-interval: 5000
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
gravity:
sampler-type: "uniform"
min_value: 7
max_value: 12
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
resampling-interval: 5000
mass:
sampler-type: uniform
min_value: 0.5
max_value: 10
gravity:
sampler-type: uniform
min_value: 7
max_value: 12
scale:
sampler-type: uniform
min_value: 0.75
max_value: 3

42
config/ppo/Basic.yaml


behaviors:
Basic:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 20
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 3
sequence_length: 64
summary_freq: 2000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 32
buffer_size: 256
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 20
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 3
summary_freq: 2000
threaded: true

42
config/ppo/Bouncer.yaml


behaviors:
Bouncer:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 64
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 4.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 4000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/GridWorld.yaml


behaviors:
GridWorld:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 500000
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 32
buffer_size: 256
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 5
summary_freq: 20000
threaded: true

45
config/ppo/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

42
config/ppo/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 60000
threaded: true

45
config/ppo/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
strength: 1.0
strength: 1.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 30000
threaded: true

42
config/ppo/Reacher.yaml


behaviors:
Reacher:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 60000
threaded: true

56
config/ppo/SoccerTwos.yaml


behaviors:
SoccerTwos:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
strength: 1.0
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
swap_steps: 50000
curriculum:
measure: progress
thresholds: [0.05, 0.1]
min_lesson_length: 100
signal_smoothing: true
parameters:
ball_touch: [1.0, 0.5, 0.0]
swap_steps: 50000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

99
config/ppo/StrikersVsGoalie.yaml


behaviors:
Goalie:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
swap_steps: 25000
swap_steps: 25000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: false
hidden_units: 512
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
team_change: 200000
team_change: 200000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

49
config/ppo/Tennis.yaml


behaviors:
Tennis:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 1024
buffer_size: 10240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: constant
network_settings:
normalize: true
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 50000000
time_horizon: 1000
summary_freq: 10000
threaded: true
window: 10
play_against_latest_model_ratio: 0.5
team_change: 100000
team_change: 100000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

45
config/ppo/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 64
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

45
config/ppo/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
use_recurrent: true
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 1024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 64
summary_freq: 60000
threaded: true

45
config/ppo/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 2024
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 64
buffer_size: 2024
learning_rate: 0.0003
beta: 0.01
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
strength: 1.0
strength: 1.0
strength: 0.01
strength: 0.01
learning_rate: 0.0003
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 10000
threaded: true

42
config/ppo/Walker.yaml


behaviors:
Walker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2048
buffer_size: 20480
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 30000
threaded: true

83
config/ppo/WallJump.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
strength: 1.0
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true

115
config/ppo/WallJump_curriculum.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: ppo
hyperparameters:
batch_size: 128
buffer_size: 2048
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true
curriculum:
BigWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]

42
config/ppo/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3500000
time_horizon: 1000
summary_freq: 30000
threaded: true

42
config/ppo/WormStatic.yaml


behaviors:
WormStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
trainer_type: ppo
hyperparameters:
batch_size: 2024
buffer_size: 20240
learning_rate: 0.0003
beta: 0.005
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3500000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/3DBall.yaml


behaviors:
3DBall:
trainer: sac
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

44
config/sac/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: sac
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

44
config/sac/Basic.yaml


behaviors:
Basic:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 20
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 10
sequence_length: 64
summary_freq: 2000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 20
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 10
summary_freq: 2000
threaded: true

44
config/sac/Bouncer.yaml


behaviors:
Bouncer:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e6
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 1000000
time_horizon: 64
summary_freq: 20000
threaded: true

44
config/sac/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.05
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 10000
threaded: true

44
config/sac/GridWorld.yaml


behaviors:
GridWorld:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
hidden_units: 128
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 500000
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
gamma: 0.9
gamma: 0.9
output_path: default
keep_checkpoints: 5
max_steps: 500000
time_horizon: 5
summary_freq: 20000
threaded: true

47
config/sac/Hallway.yaml


behaviors:
Hallway:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 2
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 64
summary_freq: 10000
threaded: true

44
config/sac/PushBlock.yaml


behaviors:
PushBlock:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 100000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.05
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 2000000
time_horizon: 64
summary_freq: 100000
threaded: true

48
config/sac/Pyramids.yaml


behaviors:
Pyramids:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 16
summary_freq: 30000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
tau: 0.01
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
strength: 2.0
strength: 2.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 30000
threaded: true

44
config/sac/Reacher.yaml


behaviors:
Reacher:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 60000
threaded: true

50
config/sac/Tennis.yaml


behaviors:
Tennis:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 64
summary_freq: 10000
threaded: true
window: 10
play_against_current_self_ratio: 0.5
team_change: 250000
window: 10
play_against_latest_model_ratio: 0.5
initial_elo: 1200.0

48
config/sac/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
strength: 1.0
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 64
summary_freq: 10000
threaded: true

48
config/sac/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: sac
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 128
num_layers: 1
vis_encode_type: simple
memory:
sequence_length: 32
memory_size: 128
strength: 1.0
gamma: 0.99
strength: 1.0
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 64
summary_freq: 60000
threaded: true

48
config/sac/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: sac
batch_size: 64
buffer_size: 500000
buffer_init_steps: 1000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 64
buffer_size: 500000
buffer_init_steps: 1000
tau: 0.01
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.01
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 1
vis_encode_type: simple
strength: 2.0
strength: 2.0
strength: 0.02
strength: 0.02
learning_rate: 0.0003
use_vail: false
output_path: default
keep_checkpoints: 5
max_steps: 10000000
time_horizon: 128
summary_freq: 10000
threaded: true

44
config/sac/Walker.yaml


behaviors:
Walker:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 30
num_layers: 4
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 30.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 30.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 4
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 20000000
time_horizon: 1000
summary_freq: 30000
threaded: true

87
config/sac/WallJump.yaml


behaviors:
BigWallJump:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
SmallWallJump:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
output_path: default
keep_checkpoints: 5
max_steps: 20000000
sequence_length: 64
tau: 0.005
use_recurrent: false
vis_encode_type: simple
threaded: true
SmallWallJump:
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.1
reward_signal_steps_per_update: 10.0
network_settings:
normalize: false
hidden_units: 256
num_layers: 2
vis_encode_type: simple
gamma: 0.99
gamma: 0.99
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 128
summary_freq: 20000
threaded: true

44
config/sac/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 5000000
time_horizon: 1000
summary_freq: 30000
threaded: true

44
config/sac/WormStatic.yaml


behaviors:
WormStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
tau: 0.005
steps_per_update: 20.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 20.0
network_settings:
normalize: true
hidden_units: 512
num_layers: 3
vis_encode_type: simple
gamma: 0.995
gamma: 0.995
output_path: default
keep_checkpoints: 5
max_steps: 3000000
time_horizon: 1000
summary_freq: 30000
threaded: true

13
docs/Migrating.md


instead of `summaries/` and `models/`.
- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- Trainer configuration format has changed, and using a "default" behavior name has
been deprecated. (#3936)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- On the UnityEnvironment API, `get_behavior_names()` and `get_behavior_specs()` methods were combined into the property `behavior_specs` that contains a mapping from behavior names to behavior spec.
- `use_visual` and `allow_multiple_visual_obs` in the `UnityToGymWrapper` constructor

### Steps to Migrate
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
a separate trainer configuration file, under a `behaviors` section. You can move the `default` section too
if it's being used. This file should be specific to your environment, and not contain configurations for
multiple environments (unless they have the same Behavior Names).
- To upgrade your configuration files, an upgrade script has been provided. Run `python config/upgrade_config.py -h`
to see the script usage.
To do it manually, copy your `<BehaviorName>` sections from `trainer_config.yaml` into a separate trainer configuration file, under a `behaviors` section.
The `default` section is no longer needed. This new file should be specific to your environment, and not contain
configurations for multiple environments (unless they have the same Behavior Names).
- You will need to reformat your trainer settings as per the [example](Training-ML-Agents.md).
- If your training uses [curriculum](Training-ML-Agents.md#curriculum-learning), move those configurations under
the `Behavior Name` section.
- If your training uses [parameter randomization](Training-ML-Agents.md#environment-parameter-randomization), move
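
Putting the migration steps together, an upgraded, environment-specific configuration file ends up looking roughly like the sketch below. It is assembled from the WallJump_curriculum and 3DBall_randomize diffs above; which lines belong to the new side of those diffs is inferred, the indentation is reconstructed, and the top-level key for the sampler section (`parameter_randomization`) is an assumption, so check the shipped example configs rather than relying on this sketch.

```yaml
behaviors:
  BigWallJump:
    trainer_type: ppo
    hyperparameters:
      batch_size: 128
      buffer_size: 2048
      learning_rate: 0.0003
      learning_rate_schedule: linear
    network_settings:
      normalize: false
      hidden_units: 256
      num_layers: 2
    max_steps: 20000000
    time_horizon: 128
    summary_freq: 20000

# Curriculum now lives in the same file, keyed by behavior name.
curriculum:
  BigWallJump:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 100
    signal_smoothing: true
    parameters:
      big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
      big_wall_max_height: [4.0, 7.0, 8.0, 8.0]

# Parameter randomization samplers are merged into the same file as well.
# The section name below is assumed, not taken from this excerpt.
parameter_randomization:
  resampling-interval: 5000
  mass:
    sampler-type: uniform
    min_value: 0.5
    max_value: 10
```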

113
docs/Training-Configuration-File.md


| **Setting** | **Description** |
| :------------------------ | :------------------------------------------------------------------------------------------------------------------------ |
| `trainer` | The type of trainer to use: `ppo` or `sac` |
| `summary_freq` | Number of experiences that need to be collected before generating and displaying training statistics. This determines the granularity of the graphs in Tensorboard. |
| `batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in the order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. |
| `buffer_size` | Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, this is the max size of the experience buffer, which should be on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` |
| `hidden_units` | Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger. <br><br> Typical range: `32` - `512` |
| `learning_rate` | Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |
| `learning_rate_schedule` | (Optional, default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally. <br><br>`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. |
| `max_steps` | Total number of experience points that must be collected from the simulation before ending the training process. <br><br>Typical range: `5e5` - `1e7` |
| `normalize` | Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. |
| `num_layers` | The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems. <br><br> Typical range: `1` - `3` |
| `time_horizon` | How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `vis_encoder_type` | (Optional, default = `simple`) Encoder type for encoding visual observations. <br><br> `simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. |
| `init_path` | (Optional, default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (Optional, default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `trainer_type` | (default = `ppo`) The type of trainer to use: `ppo` or `sac` |
| `summary_freq` | (default = `50000`) Number of experiences that need to be collected before generating and displaying training statistics. This determines the granularity of the graphs in TensorBoard. |
| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count. <br><br>Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the save-freq option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |
| `hyperparameters -> batch_size` | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using a continuous action space, this value should be large (in the order of 1000s). If you are using a discrete action space, this value should be smaller (in order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`. |
| `hyperparameters -> buffer_size` | (default = `10240` for PPO and `50000` for SAC) Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. In SAC, the max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000` |
| `hyperparameters -> learning_rate_schedule` | (default = `linear` for PPO and `constant` for SAC) Determines how learning rate changes over time. For PPO, we recommend decaying learning rate until max_steps so learning converges more stably. However, for some cases (e.g. training for an unknown amount of time) this feature can be disabled. For SAC, we recommend holding learning rate constant so that the agent can continue to learn until its Q function converges naturally. <br><br>`linear` decays the learning_rate linearly, reaching 0 at max_steps, while `constant` keeps the learning rate constant for the entire training run. |
| `network_settings -> hidden_units` | (default = `128`) Number of units in the hidden layers of the neural network. Corresponds to how many units are in each fully connected layer of the neural network. For simple problems where the correct action is a straightforward combination of the observation inputs, this should be small. For problems where the action is a very complex interaction between the observation variables, this should be larger. <br><br> Typical range: `32` - `512` |
| `network_settings -> num_layers` | (default = `2`) The number of hidden layers in the neural network. Corresponds to how many hidden layers are present after the observation input, or after the CNN encoding of the visual observation. For simple problems, fewer layers are likely to train faster and more efficiently. More layers may be necessary for more complex control problems. <br><br> Typical range: `1` - `3` |
| `network_settings -> normalize` | (default = `false`) Whether normalization is applied to the vector observation inputs. This normalization is based on the running average and variance of the vector observation. Normalization can be helpful in cases with complex continuous control problems, but may be harmful with simpler discrete control problems. |
| `network_settings -> vis_encoder_type` | (default = `simple`) Encoder type for encoding visual observations. <br><br> `simple` (default) uses a simple encoder which consists of two convolutional layers, `nature_cnn` uses the CNN implementation proposed by [Mnih et al.](https://www.nature.com/articles/nature14236), consisting of three convolutional layers, and `resnet` uses the [IMPALA Resnet](https://arxiv.org/abs/1802.01561) consisting of three stacked layers, each with two residual blocks, making a much larger network than the other two. |
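
As a quick point of reference, here is a minimal sketch of how these common settings nest in the new configuration format. The behavior name `MyBehavior` is a placeholder, and the `batch_size`/`buffer_size` values are illustrative rather than recommendations:

```yaml
behaviors:
  MyBehavior:                  # placeholder; must match the Behavior Name in your scene
    trainer_type: ppo
    summary_freq: 50000
    time_horizon: 64
    max_steps: 5.0e5
    keep_checkpoints: 5
    threaded: true
    hyperparameters:
      learning_rate: 3.0e-4
      learning_rate_schedule: linear
      batch_size: 1024         # illustrative value
      buffer_size: 10240       # illustrative value
    network_settings:
      hidden_units: 128
      num_layers: 2
      normalize: false
      vis_encoder_type: simple
```
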
## Trainer-specific Configurations

| **Setting** | **Description** |
| :---------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `beta` | Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `epsilon` | Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `lambd` | Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the `batch_size`, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the `batch_size`, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `buffer_init_steps` | Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled. <br><br>Typical range: `1000` - `10000` |
| `init_entcoef` | How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster. <br><br>Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` |
| `save_replay_buffer` | (Optional, default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. |
| `tau` | How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability. <br><br>Typical range: `0.005` - `0.01` |
| `steps_per_update` | Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow. <br><br>Typical range: `1` - `20` |
| `hyperparameters -> buffer_init_steps` | (default = `0`) Number of experiences to collect into the buffer before updating the policy model. As the untrained policy is fairly random, pre-filling the buffer with random actions is useful for exploration. Typically, at least several episodes of experiences should be pre-filled. <br><br>Typical range: `1000` - `10000` |
| `hyperparameters -> init_entcoef` | (default = `1.0`) How much the agent should explore in the beginning of training. Corresponds to the initial entropy coefficient set at the beginning of training. In SAC, the agent is incentivized to make its actions entropic to facilitate better exploration. The entropy coefficient weighs the true reward with a bonus entropy reward. The entropy coefficient is [automatically adjusted](https://arxiv.org/abs/1812.05905) to a preset target entropy, so the `init_entcoef` only corresponds to the starting value of the entropy bonus. Increase init_entcoef to explore more in the beginning, decrease to converge to a solution faster. <br><br>Typical range: (Continuous): `0.5` - `1.0`; (Discrete): `0.05` - `0.5` |
| `hyperparameters -> save_replay_buffer` | (default = `false`) Whether to save and load the experience replay buffer as well as the model when quitting and re-starting training. This may help resumes go more smoothly, as the experiences collected won't be wiped. Note that replay buffers can be very large, and will take up a considerable amount of disk space. For that reason, we disable this feature by default. |
| `hyperparameters -> tau` | (default = `0.005`) How aggressively to update the target network used for bootstrapping value estimation in SAC. Corresponds to the magnitude of the target Q update during the SAC model update. In SAC, there are two neural networks: the target and the policy. The target network is used to bootstrap the policy's estimate of the future rewards at a given state, and is fixed while the policy is being updated. This target is then slowly updated according to tau. Typically, this value should be left at 0.005. For simple problems, increasing tau to 0.01 might reduce the time it takes to learn, at the cost of stability. <br><br>Typical range: `0.005` - `0.01` |
| `hyperparameters -> steps_per_update` | (default = `1`) Average ratio of agent steps (actions) taken to updates made of the agent's policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps. Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will improve sample efficiency (reduce the number of steps required to train) but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example environments) `steps_per_update` equal to the number of agents in the scene is a good balance. For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed. We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will usually result in a slowdown unless the environment is very slow. <br><br>Typical range: `1` - `20` |
| `hyperparameters -> reward_signal_num_update` | (default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `reward_signals` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to `steps_per_update`. |
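
For reference, the trainer-specific settings above also live under the behavior's `hyperparameters` key. A minimal sketch using the defaults from the tables; only one of the two blocks would appear inside a given `behaviors -> <Behavior Name>` entry, depending on `trainer_type`:

```yaml
# PPO-specific hyperparameters
hyperparameters:
  beta: 5.0e-3
  epsilon: 0.2
  lambd: 0.95
  num_epoch: 3

# SAC-specific hyperparameters
hyperparameters:
  buffer_init_steps: 0
  init_entcoef: 1.0
  save_replay_buffer: false
  tau: 0.005
  steps_per_update: 1
```
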
## Reward Signals

| **Setting** | **Description** |
| :---------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `extrinsic -> strength` | Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal. <br><br>Typical range: `1.00` |
| `extrinsic -> gamma` | Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1. <br><br>Typical range: `0.8` - `0.995` |
| `extrinsic -> strength` | (default = `1.0`) Factor by which to multiply the reward given by the environment. Typical ranges will vary depending on the reward signal. <br><br>Typical range: `1.00` |
| `extrinsic -> gamma` | (default = `0.99`) Discount factor for future rewards coming from the environment. This can be thought of as how far into the future the agent should care about possible rewards. In situations when the agent should be acting in the present in order to prepare for rewards in the distant future, this value should be large. In cases when rewards are more immediate, it can be smaller. Must be strictly smaller than 1. <br><br>Typical range: `0.8` - `0.995` |
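
As a sketch, using the defaults above, an extrinsic reward signal is declared under the behavior's `reward_signals` section:

```yaml
reward_signals:
  extrinsic:
    strength: 1.0
    gamma: 0.99
```
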
### Curiosity Intrinsic Reward

| **Setting** | **Description** |
| :--------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `curiosity -> strength` | Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal. <br><br>Typical range: `0.001` - `0.1` |
| `curiosity -> gamma` | Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.995` |
| `curiosity -> encoding_size` | (Optional, default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations. <br><br>Typical range: `64` - `256` |
| `curiosity -> learning_rate` | (Optional, default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable. <br><br>Typical range: `1e-5` - `1e-3` |
| `curiosity -> strength` | (default = `1.0`) Magnitude of the curiosity reward generated by the intrinsic curiosity module. This should be scaled in order to ensure it is large enough to not be overwhelmed by extrinsic reward signals in the environment. Likewise it should not be too large to overwhelm the extrinsic reward signal. <br><br>Typical range: `0.001` - `0.1` |
| `curiosity -> gamma` | (default = `0.99`) Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.995` |
| `curiosity -> encoding_size` | (default = `64`) Size of the encoding used by the intrinsic curiosity model. This value should be small enough to encourage the ICM to compress the original observation, but also not too small to prevent it from learning to differentiate between expected and actual observations. <br><br>Typical range: `64` - `256` |
| `curiosity -> learning_rate` | (default = `3e-4`) Learning rate used to update the intrinsic curiosity module. This should typically be decreased if training is unstable, and the curiosity loss is unstable. <br><br>Typical range: `1e-5` - `1e-3` |
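
A sketch of enabling curiosity alongside the extrinsic signal; the curiosity `strength` here is an illustrative value from the typical range, not a recommendation:

```yaml
reward_signals:
  extrinsic:
    strength: 1.0
    gamma: 0.99
  curiosity:
    strength: 0.02        # illustrative; typical range 0.001 - 0.1
    gamma: 0.99
    encoding_size: 64
    learning_rate: 3.0e-4
```
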
### GAIL Intrinsic Reward

| **Setting** | **Description** |
| :---------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `gail -> strength` | Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. <br><br>Typical range: `0.01` - `1.0` |
| `gail -> gamma` | Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.9` |
| `gail -> demo_path` | The path to your .demo file or directory of .demo files. |
| `gail -> encoding_size` | (Optional, default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times. <br><br>Typical range: `64` - `256` |
| `gail -> strength` | (default = `1.0`) Factor by which to multiply the raw reward. Note that when using GAIL with an Extrinsic Signal, this value should be set lower if your demonstrations are suboptimal (e.g. from a human), so that a trained agent will focus on receiving extrinsic rewards instead of exactly copying the demonstrations. Keep the strength below about 0.1 in those cases. <br><br>Typical range: `0.01` - `1.0` |
| `gail -> gamma` | (default = `0.99`) Discount factor for future rewards. <br><br>Typical range: `0.8` - `0.9` |
| `gail -> demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. |
| `gail -> encoding_size` | (default = `64`) Size of the hidden layer used by the discriminator. This value should be small enough to encourage the discriminator to compress the original observation, but also not too small to prevent it from learning to differentiate between demonstrated and actual behavior. Dramatically increasing this size will also negatively affect training times. <br><br>Typical range: `64` - `256` |
| `gail -> use_actions` | (Optional, default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. |
| `gail -> use_vail` | (Optional, default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. |
| `gail -> use_actions` | (default = `false`) Determines whether the discriminator should discriminate based on both observations and actions, or just observations. Set to True if you want the agent to mimic the actions from the demonstrations, and False if you'd rather have the agent visit the same states as in the demonstrations but with possibly different actions. Setting to False is more likely to be stable, especially with imperfect demonstrations, but may learn slower. |
| `gail -> use_vail` | (default = `false`) Enables a variational bottleneck within the GAIL discriminator. This forces the discriminator to learn a more general representation and reduces its tendency to be "too good" at discriminating, making learning more stable. However, it does increase training time. Enable this if you notice your imitation learning is unstable, or unable to learn the task at hand. |

### Reward Signal Settings for SAC

All of the reward signal configurations described above apply to both PPO and
SAC. There is one configuration, common to all reward signals, that only applies to SAC.

| **Setting** | **Description** |
| :------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `reward_signals -> reward_signal_num_update` | (Optional, default = `steps_per_update`) Number of steps per mini batch sampled and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated. However, to imitate the training procedure in certain imitation learning papers (e.g. [Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)), we may want to update the reward signal (GAIL) M times for every update of the policy. We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update` under `reward_signals` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to `steps_per_update`. |
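
Putting this section together, here is a sketch of a `reward_signals` block that adds GAIL to the extrinsic signal. The demo path is a placeholder, and the GAIL `strength` is an illustrative low value for use alongside extrinsic rewards:

```yaml
reward_signals:
  extrinsic:
    strength: 1.0
    gamma: 0.99
  gail:
    strength: 0.01                  # keep low if demonstrations are suboptimal
    gamma: 0.99
    demo_path: Demos/Expert.demo    # placeholder; point this at your own .demo file
    encoding_size: 64
    use_actions: false
    use_vail: false
```
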
## Behavioral Cloning

| **Setting** | **Description** |
| :------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `demo_path` | The path to your .demo file or directory of .demo files. |
| `strength` | Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy. <br><br>Typical range: `0.1` - `0.5` |
| `steps` | During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. |
| `batch_size` | Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size`. <br><br>Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` |
| `num_epoch` | Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO. <br><br>Typical range: `3` - `10` |
| `samples_per_update` | (Optional, default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step. <br><br>Typical range: `buffer_size` |
| `init_path` | Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `demo_path` | (Required, no default) The path to your .demo file or directory of .demo files. |
| `strength` | (default = `1.0`) Learning rate of the imitation relative to the learning rate of PPO, and roughly corresponds to how strongly we allow BC to influence the policy. <br><br>Typical range: `0.1` - `0.5` |
| `steps` | (default = `0`) During BC, it is often desirable to stop using demonstrations after the agent has "seen" rewards, and allow it to optimize past the available demonstrations and/or generalize outside of the provided demonstrations. steps corresponds to the training steps over which BC is active. The learning rate of BC will anneal over the steps. Set the steps to 0 for constant imitation over the entire training run. |
| `batch_size` | (default = `batch_size` of trainer) Number of demonstration experiences used for one iteration of a gradient descent update. If not specified, it will default to the `batch_size` of the trainer. <br><br>Typical range: (Continuous): `512` - `5120`; (Discrete): `32` - `512` |
| `num_epoch` | (default = `num_epoch` of trainer) Number of passes through the experience buffer during gradient descent. If not specified, it will default to the number of epochs set for PPO. <br><br>Typical range: `3` - `10` |
| `samples_per_update` | (default = `0`) Maximum number of samples to use during each imitation update. You may want to lower this if your demonstration dataset is very large to avoid overfitting the policy on demonstrations. Set to 0 to train over all of the demonstrations at each update step. <br><br>Typical range: `buffer_size` |
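
A sketch of a `behavioral_cloning` section inside a behavior entry; the demo path is a placeholder and the `strength`/`steps` values are illustrative picks from the typical ranges:

```yaml
behavioral_cloning:
  demo_path: Demos/Expert.demo    # placeholder; point this at your own .demo file
  strength: 0.5                   # illustrative; typical range 0.1 - 0.5
  steps: 150000                   # illustrative; 0 keeps BC active for the whole run
```
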
You can enable your agents to use memory, by setting `use_recurrent` to `true`
You can enable your agents to use memory by adding a `memory` section under `network_settings`,
| `use_recurrent` | Whether to enable this option or not. |
| `memory_size` | Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task. <br><br>Typical range: `32` - `256` |
| `sequence_length` | Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train. <br><br>Typical range: `4` - `128` |
| `network_settings -> memory -> memory_size` | (default = `128`) Size of the memory an agent must keep. In order to use a LSTM, training requires a sequence of experiences instead of single experiences. Corresponds to the size of the array of floating point numbers used to store the hidden state of the recurrent neural network of the policy. This value must be a multiple of 2, and should scale with the amount of information you expect the agent will need to remember in order to successfully complete the task. <br><br>Typical range: `32` - `256` |
| `network_settings -> memory -> sequence_length` | (default = `64`) Defines how long the sequences of experiences must be while training. Note that if this number is too small, the agent will not be able to remember things over longer periods of time. If this number is too large, the neural network will take longer to train. <br><br>Typical range: `4` - `128` |
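
A sketch of the corresponding `memory` block, using the defaults above:

```yaml
network_settings:
  memory:
    memory_size: 128
    sequence_length: 64
```
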
A few considerations when deciding to use memory:

| **Setting** | **Description** |
| :-------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `save_steps` | Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13. <br><br>A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent. <br><br> Typical range: `10000` - `100000` |
| `team_change` | Number of _trainer_steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch. <br><br>A larger value of `team-change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies and so the agent may fail against the next batch of opponents. <br><br> The value of `team-change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously. <br><br> Typical range: 4x-10x where x=`save_steps` |
| `swap_steps` | Number of _ghost steps_ (not trainer steps) between swapping the opponent's policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team-change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)` <br><br> Typical range: `10000` - `100000` |
| `play_against_latest_model_ratio` | Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration. <br><br> A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy. <br><br> Typical range: `0.0` - `1.0` |
| `window` | Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. <br><br> Typical range: `5` - `30` |
| `save_steps` | (default = `20000`) Number of _trainer steps_ between snapshots. For example, if `save_steps=10000` then a snapshot of the current policy will be saved every `10000` trainer steps. Note, trainer steps are counted per agent. For more information, please see the [migration doc](Migrating.md) after v0.13. <br><br>A larger value of `save_steps` will yield a set of opponents that cover a wider range of skill levels and possibly play styles since the policy receives more training. As a result, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. This value is also dependent on how intrinsically difficult the environment is for the agent. <br><br> Typical range: `10000` - `100000` |
| `team_change` | (default = `5 * save_steps`) Number of _trainer_steps_ between switching the learning team. This is the number of trainer steps the teams associated with a specific ghost trainer will train before a different team becomes the new learning team. It is possible that, in asymmetric games, opposing teams require fewer trainer steps to make similar performance gains. This enables users to train a more complicated team of agents for more trainer steps than a simpler team of agents per team switch. <br><br>A larger value of `team-change` will allow the agent to train longer against its opponents. The longer an agent trains against the same set of opponents the more able it will be to defeat them. However, training against them for too long may result in overfitting to the particular opponent strategies and so the agent may fail against the next batch of opponents. <br><br> The value of `team-change` will determine how many snapshots of the agent's policy are saved to be used as opponents for the other team. So, we recommend setting this value as a function of the `save_steps` parameter discussed previously. <br><br> Typical range: 4x-10x where x=`save_steps` |
| `swap_steps` | (default = `10000`) Number of _ghost steps_ (not trainer steps) between swapping the opponent's policy with a different snapshot. A 'ghost step' refers to a step taken by an agent _that is following a fixed policy and not learning_. The reason for this distinction is that in asymmetric games, we may have teams with an unequal number of agents e.g. a 2v1 scenario like our Strikers Vs Goalie example environment. The team with two agents collects twice as many agent steps per environment step as the team with one agent. Thus, these two values will need to be distinct to ensure that the same number of trainer steps corresponds to the same number of opponent swaps for each team. The formula for `swap_steps` if a user desires `x` swaps of a team with `num_agents` agents against an opponent team with `num_opponent_agents` agents during `team-change` total steps is: `(num_agents / num_opponent_agents) * (team_change / x)` <br><br> Typical range: `10000` - `100000` |
| `play_against_latest_model_ratio` | (default = `0.5`) Probability an agent will play against the latest opponent policy. With probability 1 - `play_against_latest_model_ratio`, the agent will play against a snapshot of its opponent from a past iteration. <br><br> A larger value of `play_against_latest_model_ratio` indicates that an agent will be playing against the current opponent more often. Since the agent is updating its policy, the opponent will be different from iteration to iteration. This can lead to an unstable learning environment, but poses the agent with an [auto-curricula](https://openai.com/blog/emergent-tool-use/) of increasingly challenging situations which may lead to a stronger final policy. <br><br> Typical range: `0.0` - `1.0` |
| `window` | (default = `10`) Size of the sliding window of past snapshots from which the agent's opponents are sampled. For example, a `window` size of 5 will save the last 5 snapshots taken. Each time a new snapshot is taken, the oldest is discarded. A larger value of `window` means that an agent's pool of opponents will contain a larger diversity of behaviors since it will contain policies from earlier in the training run. Like in the `save_steps` hyperparameter, the agent trains against a wider variety of opponents. Learning a policy to defeat more diverse opponents is a harder problem and so may require more overall training steps but also may lead to more general and robust policy at the end of training. <br><br> Typical range: `5` - `30` |
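
A sketch of these settings using the defaults above. In the trainer configuration they sit under a `self_play` section of the behavior entry for each learning team:

```yaml
self_play:
  save_steps: 20000
  team_change: 100000        # default is 5 * save_steps
  swap_steps: 10000
  play_against_latest_model_ratio: 0.5
  window: 10
```
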
### Note on Reward Signals

141
docs/Training-ML-Agents.md


The rest of this guide breaks down the different sub-sections of the trainer config file
and explains the possible settings for each.
**NOTE:** The configuration file format has been changed from 0.17.0 and onwards. To convert
an old set of configuration files (trainer config, curriculum, and sampler files) to the new
format, a script has been provided. Run `python config/upgrade_config.py -h` in your console
to see the script's usage.
### Behavior Configurations
The primary section of the trainer config file is a

```yaml
behaviors:
  BehaviorPPO:
    trainer_type: ppo

    hyperparameters:
      # Hyperparameters common to PPO and SAC
      batch_size: 1024
      buffer_size: 10240
      learning_rate: 3.0e-4
      learning_rate_schedule: linear

      # PPO-specific hyperparameters
      beta: 5.0e-3
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3

    # Configuration of the neural network (common to PPO/SAC)
    network_settings:
      vis_encoder_type: simple
      normalize: false
      hidden_units: 128
      num_layers: 2
      # memory
      memory:
        sequence_length: 64
        memory_size: 256

    # Trainer configurations common to all trainers
    keep_checkpoints: 5
    init_path: null

    # behavior cloning
    behavioral_cloning:
      # < other behavioral cloning settings >
      samples_per_update: 0

    reward_signals:
      # environment reward (default)
      extrinsic:
        strength: 1.0
        gamma: 0.99
```

```yaml
behaviors:
  BehaviorSAC:
    trainer_type: sac

    hyperparameters:
      # Hyperparameters common to PPO and SAC
      # <Same as PPO config>

      # SAC-specific hyperparameters
      # Replaces the "PPO-specific hyperparameters" section above
      buffer_init_steps: 0
      tau: 0.005
      steps_per_update: 10.0
      save_replay_buffer: false
      init_entcoef: 0.5
      reward_signal_steps_per_update: 10.0

    # Configuration of the neural network (common to PPO/SAC)
    network_settings:
      # <Same as PPO config>

    # Trainer configurations common to all trainers
    # <Same as PPO config>

    # pre-training using behavior cloning
    behavioral_cloning:
      # <Same as PPO config>

    reward_signals:
      # environment reward (default)
      extrinsic:
        # <Same as PPO config>
```

We now break apart the components of the configuration file and describe what
each of these parameters mean and provide guidelines on how to set them. See
[Training Configuration File](Training-Configuration-File.md) for a detailed
description of all the configurations listed above, along with their defaults.
Unless otherwise specified, omitting a configuration will revert it to its default.
To enable curriculum learning, you need to add a `curriculum` sub-section to the trainer
configuration YAML file. Within this sub-section, add an entry for each behavior that defines
the curriculum for that behavior. Here is one example:
```yml
behaviors:
  BehaviorY:
    # < Trainer configuration for BehaviorY >

# Add this section
curriculum:
  BehaviorY:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 100
    signal_smoothing: true
    parameters:
      wall_height: [1.5, 2.0, 2.5, 4.0]
```
Each group of Agents under the same `Behavior Name` in an environment can have a

```yml
behaviors:
  BigWallJump:
    # < Trainer parameters for BigWallJump >
  SmallWallJump:
    # < Trainer parameters for SmallWallJump >

# Curriculum configuration
curriculum:
  BigWallJump:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 100
    signal_smoothing: true
    parameters:
      big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
  SmallWallJump:
    measure: progress
    thresholds: [0.1, 0.3, 0.5]
    min_lesson_length: 100
    signal_smoothing: true
    parameters:
      small_wall_height: [1.5, 2.0, 2.5, 4.0]
```
The curriculum for each Behavior has the following parameters:

#### Training with a Curriculum
Once we have specified our metacurriculum and curricula, we can launch
`mlagents-learn`, pointing it to the config file containing
our curricula, and PPO will train using Curriculum Learning. For example, to
train agents in the Wall Jump environment with curriculum learning, we can run:

233
ml-agents/mlagents/trainers/cli_utils.py


from typing import Set, Dict, Any, TextIO
import os
import yaml
from mlagents.trainers.exception import TrainerConfigError
from mlagents_envs.environment import UnityEnvironment
import argparse

def __call__(self, arg_parser, namespace, values, option_string=None):
delattr(namespace, self.dest)
StoreConfigFile.trainer_config_path = values
def _create_parser() -> argparse.ArgumentParser:
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
argparser.add_argument(
"trainer_config_path", action=StoreConfigFile, nargs="?", default=None
)
argparser.add_argument(
"--env",
default=None,
dest="env_path",
help="Path to the Unity executable to train",
action=DetectDefault,
)
argparser.add_argument(
"--lesson",
default=0,
type=int,
help="The lesson to start with when performing curriculum training",
action=DetectDefault,
)
argparser.add_argument(
"--load",
default=False,
dest="load_model",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(
"--resume",
default=False,
dest="resume",
action=DetectDefaultStoreTrue,
help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
"If set, the training code loads an already trained model to initialize the neural network "
"before resuming training. This option is only valid when the models exist, and have the same "
"behavior names as the current agents in your scene.",
)
argparser.add_argument(
"--force",
default=False,
dest="force",
action=DetectDefaultStoreTrue,
help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
"this flag, attempting to train a model with a run-id that has been used before will throw "
"an error.",
)
argparser.add_argument(
"--run-id",
default="ppo",
help="The identifier for the training run. This identifier is used to name the "
"subdirectories in which the trained model and summary statistics are saved as well "
"as the saved model itself. If you use TensorBoard to view the training statistics, "
"always set a unique run-id for each training run. (The statistics for all runs with the "
"same id are combined as if they were produced by a the same session.)",
action=DetectDefault,
)
argparser.add_argument(
"--initialize-from",
metavar="RUN_ID",
default=None,
help="Specify a previously saved run ID from which to initialize the model from. "
"This can be used, for instance, to fine-tune an existing model on a new environment. "
"Note that the previously saved models must have the same behavior parameters as your "
"current environment.",
action=DetectDefault,
)
argparser.add_argument(
"--save-freq",
default=50000,
type=int,
help="How often (in steps) to save the model during training",
action=DetectDefault,
)
argparser.add_argument(
"--seed",
default=-1,
type=int,
help="A number to use as a seed for the random number generator used by the training code",
action=DetectDefault,
)
argparser.add_argument(
"--train",
default=False,
dest="train_model",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS,
)
argparser.add_argument(
"--inference",
default=False,
dest="inference",
action=DetectDefaultStoreTrue,
help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
"a model trained with an existing run ID.",
)
argparser.add_argument(
"--base-port",
default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
type=int,
help="The starting port for environment communication. Each concurrent Unity environment "
"instance will get assigned a port sequentially, starting from the base-port. Each instance "
"will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
"each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
"than an executable, the base port will be ignored.",
action=DetectDefault,
)
argparser.add_argument(
"--num-envs",
default=1,
type=int,
help="The number of concurrent Unity environment instances to collect experiences "
"from when training",
action=DetectDefault,
)
argparser.add_argument(
"--debug",
default=False,
action=DetectDefaultStoreTrue,
help="Whether to enable debug-level logging for some parts of the code",
)
argparser.add_argument(
"--env-args",
default=None,
nargs=argparse.REMAINDER,
help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
"process these as Unity Command Line Arguments. You should choose different argument names if "
"you want to create environment-specific arguments. All arguments after this flag will be "
"passed to the executable.",
action=DetectDefault,
)
argparser.add_argument(
"--cpu",
default=False,
action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)
eng_conf = argparser.add_argument_group(title="Engine Configuration")
eng_conf.add_argument(
"--width",
default=84,
type=int,
help="The width of the executable window of the environment(s) in pixels "
"(ignored for editor training).",
action=DetectDefault,
)
eng_conf.add_argument(
"--height",
default=84,
type=int,
help="The height of the executable window of the environment(s) in pixels "
"(ignored for editor training)",
action=DetectDefault,
)
eng_conf.add_argument(
"--quality-level",
default=5,
type=int,
help="The quality level of the environment(s). Equivalent to calling "
"QualitySettings.SetQualityLevel in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--time-scale",
default=20,
type=float,
help="The time scale of the Unity environment(s). Equivalent to setting "
"Time.timeScale in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--target-frame-rate",
default=-1,
type=int,
help="The target frame rate of the Unity environment(s). Equivalent to setting "
"Application.targetFrameRate in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--capture-frame-rate",
default=60,
type=int,
help="The capture frame rate of the Unity environment(s). Equivalent to setting "
"Time.captureFramerate in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--no-graphics",
default=False,
action=DetectDefaultStoreTrue,
help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
"the graphics driver. Use this only if your agents don't use visual observations.",
)
return argparser
def load_config(config_path: str) -> Dict[str, Any]:
try:
with open(config_path) as data_file:
return _load_config(data_file)
except IOError:
abs_path = os.path.abspath(config_path)
raise TrainerConfigError(f"Config file could not be found at {abs_path}.")
except UnicodeDecodeError:
raise TrainerConfigError(
f"There was an error decoding Config file from {config_path}. "
f"Make sure your file is save using UTF-8"
)
def _load_config(fp: TextIO) -> Dict[str, Any]:
"""
Load the yaml config from the file-like object.
"""
try:
return yaml.safe_load(fp)
except yaml.parser.ParserError as e:
raise TrainerConfigError(
"Error parsing yaml file. Please check for formatting errors. "
"A tool such as http://www.yamllint.com/ can be helpful with this."
) from e
parser = _create_parser()
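
For reference, a minimal sketch of exercising the relocated `load_config` helper on its own; the config path below is illustrative, and any trainer configuration YAML works:

```python
from mlagents.trainers.cli_utils import load_config

# load_config reads a trainer configuration YAML into a plain dict and raises
# TrainerConfigError if the file cannot be found or decoded.
config = load_config("config/ppo/3DBall.yaml")  # illustrative path
print(sorted(config.keys()))  # a valid trainer config contains at least "behaviors"
```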

39
ml-agents/mlagents/trainers/components/bc/module.py


from mlagents.trainers.policy.tf_policy import TFPolicy
from .model import BCModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import BehavioralCloningSettings
class BCModule:

settings: BehavioralCloningSettings,
strength: float,
demo_path: str,
steps: int,
batch_size: int = None,
num_epoch: int = None,
samples_per_update: int = 0,
):
"""
A BC trainer that can be used inline with RL.

:param samples_per_update: Maximum number of samples to train on during each BC update.
"""
self.policy = policy
self.current_lr = policy_learning_rate * strength
self.model = BCModel(policy, self.current_lr, steps)
self.current_lr = policy_learning_rate * settings.strength
self.model = BCModel(policy, self.current_lr, settings.steps)
demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.brain
self.batch_size = batch_size if batch_size else default_batch_size
self.num_epoch = num_epoch if num_epoch else default_num_epoch
self.batch_size = (
settings.batch_size if settings.batch_size else default_batch_size
)
self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
self.n_sequences = max(
min(self.batch_size, self.demonstration_buffer.num_experiences)
// policy.sequence_length,

self.has_updated = False
self.use_recurrent = self.policy.use_recurrent
self.samples_per_update = samples_per_update
self.samples_per_update = settings.samples_per_update
@staticmethod
def check_config(config_dict: Dict[str, Any]) -> None:
"""
Check the behavioral_cloning config for the required keys.
:param config_dict: Pretraining section of trainer_config
"""
param_keys = ["strength", "demo_path", "steps"]
for k in param_keys:
if k not in config_dict:
raise UnityTrainerException(
"The required pre-training hyper-parameter {0} was not defined. Please check your \
trainer YAML file.".format(
k
)
)
def update(self) -> Dict[str, Any]:
"""

29
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


from typing import Any, Dict, List
from typing import Any, Dict
from collections import namedtuple
import numpy as np
import abc

from mlagents_envs.logging_util import get_logger
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import RewardSignalSettings
logger = get_logger(__name__)

class RewardSignal(abc.ABC):
def __init__(self, policy: TFPolicy, strength: float, gamma: float):
def __init__(self, policy: TFPolicy, settings: RewardSignalSettings):
:param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
:param gamma: The time discounting factor used for this reward.
:param settings: Settings parameters for this Reward Signal, including gamma and strength.
:return: A RewardSignal object.
"""
class_name = self.__class__.__name__

# no natural end, e.g. GAIL or Curiosity
self.use_terminal_states = True
self.update_dict: Dict[str, tf.Tensor] = {}
self.gamma = gamma
self.gamma = settings.gamma
self.strength = strength
self.strength = settings.strength
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:

:return: A dict that corresponds to the feed_dict needed for the update.
"""
return {}
@classmethod
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
"""
Check the config dict, and throw an error if there are missing hyperparameters.
"""
param_keys = param_keys or []
for k in param_keys:
if k not in config_dict:
raise UnityTrainerException(
"The hyper-parameter {0} could not be found for {1}.".format(
k, cls.__name__
)
)
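
As a hedged illustration of the new pattern (keyword names assumed to match the attributes read in `__init__` above), reward signal options now arrive as a settings object instead of a raw config dict:

```python
from mlagents.trainers.settings import RewardSignalSettings

# Assumed keyword arguments: RewardSignalSettings exposes gamma and strength,
# which RewardSignal.__init__ copies onto the signal.
extrinsic_settings = RewardSignalSettings(gamma=0.99, strength=1.0)
print(extrinsic_settings.gamma, extrinsic_settings.strength)
```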

36
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


from typing import Any, Dict, List
from typing import Any, Dict
import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import CuriositySettings
def __init__(
self,
policy: TFPolicy,
strength: float,
gamma: float,
encoding_size: int = 128,
learning_rate: float = 3e-4,
):
def __init__(self, policy: TFPolicy, settings: CuriositySettings):
:param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
reward multiplied by the strength parameter
:param gamma: The time discounting factor used for this reward.
:param encoding_size: The size of the hidden encoding layer for the ICM
:param learning_rate: The learning rate for the ICM.
:param settings: CuriositySettings object that contains the parameters
(including encoding size and learning rate) for this CuriosityRewardSignal.
super().__init__(policy, strength, gamma)
super().__init__(policy, settings)
policy, encoding_size=encoding_size, learning_rate=learning_rate
policy,
encoding_size=settings.encoding_size,
learning_rate=settings.learning_rate,
)
self.use_terminal_states = False
self.update_dict = {

unscaled_reward * float(self.has_updated) * self.strength, 0, 1
)
return RewardSignalResult(scaled_reward, unscaled_reward)
@classmethod
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
"""
Checks the config and throw an exception if a hyperparameter is missing. Curiosity requires strength,
gamma, and encoding size at minimum.
"""
param_keys = ["strength", "gamma", "encoding_size"]
super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int

12
ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


from typing import Any, Dict, List
import numpy as np
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

class ExtrinsicRewardSignal(RewardSignal):
@classmethod
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
"""
Checks the config and throw an exception if a hyperparameter is missing. Extrinsic requires strength and gamma
at minimum.
"""
param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)

47
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from typing import Any, Dict, List
from typing import Any, Dict
import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import GAILSettings
def __init__(
self,
policy: TFPolicy,
strength: float,
gamma: float,
demo_path: str,
encoding_size: int = 64,
learning_rate: float = 3e-4,
use_actions: bool = False,
use_vail: bool = False,
):
def __init__(self, policy: TFPolicy, settings: GAILSettings):
:param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
reward multiplied by the strength parameter
:param gamma: The time discounting factor used for this reward.
:param demo_path: The path to the demonstration file
:param num_epoch: The number of epochs to train over the training buffer for the discriminator.
:param encoding_size: The size of the the hidden layers of the discriminator
:param learning_rate: The Learning Rate used during GAIL updates.
:param use_actions: Whether or not to use the actions for the discriminator.
:param use_vail: Whether or not to use a variational bottleneck for the discriminator.
:param settings: The settings for this GAILRewardSignal.
super().__init__(policy, strength, gamma)
super().__init__(policy, settings)
policy, 128, learning_rate, encoding_size, use_actions, use_vail
policy,
128,
settings.learning_rate,
settings.encoding_size,
settings.use_actions,
settings.use_vail,
demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.brain
)
self.has_updated = False
self.update_dict: Dict[str, tf.Tensor] = {

)
scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
return RewardSignalResult(scaled_reward, unscaled_reward)
@classmethod
def check_config(
cls, config_dict: Dict[str, Any], param_keys: List[str] = None
) -> None:
"""
Checks the config and throw an exception if a hyperparameter is missing. GAIL requires strength and gamma
at minimum.
"""
param_keys = ["strength", "gamma", "demo_path"]
super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int

22
ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py


from typing import Any, Dict, Type
from typing import Dict, Type
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignal
from mlagents.trainers.components.reward_signals.extrinsic.signal import (

CuriosityRewardSignal,
)
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
NAME_TO_CLASS: Dict[str, Type[RewardSignal]] = {
"extrinsic": ExtrinsicRewardSignal,
"curiosity": CuriosityRewardSignal,
"gail": GAILRewardSignal,
NAME_TO_CLASS: Dict[RewardSignalType, Type[RewardSignal]] = {
RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal,
RewardSignalType.CURIOSITY: CuriosityRewardSignal,
RewardSignalType.GAIL: GAILRewardSignal,
policy: TFPolicy, name: str, config_entry: Dict[str, Any]
policy: TFPolicy, name: RewardSignalType, settings: RewardSignalSettings
) -> RewardSignal:
"""
Creates a reward signal class based on the name and config entry provided as a dict.

rcls = NAME_TO_CLASS.get(name)
if not rcls:
raise UnityTrainerException("Unknown reward signal type {0}".format(name))
rcls.check_config(config_entry)
try:
class_inst = rcls(policy, **config_entry)
except TypeError:
raise UnityTrainerException(
"Unknown parameters given for reward signal {0}".format(name)
)
class_inst = rcls(policy, settings)
return class_inst
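
A small sketch of the enum-keyed factory registry; it only inspects the mapping and needs no live policy:

```python
from mlagents.trainers.settings import RewardSignalType
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    NAME_TO_CLASS,
)

# NAME_TO_CLASS is now keyed by RewardSignalType instead of raw strings.
for signal_type, signal_class in NAME_TO_CLASS.items():
    print(signal_type.value, "->", signal_class.__name__)
```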

66
ml-agents/mlagents/trainers/curriculum.py


import json
from typing import Dict, Any, TextIO
from typing import Dict, Any
from .exception import CurriculumConfigError, CurriculumLoadingError
from mlagents.trainers.exception import CurriculumConfigError
from mlagents.trainers.settings import CurriculumSettings
def __init__(self, brain_name: str, config: Dict):
def __init__(self, brain_name: str, settings: CurriculumSettings):
"""
Initializes a Curriculum object.
:param brain_name: Name of the brain this Curriculum is associated with

self.measure = None
self._lesson_num = 0
self.brain_name = brain_name
self.config = config
self.settings = settings
for key in [
"parameters",
"measure",
"thresholds",
"min_lesson_length",
"signal_smoothing",
]:
if key not in self.config:
raise CurriculumConfigError(
f"{brain_name} curriculum config does not contain a {key} field."
)
self.smoothing_value = 0
self.measure = self.config["measure"]
self.min_lesson_length = self.config["min_lesson_length"]
self.max_lesson_num = len(self.config["thresholds"])
self.measure = self.settings.measure
self.min_lesson_length = self.settings.min_lesson_length
self.max_lesson_num = len(self.settings.thresholds)
parameters = self.config["parameters"]
parameters = self.settings.parameters
for key in parameters:
if len(parameters[key]) != self.max_lesson_num + 1:
raise CurriculumConfigError(

steps completed).
:return Whether the lesson was incremented.
"""
if not self.config or not measure_val or math.isnan(measure_val):
if not self.settings or not measure_val or math.isnan(measure_val):
if self.config["signal_smoothing"]:
if self.settings.signal_smoothing:
if measure_val > self.config["thresholds"][self.lesson_num]:
if measure_val > self.settings.thresholds[self.lesson_num]:
parameters = self.config["parameters"]
parameters = self.settings.parameters
for key in parameters:
config[key] = parameters[key][self.lesson_num]
logger.info(

current lesson is returned.
:return: The configuration of the reset parameters.
"""
if not self.config:
if not self.settings:
parameters = self.config["parameters"]
parameters = self.settings.parameters
@staticmethod
def load_curriculum_file(config_path: str) -> Dict:
try:
with open(config_path) as data_file:
return Curriculum._load_curriculum(data_file)
except IOError:
raise CurriculumLoadingError(
"The file {0} could not be found.".format(config_path)
)
except UnicodeDecodeError:
raise CurriculumLoadingError(
"There was an error decoding {}".format(config_path)
)
@staticmethod
def _load_curriculum(fp: TextIO) -> Dict:
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:
raise CurriculumLoadingError(
"Error parsing JSON file. Please check for formatting errors. "
"A tool such as https://jsonlint.com/ can be helpful with this."
) from e
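
To connect the YAML shown earlier with the refactored class, a hedged sketch follows; the keyword names are assumed to mirror the fields `Curriculum.__init__` reads from `CurriculumSettings`:

```python
from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.settings import CurriculumSettings

# Keyword names assumed from the attributes accessed above (measure, thresholds, ...).
settings = CurriculumSettings(
    measure="progress",
    thresholds=[0.1, 0.3, 0.5],
    min_lesson_length=100,
    signal_smoothing=True,
    parameters={"wall_height": [1.5, 2.0, 2.5, 4.0]},
)
curriculum = Curriculum("BehaviorY", settings)
print(curriculum.get_config())  # lesson 0 -> {"wall_height": 1.5}
```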

22
ml-agents/mlagents/trainers/ghost/trainer.py


brain_name,
controller,
reward_buff_cap,
trainer_parameters,
trainer_settings,
training,
run_id,
):

:param brain_name: The name of the brain associated with trainer config
:param controller: GhostController that coordinates all ghost trainers and calculates ELO
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
)
self.trainer = trainer

# Set the logging to print ELO in the console
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True)
self_play_parameters = trainer_parameters["self_play"]
self.window = self_play_parameters.get("window", 10)
self.play_against_latest_model_ratio = self_play_parameters.get(
"play_against_latest_model_ratio", 0.5
self_play_parameters = trainer_settings.self_play
self.window = self_play_parameters.window
self.play_against_latest_model_ratio = (
self_play_parameters.play_against_latest_model_ratio
)
if (
self.play_against_latest_model_ratio > 1.0

"The play_against_latest_model_ratio is not between 0 and 1."
)
self.steps_between_save = self_play_parameters.get("save_steps", 20000)
self.steps_between_swap = self_play_parameters.get("swap_steps", 20000)
self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
self.steps_between_save = self_play_parameters.save_steps
self.steps_between_swap = self_play_parameters.swap_steps
self.steps_to_train_team = self_play_parameters.team_change
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \

self.last_team_change: int = 0
# Chosen because it is the initial ELO in Chess
self.initial_elo: float = self_play_parameters.get("initial_elo", 1200.0)
self.initial_elo: float = self_play_parameters.initial_elo
self.policy_elos: List[float] = [self.initial_elo] * (
self.window + 1
) # for learning policy

377
ml-agents/mlagents/trainers/learn.py


# # Unity ML-Agents Toolkit
import argparse
import yaml
import os

from typing import Callable, Optional, List, NamedTuple, Dict
from typing import Callable, Optional, List, Dict
import mlagents.trainers
import mlagents_envs

from mlagents.trainers.trainer_util import (
load_config,
TrainerFactory,
handle_existing_directories,
assemble_curriculum_config,
)
from mlagents.trainers.trainer_util import TrainerFactory, handle_existing_directories
from mlagents.trainers.stats import (
TensorboardWriter,
CSVWriter,

)
from mlagents.trainers.cli_utils import (
StoreConfigFile,
DetectDefault,
DetectDefaultStoreTrue,
)
from mlagents.trainers.cli_utils import parser
from mlagents.trainers.exception import SamplerException, TrainerConfigError
from mlagents.trainers.exception import SamplerException
from mlagents.trainers.settings import RunOptions
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel

logger = logging_util.get_logger(__name__)
def _create_parser():
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
argparser.add_argument("trainer_config_path", action=StoreConfigFile)
argparser.add_argument(
"--env",
default=None,
dest="env_path",
help="Path to the Unity executable to train",
action=DetectDefault,
)
argparser.add_argument(
"--lesson",
default=0,
type=int,
help="The lesson to start with when performing curriculum training",
action=DetectDefault,
)
argparser.add_argument(
"--keep-checkpoints",
default=5,
type=int,
help="The maximum number of model checkpoints to keep. Checkpoints are saved after the"
"number of steps specified by the save-freq option. Once the maximum number of checkpoints"
"has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
action=DetectDefault,
)
argparser.add_argument(
"--load",
default=False,
dest="load_model",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(
"--resume",
default=False,
dest="resume",
action=DetectDefaultStoreTrue,
help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
"If set, the training code loads an already trained model to initialize the neural network "
"before resuming training. This option is only valid when the models exist, and have the same "
"behavior names as the current agents in your scene.",
)
argparser.add_argument(
"--force",
default=False,
dest="force",
action=DetectDefaultStoreTrue,
help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
"this flag, attempting to train a model with a run-id that has been used before will throw "
"an error.",
)
argparser.add_argument(
"--run-id",
default="ppo",
help="The identifier for the training run. This identifier is used to name the "
"subdirectories in which the trained model and summary statistics are saved as well "
"as the saved model itself. If you use TensorBoard to view the training statistics, "
"always set a unique run-id for each training run. (The statistics for all runs with the "
"same id are combined as if they were produced by a the same session.)",
action=DetectDefault,
)
argparser.add_argument(
"--initialize-from",
metavar="RUN_ID",
default=None,
help="Specify a previously saved run ID from which to initialize the model from. "
"This can be used, for instance, to fine-tune an existing model on a new environment. "
"Note that the previously saved models must have the same behavior parameters as your "
"current environment.",
action=DetectDefault,
)
argparser.add_argument(
"--save-freq",
default=50000,
type=int,
help="How often (in steps) to save the model during training",
action=DetectDefault,
)
argparser.add_argument(
"--seed",
default=-1,
type=int,
help="A number to use as a seed for the random number generator used by the training code",
action=DetectDefault,
)
argparser.add_argument(
"--train",
default=False,
dest="train_model",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS,
)
argparser.add_argument(
"--inference",
default=False,
dest="inference",
action=DetectDefaultStoreTrue,
help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
"a model trained with an existing run ID.",
)
argparser.add_argument(
"--base-port",
default=UnityEnvironment.BASE_ENVIRONMENT_PORT,
type=int,
help="The starting port for environment communication. Each concurrent Unity environment "
"instance will get assigned a port sequentially, starting from the base-port. Each instance "
"will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
"each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
"than an executable, the base port will be ignored.",
action=DetectDefault,
)
argparser.add_argument(
"--num-envs",
default=1,
type=int,
help="The number of concurrent Unity environment instances to collect experiences "
"from when training",
action=DetectDefault,
)
argparser.add_argument(
"--no-graphics",
default=False,
action=DetectDefaultStoreTrue,
help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
"the graphics driver. Use this only if your agents don't use visual observations.",
)
argparser.add_argument(
"--debug",
default=False,
action=DetectDefaultStoreTrue,
help="Whether to enable debug-level logging for some parts of the code",
)
argparser.add_argument(
"--env-args",
default=None,
nargs=argparse.REMAINDER,
help="Arguments passed to the Unity executable. Be aware that the standalone build will also "
"process these as Unity Command Line Arguments. You should choose different argument names if "
"you want to create environment-specific arguments. All arguments after this flag will be "
"passed to the executable.",
action=DetectDefault,
)
argparser.add_argument(
"--cpu",
default=False,
action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)
argparser.add_argument("--version", action="version", version="")
eng_conf = argparser.add_argument_group(title="Engine Configuration")
eng_conf.add_argument(
"--width",
default=None,
type=int,
help="The width of the executable window of the environment(s) in pixels "
"(ignored for editor training).",
action=DetectDefault,
)
eng_conf.add_argument(
"--height",
default=None,
type=int,
help="The height of the executable window of the environment(s) in pixels "
"(ignored for editor training)",
action=DetectDefault,
)
eng_conf.add_argument(
"--quality-level",
default=5,
type=int,
help="The quality level of the environment(s). Equivalent to calling "
"QualitySettings.SetQualityLevel in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--time-scale",
default=20,
type=float,
help="The time scale of the Unity environment(s). Equivalent to setting "
"Time.timeScale in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--target-frame-rate",
default=-1,
type=int,
help="The target frame rate of the Unity environment(s). Equivalent to setting "
"Application.targetFrameRate in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--capture-frame-rate",
default=60,
type=int,
help="The capture frame rate of the Unity environment(s). Equivalent to setting "
"Time.captureFramerate in Unity.",
action=DetectDefault,
)
return argparser
parser = _create_parser()
class RunOptions(NamedTuple):
behaviors: Dict
debug: bool = parser.get_default("debug")
seed: int = parser.get_default("seed")
env_path: Optional[str] = parser.get_default("env_path")
run_id: str = parser.get_default("run_id")
initialize_from: str = parser.get_default("initialize_from")
load_model: bool = parser.get_default("load_model")
resume: bool = parser.get_default("resume")
force: bool = parser.get_default("force")
train_model: bool = parser.get_default("train_model")
inference: bool = parser.get_default("inference")
save_freq: int = parser.get_default("save_freq")
keep_checkpoints: int = parser.get_default("keep_checkpoints")
base_port: int = parser.get_default("base_port")
num_envs: int = parser.get_default("num_envs")
curriculum_config: Optional[Dict] = None
lesson: int = parser.get_default("lesson")
no_graphics: bool = parser.get_default("no_graphics")
multi_gpu: bool = parser.get_default("multi_gpu")
parameter_randomization: Optional[Dict] = None
env_args: Optional[List[str]] = parser.get_default("env_args")
cpu: bool = parser.get_default("cpu")
width: int = parser.get_default("width")
height: int = parser.get_default("height")
quality_level: int = parser.get_default("quality_level")
time_scale: float = parser.get_default("time_scale")
target_frame_rate: int = parser.get_default("target_frame_rate")
capture_frame_rate: int = parser.get_default("capture_frame_rate")
@staticmethod
def from_argparse(args: argparse.Namespace) -> "RunOptions":
"""
Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files
from file paths, and converts to a CommandLineOptions instance.
:param args: collection of command-line parameters passed to mlagents-learn
:return: CommandLineOptions representing the passed in arguments, with trainer config, curriculum and sampler
configs loaded from files.
"""
argparse_args = vars(args)
run_options_dict = {}
run_options_dict.update(argparse_args)
config_path = StoreConfigFile.trainer_config_path
# Load YAML
yaml_config = load_config(config_path)
# This is the only option that is not optional and has no defaults.
if "behaviors" not in yaml_config:
raise TrainerConfigError(
"Trainer configurations not found. Make sure your YAML file has a section for behaviors."
)
# Use the YAML file values for all values not specified in the CLI.
for key, val in yaml_config.items():
# Detect bad config options
if not hasattr(RunOptions, key):
raise TrainerConfigError(
"The option {} was specified in your YAML file, but is invalid.".format(
key
)
)
if key not in DetectDefault.non_default_args:
run_options_dict[key] = val
# Keep deprecated --load working, TODO: remove
run_options_dict["resume"] = (
run_options_dict["resume"] or run_options_dict["load_model"]
)
return RunOptions(**run_options_dict)
def get_version_string() -> str:
# pylint: disable=no-member
return f""" Version information:

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
checkpoint_settings = options.checkpoint_settings
env_settings = options.env_settings
engine_settings = options.engine_settings
write_path = os.path.join(base_path, options.run_id)
write_path = os.path.join(base_path, checkpoint_settings.run_id)
os.path.join(base_path, options.run_id) if options.initialize_from else None
os.path.join(base_path, checkpoint_settings.run_id)
if checkpoint_settings.initialize_from
else None
port: Optional[int] = options.base_port
port: Optional[int] = env_settings.base_port
write_path, options.resume, options.force, maybe_init_path
write_path,
checkpoint_settings.resume,
checkpoint_settings.force,
maybe_init_path,
)
# Make run logs directory
os.makedirs(run_logs_dir, exist_ok=True)

"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(write_path, clear_past_data=not options.resume)
tb_writer = TensorboardWriter(
write_path, clear_past_data=not checkpoint_settings.resume
)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
if env_settings.env_path is None:
options.env_path,
options.no_graphics,
env_settings.env_path,
engine_settings.no_graphics,
options.env_args,
env_settings.env_args,
width=options.width,
height=options.height,
quality_level=options.quality_level,
time_scale=options.time_scale,
target_frame_rate=options.target_frame_rate,
capture_frame_rate=options.capture_frame_rate,
width=engine_settings.width,
height=engine_settings.height,
quality_level=engine_settings.quality_level,
time_scale=engine_settings.time_scale,
target_frame_rate=engine_settings.target_frame_rate,
capture_frame_rate=engine_settings.capture_frame_rate,
env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
curriculum_config = assemble_curriculum_config(options.behaviors)
env_manager = SubprocessEnvManager(
env_factory, engine_config, env_settings.num_envs
)
curriculum_config, env_manager, options.lesson
options.curriculum, env_manager, checkpoint_settings.lesson
)
sampler_manager, resampling_interval = create_sampler_manager(
options.parameter_randomization, run_seed

options.run_id,
checkpoint_settings.run_id,
options.keep_checkpoints,
not options.inference,
options.resume,
not checkpoint_settings.inference,
checkpoint_settings.resume,
options.multi_gpu,
False,
options.run_id,
options.save_freq,
checkpoint_settings.run_id,
checkpoint_settings.save_freq,
not options.inference,
not checkpoint_settings.inference,
run_seed,
sampler_manager,
resampling_interval,

try:
with open(run_options_path, "w") as f:
try:
yaml.dump(dict(run_options._asdict()), f, sort_keys=False)
yaml.dump(run_options.as_dict(), f, sort_keys=False)
yaml.dump(dict(run_options._asdict()), f)
yaml.dump(run_options.as_dict(), f)
except FileNotFoundError:
logger.warning(
f"Unable to save configuration to {run_options_path}. Make sure the directory exists"

logging_util.set_log_level(log_level)
logger.debug("Configuration for this run:")
logger.debug(json.dumps(options._asdict(), indent=4))
logger.debug(json.dumps(options.as_dict(), indent=4))
if options.load_model:
if options.checkpoint_settings.load_model:
if options.train_model:
if options.checkpoint_settings.train_model:
run_seed = options.seed
if options.cpu:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
run_seed = options.env_settings.seed
# Add some timer metadata
add_timer_metadata("mlagents_version", mlagents.trainers.__version__)

if options.seed == -1:
if options.env_settings.seed == -1:
run_seed = np.random.randint(0, 10000)
run_training(run_seed, options)
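
A hedged sketch of building the new `RunOptions` settings object outside the CLI, mirroring how `run_experiment.py` does it further down (the config path is illustrative):

```python
from mlagents.trainers.cli_utils import load_config
from mlagents.trainers.settings import RunOptions

# Convert a plain YAML dict into the structured, class-based options object.
options = RunOptions.from_dict(load_config("config/ppo/3DBall.yaml"))  # illustrative path
print(options.checkpoint_settings.run_id, options.env_settings.num_envs)
print(list(options.behaviors.keys()))
```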

7
ml-agents/mlagents/trainers/meta_curriculum.py


from typing import Dict, Set
from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.settings import CurriculumSettings
from mlagents_envs.logging_util import get_logger

particular brain in the environment.
"""
def __init__(self, curriculum_configs: Dict[str, Dict]):
def __init__(self, curriculum_configs: Dict[str, CurriculumSettings]):
"""Initializes a MetaCurriculum object.
:param curriculum_folder: Dictionary of brain_name to the

used_reset_parameters: Set[str] = set()
for brain_name, curriculum_config in curriculum_configs.items():
for brain_name, curriculum_settings in curriculum_configs.items():
brain_name, curriculum_config
brain_name, curriculum_settings
)
config_keys: Set[str] = set(
self._brains_to_curricula[brain_name].get_config().keys()

29
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
from mlagents.trainers.settings import TrainerSettings, RewardSignalType
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
self.create_reward_signals(trainer_params["reward_signals"])
self.create_reward_signals(trainer_params.reward_signals)
if "behavioral_cloning" in trainer_params:
BCModule.check_config(trainer_params["behavioral_cloning"])
if trainer_params.behavioral_cloning is not None:
policy_learning_rate=trainer_params["learning_rate"],
default_batch_size=trainer_params["batch_size"],
trainer_params.behavioral_cloning,
policy_learning_rate=trainer_params.hyperparameters.learning_rate,
default_batch_size=trainer_params.hyperparameters.batch_size,
**trainer_params["behavioral_cloning"],
)
def get_trajectory_value_estimates(

return value_estimates
def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
def create_reward_signals(
self, reward_signal_configs: Dict[RewardSignalType, Any]
) -> None:
"""
Create reward signals
:param reward_signal_configs: Reward signal config.

for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self.policy, reward_signal, config
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later
self.reward_signals[reward_signal.value] = create_reward_signal(
self.policy, reward_signal, settings
)
self.update_dict.update(
self.reward_signals[reward_signal.value].update_dict
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
def create_optimizer_op(
self, learning_rate: tf.Tensor, name: str = "Adam"

11
ml-agents/mlagents/trainers/policy/nn_policy.py


from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,

self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
trainer_params: TrainerSettings,
is_training: bool,
load: bool,
tanh_squash: bool = False,

super().__init__(seed, brain, trainer_params, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = trainer_params["num_layers"]
self.h_size = trainer_params["hidden_units"]
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
self.vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs

41
ml-agents/mlagents/trainers/policy/tf_policy.py


from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.brain import BrainParameters
logger = get_logger(__name__)

functions to save/load models and create the input placeholders.
"""
def __init__(self, seed, brain, trainer_parameters, load=False):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_settings: TrainerSettings,
load: bool = False,
):
:param trainer_parameters: The trainer parameters.
:param trainer_settings: The trainer parameters.
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.assign_phs = []
self.assign_ops = []
self.assign_phs: List[tf.Tensor] = []
self.assign_ops: List[tf.Operation] = []
self.inference_dict = {}
self.update_dict = {}
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.brain = brain

self.vis_obs_size = brain.number_visual_observations
self.use_recurrent = trainer_parameters["use_recurrent"]
self.use_recurrent = self.network_settings.memory is not None
self.normalize = trainer_parameters.get("normalize", False)
self.normalize = self.network_settings.normalize
self.model_path = trainer_parameters["output_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.model_path = self.trainer_settings.output_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints
self.saver = None
self.saver: Optional[tf.Operation] = None
if self.use_recurrent:
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "

33
ml-agents/mlagents/trainers/ppo/optimizer.py


from typing import Optional, Any, Dict
from typing import Optional, Any, Dict, cast
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.settings import TrainerSettings, PPOSettings
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.

with policy.graph.as_default():
with tf.variable_scope("optimizer/"):
super().__init__(policy, trainer_params)
hyperparameters: PPOSettings = cast(
PPOSettings, trainer_params.hyperparameters
)
lr = float(hyperparameters.learning_rate)
self._schedule = hyperparameters.learning_rate_schedule
epsilon = float(hyperparameters.epsilon)
beta = float(hyperparameters.beta)
max_step = float(trainer_params.max_steps)
lr = float(trainer_params["learning_rate"])
self._schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "linear")
)
h_size = int(trainer_params["hidden_units"])
epsilon = float(trainer_params["epsilon"])
beta = float(trainer_params["beta"])
max_step = float(trainer_params["max_steps"])
num_layers = int(trainer_params["num_layers"])
vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
policy_network_settings = policy.network_settings
h_size = int(policy_network_settings.hidden_units)
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.stream_names = list(self.reward_signals.keys())

42
ml-agents/mlagents/trainers/ppo/trainer.py


# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
from typing import cast
import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, PPOSettings
logger = get_logger(__name__)

self,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,

Responsible for collecting experiences and training PPO model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with

brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",

"output_path",
"reward_signals",
]
self._check_param_keys()
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
def _check_param_keys(self):
super()._check_param_keys()
# Check that batch size is greater than sequence length. Else, throw
# an exception.
if (
self.trainer_parameters["sequence_length"]
> self.trainer_parameters["batch_size"]
and self.trainer_parameters["use_recurrent"]
):
raise UnityTrainerException(
"batch_size must be greater than or equal to sequence_length when use_recurrent is True."
)
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Takes a trajectory and processes it, putting it into the update buffer.

value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
lambd=self.hyperparameters.lambd,
)
local_return = local_advantage + local_value_estimates
# This is later use as target for the different value estimates

:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.trainer_parameters["buffer_size"]
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""

# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.trainer_parameters["batch_size"]
- self.trainer_parameters["batch_size"] % self.policy.sequence_length
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()

num_epoch = self.trainer_parameters["num_epoch"]
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)

policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.trainer_settings,
self.is_training,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO

if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly
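
A hedged illustration of where the PPO hyperparameters now live; the assumption is that a default `TrainerSettings()` uses the PPO trainer and therefore carries `PPOSettings`:

```python
from mlagents.trainers.settings import TrainerSettings

settings = TrainerSettings()
hp = settings.hyperparameters  # a PPOSettings instance by default (assumption)
# These are the fields the PPO trainer reads above.
print(hp.batch_size, hp.buffer_size, hp.num_epoch, hp.learning_rate)
```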

6
ml-agents/mlagents/trainers/run_experiment.py


import argparse
from typing import Optional, List
from mlagents.trainers.learn import RunOptions, run_cli, load_config
from mlagents.trainers.learn import run_cli
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.cli_utils import load_config
def parse_command_line(argv: Optional[List[str]] = None) -> argparse.Namespace:

"""
args = parse_command_line()
expt_config = load_config(args.experiment_config_path)
run_cli(RunOptions(**expt_config))
run_cli(RunOptions.from_dict(expt_config))
if __name__ == "__main__":

38
ml-agents/mlagents/trainers/sac/optimizer.py


import numpy as np
from typing import Dict, List, Optional, Any, Mapping
from typing import Dict, List, Optional, Any, Mapping, cast
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, SACSettings
EPSILON = 1e-6 # Small value to avoid divide by zero

class SACOptimizer(TFOptimizer):
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.

with policy.graph.as_default():
with tf.variable_scope(""):
super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "constant")
hyperparameters: SACSettings = cast(
SACSettings, trainer_params.hyperparameters
lr = hyperparameters.learning_rate
lr_schedule = hyperparameters.learning_rate_schedule
max_step = trainer_params.max_steps
self.tau = hyperparameters.tau
self.init_entcoef = hyperparameters.init_entcoef
self.act_size = self.policy.act_size
h_size = int(trainer_params["hidden_units"])
max_step = float(trainer_params["max_steps"])
num_layers = int(trainer_params["num_layers"])
vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.tau = trainer_params.get("tau", 0.005)
self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
self.act_size = policy.act_size
policy_network_settings = policy.network_settings
h_size = policy_network_settings.hidden_units
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.tau = hyperparameters.tau
self.burn_in_ratio = 0.0
# Non-exposed SAC parameters
self.discrete_target_entropy_scale = (

self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
_val["gamma"] for _val in trainer_params["reward_signals"].values()
_val.gamma for _val in trainer_params.reward_signals.values()
]
self.use_dones_in_backup = {
name: tf.Variable(1.0) for name in stream_names

100
ml-agents/mlagents/trainers/sac/trainer.py


# and implemented in https://github.com/hill-a/stable-baselines
from collections import defaultdict
from typing import Dict
from typing import Dict, cast
import os
import numpy as np

from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
DEFAULT_STEPS_PER_UPDATE = 1
class SACTrainer(RLTrainer):

self,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,

Responsible for collecting experiences and training SAC model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with

brain_name, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_settings, training, run_id, reward_buff_cap
self.param_keys = [
"batch_size",
"buffer_size",
"buffer_init_steps",
"hidden_units",
"learning_rate",
"init_entcoef",
"max_steps",
"normalize",
"num_layers",
"time_horizon",
"steps_per_update",
"sequence_length",
"summary_freq",
"tau",
"use_recurrent",
"memory_size",
"output_path",
"reward_signals",
]
self._check_param_keys()
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)
self.update_steps = max(1, self.trainer_parameters["buffer_init_steps"])
self.reward_signal_update_steps = max(
1, self.trainer_parameters["buffer_init_steps"]
)
self.update_steps = max(1, self.hyperparameters.buffer_init_steps)
self.reward_signal_update_steps = max(1, self.hyperparameters.buffer_init_steps)
self.steps_per_update = (
trainer_parameters["steps_per_update"]
if "steps_per_update" in trainer_parameters
else DEFAULT_STEPS_PER_UPDATE
)
self.steps_per_update = self.hyperparameters.steps_per_update
trainer_parameters["reward_signals"]["reward_signal_steps_per_update"]
if "reward_signal_steps_per_update" in trainer_parameters["reward_signals"]
else self.steps_per_update
self.hyperparameters.reward_signal_steps_per_update
self.checkpoint_replay_buffer = (
trainer_parameters["save_replay_buffer"]
if "save_replay_buffer" in trainer_parameters
else False
)
def _check_param_keys(self):
super()._check_param_keys()
# Check that batch size is greater than sequence length. Else, throw
# an exception.
if (
self.trainer_parameters["sequence_length"]
> self.trainer_parameters["batch_size"]
and self.trainer_parameters["use_recurrent"]
):
raise UnityTrainerException(
"batch_size must be greater than or equal to sequence_length when use_recurrent is True."
)
self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
def save_model(self, name_behavior_id: str) -> None:
"""

Save the training buffer's update buffer to a pickle file.
"""
filename = os.path.join(
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
self.trainer_settings.output_path, "last_replay_buffer.hdf5"
)
logger.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:

Loads the last saved replay buffer from a file.
"""
filename = os.path.join(
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
self.trainer_settings.output_path, "last_replay_buffer.hdf5"
)
logger.info("Loading Experience Replay Buffer from {}".format(filename))
with open(filename, "rb+") as file_object:

:return: A boolean corresponding to whether or not _update_policy() can be run
"""
return (
self.update_buffer.num_experiences >= self.trainer_parameters["batch_size"]
and self.step >= self.trainer_parameters["buffer_init_steps"]
self.update_buffer.num_experiences >= self.hyperparameters.batch_size
and self.step >= self.hyperparameters.buffer_init_steps
)
@timed

policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.trainer_settings,
self.is_training,
self.load,
tanh_squash=True,

has_updated = False
self.cumulative_returns_since_policy_update.clear()
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
batch_update_stats: Dict[str, list] = defaultdict(list)

if (
self.update_buffer.num_experiences
>= self.trainer_parameters["batch_size"]
):
if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
self.trainer_parameters["batch_size"],
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
# Get rewards for each reward

# Truncate update buffer if necessary. Truncate more than we need to, to avoid truncating
# a large buffer at each update.
if self.update_buffer.num_experiences > self.trainer_parameters["buffer_size"]:
if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
int(self.trainer_parameters["buffer_size"] * BUFFER_TRUNCATE_PERCENT)
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
)
return has_updated

"""
buffer = self.update_buffer
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
batch_update_stats: Dict[str, list] = defaultdict(list)
while (

# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(
self.trainer_parameters["batch_size"],
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
update_stats = self.optimizer.update_reward_signals(

if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.policy = policy
self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

39
ml-agents/mlagents/trainers/tests/test_barracuda_converter.py


import os
import tempfile
import pytest
import yaml
from mlagents.trainers.settings import TrainerSettings
from mlagents.tf_utils import tf
from mlagents.model_serialization import SerializationSettings, export_policy_model

os.remove(tmpfile)
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
def test_policy_conversion(tmpdir, rnn, visual, discrete):
dummy_config["output_path"] = os.path.join(tmpdir, "test")
dummy_config = TrainerSettings(output_path=os.path.join(tmpdir, "test"))
policy = create_policy_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

100
ml-agents/mlagents/trainers/tests/test_bcmodule.py


import mlagents.trainers.tests.mock_brain as mb
import numpy as np
import yaml
def ppo_dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
behavioral_cloning:
demo_path: ./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 1.0
steps: 10000000
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
from mlagents.trainers.settings import (
TrainerSettings,
BehavioralCloningSettings,
NetworkSettings,
)
def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
def create_bc_module(mock_brain, bc_settings, use_rnn, tanhresample):
trainer_config["output_path"] = "testpath"
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["behavioral_cloning"]["demo_path"] = (
os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
trainer_config = TrainerSettings()
trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
policy = NNPolicy(
0, mock_brain, trainer_config, False, False, tanhresample, tanhresample
)

policy_learning_rate=trainer_config["learning_rate"],
default_batch_size=trainer_config["batch_size"],
policy_learning_rate=trainer_config.hyperparameters.learning_rate,
default_batch_size=trainer_config.hyperparameters.batch_size,
**trainer_config["behavioral_cloning"],
settings=bc_settings,
)
policy.initialize_or_load() # Normally the optimizer calls this after the BCModule is created
return bc_module

def test_bcmodule_defaults():
# See if default values match
mock_brain = mb.create_mock_3dball_brain()
trainer_config = ppo_dummy_config()
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
)
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
assert bc_module.batch_size == trainer_config["batch_size"]
assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
trainer_config["behavioral_cloning"]["num_epoch"] = 100
trainer_config["behavioral_cloning"]["batch_size"] = 10000
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
num_epoch=100,
batch_size=10000,
)
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
assert bc_module.num_epoch == 100
assert bc_module.batch_size == 10000

def test_bcmodule_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), False, "test.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
trainer_config = ppo_dummy_config()
trainer_config["behavioral_cloning"]["steps"] = 0
bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", is_sac)
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
steps=0,
)
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), True, "test.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), False, "testdcvis.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
bc_module = create_bc_module(
mock_brain, ppo_dummy_config(), True, "testdcvis.demo", is_sac
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
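
The tests above now construct a BehavioralCloningSettings object instead of patching keys into a nested dict. A small sketch assuming only the fields exercised in this file (demo_path, num_epoch, batch_size, steps); the demo path is a placeholder:

from mlagents.trainers.settings import BehavioralCloningSettings

bc_settings = BehavioralCloningSettings(
    demo_path="Demos/test.demo",  # placeholder path to a recorded demonstration file
    num_epoch=100,                # optional overrides, mirroring the defaults test above
    batch_size=10000,
    steps=0,                      # steps=0, as in the constant-lr test above
)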

94
ml-agents/mlagents/trainers/tests/test_curriculum.py


import io
import json
from unittest.mock import patch, mock_open
from mlagents.trainers.exception import CurriculumConfigError, CurriculumLoadingError
from mlagents.trainers.exception import CurriculumConfigError
from mlagents.trainers.settings import CurriculumSettings
dummy_curriculum_json_str = """
{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : true,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20, 15],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}
"""
dummy_curriculum_config = json.loads(dummy_curriculum_json_str)
bad_curriculum_json_str = """
{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,
"signal_smoothing" : false,
"parameters" :
{
"param1" : [0.7, 0.5, 0.3, 0.1],
"param2" : [100, 50, 20],
"param3" : [0.2, 0.3, 0.7, 0.9]
}
}
"""
dummy_curriculum_config = CurriculumSettings(
measure="reward",
thresholds=[10, 20, 50],
min_lesson_length=3,
signal_smoothing=True,
parameters={
"param1": [0.7, 0.5, 0.3, 0.1],
"param2": [100, 50, 20, 15],
"param3": [0.2, 0.3, 0.7, 0.9],
},
)
dummy_curriculum_config_path = "TestBrain.json"
bad_curriculum_config = CurriculumSettings(
measure="reward",
thresholds=[10, 20, 50],
min_lesson_length=3,
signal_smoothing=False,
parameters={
"param1": [0.7, 0.5, 0.3, 0.1],
"param2": [100, 50, 20],
"param3": [0.2, 0.3, 0.7, 0.9],
},
)
@pytest.fixture

assert curriculum.brain_name == "TestBrain"
assert curriculum.lesson_num == 0
assert curriculum.measure == "reward"
@patch("builtins.open", new_callable=mock_open, read_data=bad_curriculum_json_str)
def test_load_bad_curriculum_file_raises_error(mock_file):
with pytest.raises(CurriculumConfigError):
Curriculum(
"TestBrain", Curriculum.load_curriculum_file(dummy_curriculum_config_path)
)
def test_increment_lesson():

assert curriculum.get_config(0) == {"param1": 0.7, "param2": 100, "param3": 0.2}
# Test json loading and error handling. These examples don't need to be valid config files.
def test_curriculum_load_good():
expected = {"x": 1}
value = json.dumps(expected)
fp = io.StringIO(value)
assert expected == Curriculum._load_curriculum(fp)
def test_curriculum_load_missing_file():
with pytest.raises(CurriculumLoadingError):
Curriculum.load_curriculum_file("notAValidFile.json")
def test_curriculum_load_invalid_json():
# This isn't valid json because of the trailing comma
contents = """
{
"x": [1, 2, 3,]
}
"""
fp = io.StringIO(contents)
with pytest.raises(CurriculumLoadingError):
Curriculum._load_curriculum(fp)
def test_load_bad_curriculum_file_raises_error():
with pytest.raises(CurriculumConfigError):
Curriculum("TestBrain", bad_curriculum_config)
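
Curricula are now described by a CurriculumSettings object rather than a raw JSON dict, as the replacement of dummy_curriculum_config above shows. A minimal sketch; the Curriculum import path follows the repository layout, and the values mirror the fixture above:

from mlagents.trainers.curriculum import Curriculum
from mlagents.trainers.settings import CurriculumSettings

curriculum_settings = CurriculumSettings(
    measure="reward",
    thresholds=[10, 20, 50],          # three thresholds -> four lesson values per parameter
    min_lesson_length=3,
    signal_smoothing=True,
    parameters={"param1": [0.7, 0.5, 0.3, 0.1]},
)
curriculum = Curriculum("TestBrain", curriculum_settings)
assert curriculum.lesson_num == 0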

36
ml-agents/mlagents/trainers/tests/test_distributions.py


from mlagents.tf_utils import tf
import yaml
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
VECTOR_ACTION_SPACE = [2]

40
ml-agents/mlagents/trainers/tests/test_ghost.py


import numpy as np
import yaml
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 5
play_against_current_self_ratio: 0.5
save_steps: 1000
swap_steps: 1000
"""
)
return TrainerSettings(self_play=SelfPlaySettings())
VECTOR_ACTION_SPACE = [1]

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(
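
The ghost-trainer test now only needs a TrainerSettings with a self_play section to mark a behavior as self-play. A sketch using the SelfPlaySettings fields that appear in this diff; the step values are illustrative:

from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings

ghost_config = TrainerSettings(
    self_play=SelfPlaySettings(save_steps=1000, swap_steps=1000)
)
# dummy_config() above returns TrainerSettings(self_play=SelfPlaySettings()),
# i.e. the default self-play values; any unset field keeps its default.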

133
ml-agents/mlagents/trainers/tests/test_learn.py


from unittest.mock import MagicMock, patch, mock_open
from mlagents.trainers import learn
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.learn import parse_command_line, DetectDefault
from mlagents.trainers.learn import parse_command_line
from mlagents.trainers.cli_utils import DetectDefault
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.stats import StatsReporter

MOCK_PARAMETER_YAML = """
behaviors:
{}
env_path: "./oldenvfile"
keep_checkpoints: 34
lesson: 2
run_id: uselessrun
save_freq: 654321
seed: 9870
base_port: 4001
num_envs: 4
env_settings:
env_path: "./oldenvfile"
num_envs: 4
base_port: 4001
seed: 9870
checkpoint_settings:
lesson: 2
run_id: uselessrun
save_freq: 654321
behaviors:
parameter_randomization:
sampler1: foo
curriculum:
curriculum:
curriculum1
parameters:
foo: [0.2, 0.5]
curriculum:
curriculum2
parameter_randomization:
sampler1
parameters:
foo: [0.2, 0.5]
"""

@patch("mlagents.trainers.learn.SamplerManager")
@patch("mlagents.trainers.learn.SubprocessEnvManager")
@patch("mlagents.trainers.learn.create_environment_factory")
@patch("mlagents.trainers.learn.load_config")
@patch("mlagents.trainers.settings.load_config")
def test_run_training(
load_config,
create_environment_factory,

@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
def test_commandline_args(mock_file):
# No args raises
with pytest.raises(SystemExit):
parse_command_line([])
# with pytest.raises(SystemExit):
# parse_command_line([])
assert opt.env_path is None
assert opt.env_settings.env_path is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.resume is False
assert opt.inference is False
assert opt.run_id == "ppo"
assert opt.save_freq == 50000
assert opt.seed == -1
assert opt.base_port == 5005
assert opt.num_envs == 1
assert opt.no_graphics is False
assert opt.checkpoint_settings.lesson == 0
assert opt.checkpoint_settings.resume is False
assert opt.checkpoint_settings.inference is False
assert opt.checkpoint_settings.run_id == "ppo"
assert opt.checkpoint_settings.save_freq == 50000
assert opt.env_settings.seed == -1
assert opt.env_settings.base_port == 5005
assert opt.env_settings.num_envs == 1
assert opt.engine_settings.no_graphics is False
assert opt.env_args is None
assert opt.env_settings.env_args is None
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",

opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.env_settings.env_path == "./myenvfile"
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.checkpoint_settings.lesson == 3
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2
assert opt.engine_settings.no_graphics is True
assert opt.inference is True
assert opt.resume is True
assert opt.checkpoint_settings.inference is True
assert opt.checkpoint_settings.resume is True
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_PARAMETER_YAML)

opt = parse_command_line(["mytrainerpath"])
assert opt.behaviors == {}
assert opt.env_path == "./oldenvfile"
assert opt.env_settings.env_path == "./oldenvfile"
assert opt.keep_checkpoints == 34
assert opt.lesson == 2
assert opt.run_id == "uselessrun"
assert opt.save_freq == 654321
assert opt.seed == 9870
assert opt.base_port == 4001
assert opt.num_envs == 4
assert opt.no_graphics is False
assert opt.checkpoint_settings.lesson == 2
assert opt.checkpoint_settings.run_id == "uselessrun"
assert opt.checkpoint_settings.save_freq == 654321
assert opt.env_settings.seed == 9870
assert opt.env_settings.base_port == 4001
assert opt.env_settings.num_envs == 4
assert opt.engine_settings.no_graphics is False
assert opt.env_args is None
assert opt.env_settings.env_args is None
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",

opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.env_settings.env_path == "./myenvfile"
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.checkpoint_settings.lesson == 3
assert opt.checkpoint_settings.run_id == "myawesomerun"
assert opt.checkpoint_settings.save_freq == 123456
assert opt.env_settings.seed == 7890
assert opt.env_settings.base_port == 4004
assert opt.env_settings.num_envs == 2
assert opt.engine_settings.no_graphics is True
assert opt.inference is True
assert opt.resume is True
assert opt.checkpoint_settings.inference is True
assert opt.checkpoint_settings.resume is True
assert opt.parameter_randomization == "sampler1"
assert opt.parameter_randomization == {"sampler1": "foo"}
assert len(opt.curriculum.keys()) == 2
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)

]
opt = parse_command_line(full_args)
assert opt.env_args == ["--foo=bar", "--blah", "baz", "100"]
assert opt.env_settings.env_args == ["--foo=bar", "--blah", "baz", "100"]
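
The flat command-line/YAML options asserted on the left of this diff are now grouped into sections that mirror the new settings objects (env_settings, checkpoint_settings, and so on). A sketch of the nested layout used by MOCK_PARAMETER_YAML above, loaded the same way the tests load it; values are the same placeholders:

import yaml

run_options_yaml = yaml.safe_load(
    """
    behaviors: {}
    env_settings:
      env_path: "./oldenvfile"
      num_envs: 4
      base_port: 4001
      seed: 9870
    checkpoint_settings:
      lesson: 2
      run_id: uselessrun
      save_freq: 654321
    """
)
assert run_options_yaml["env_settings"]["base_port"] == 4001
assert run_options_yaml["checkpoint_settings"]["run_id"] == "uselessrun"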

55
ml-agents/mlagents/trainers/tests/test_meta_curriculum.py


from unittest.mock import patch, Mock
from mlagents.trainers.meta_curriculum import MetaCurriculum
import json
import yaml
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str
from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
BRAIN_NAME,
PPO_CONFIG,
)
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_config
from mlagents.trainers.settings import CurriculumSettings
@pytest.fixture

def test_curriculum_config(param_name="test_param1", min_lesson_length=100):
return {
"measure": "progress",
"thresholds": [0.1, 0.3, 0.5],
"min_lesson_length": min_lesson_length,
"signal_smoothing": True,
"parameters": {f"{param_name}": [0.0, 4.0, 6.0, 8.0]},
}
return CurriculumSettings(
thresholds=[0.1, 0.3, 0.5],
min_lesson_length=min_lesson_length,
parameters={f"{param_name}": [0.0, 4.0, 6.0, 8.0]},
)
test_meta_curriculum_config = {

assert meta_curriculum.get_config() == {"test_param1": 0.0, "test_param2": 0.0}
TRAINER_CONFIG = """
default:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 5.0e-3
max_steps: 100
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 50
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
curriculum_config = json.loads(dummy_curriculum_json_str)
mc = MetaCurriculum({curriculum_brain_name: curriculum_config})
trainer_config = yaml.safe_load(TRAINER_CONFIG)
mc = MetaCurriculum({curriculum_brain_name: dummy_curriculum_config})
env, trainer_config, meta_curriculum=mc, success_threshold=None
env, {BRAIN_NAME: PPO_CONFIG}, meta_curriculum=mc, success_threshold=None
)

72
ml-agents/mlagents/trainers/tests/test_nn_policy.py


import pytest
import os
from typing import Dict, Any
import yaml
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, ModelUtils

from mlagents.trainers.settings import TrainerSettings, NetworkSettings
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
VECTOR_ACTION_SPACE = [2]

def create_policy_mock(
dummy_config: Dict[str, Any],
dummy_config: TrainerSettings,
use_rnn: bool = False,
use_discrete: bool = True,
use_visual: bool = False,

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(seed, mock_brain, trainer_parameters, False, load)
trainer_settings = dummy_config
trainer_settings.keep_checkpoints = 3
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(seed, mock_brain, trainer_settings, False, load)
def test_load_save(dummy_config, tmp_path):
def test_load_save(tmp_path):
trainer_params = dummy_config
trainer_params["output_path"] = path1
trainer_params = TrainerSettings(output_path=path1)
policy = create_policy_mock(trainer_params)
policy.initialize_or_load()
policy._set_step(2000)

assert policy2.get_current_step() == 2000
# Try initialize from path 1
trainer_params["model_path"] = path2
trainer_params["init_path"] = path1
trainer_params.output_path = path2
trainer_params.init_path = path1
policy3 = create_policy_mock(trainer_params, load=False, seed=2)
policy3.initialize_or_load()

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_policy_evaluate(dummy_config, rnn, visual, discrete):
def test_policy_evaluate(rnn, visual, discrete):
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
decision_step, terminal_step = mb.create_steps_from_brainparams(
policy.brain, num_agents=NUM_AGENTS

assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
def test_normalization(dummy_config):
def test_normalization():
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,

vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
time_horizon = 6
trajectory = make_fake_trajectory(

# Change half of the obs to 0
for i in range(3):
trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
policy = NNPolicy(0, brain_params, dummy_config, False, False)
policy = NNPolicy(
0,
brain_params,
TrainerSettings(network_settings=NetworkSettings(normalize=True)),
False,
False,
)
trajectory_buffer = trajectory.to_agentbuffer()
policy.update_normalization(trajectory_buffer["vector_obs"])
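
In test_nn_policy the old use_recurrent / normalize flags are replaced by a NetworkSettings object, and recurrence is switched on by attaching a MemorySettings instance. A minimal sketch using only what the hunks above show:

from mlagents.trainers.settings import TrainerSettings, NetworkSettings

settings = TrainerSettings(
    network_settings=NetworkSettings(normalize=True)  # replaces "normalize: true"
)
# Recurrence: instead of use_recurrent/memory_size/sequence_length keys,
# attach (or leave as None to disable) a MemorySettings object.
settings.network_settings.memory = NetworkSettings.MemorySettings(
    memory_size=10, sequence_length=16
)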

11
ml-agents/mlagents/trainers/tests/test_policy.py


from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.action_info import ActionInfo
from unittest.mock import MagicMock
from mlagents.trainers.settings import TrainerSettings
import numpy as np

return mock_brain
def basic_params():
return {"use_recurrent": False, "output_path": "my/path"}
class FakePolicy(TFPolicy):
def create_tf_graph(self):
pass

def test_take_action_returns_empty_with_no_agents():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
# Doesn't really matter what this is
dummy_groupspec = BehaviorSpec([(1,)], "continuous", 1)
no_agent_step = DecisionSteps.empty(dummy_groupspec)

def test_take_action_returns_nones_on_missing_values():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
step_with_agents = DecisionSteps(

def test_take_action_returns_action_info_when_available():
test_seed = 3
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), TrainerSettings())
policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),

99
ml-agents/mlagents/trainers/tests/test_ppo.py


import numpy as np
from mlagents.tf_utils import tf
import yaml
import copy
import attr
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import NetworkSettings, TrainerSettings, PPOSettings
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,

@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 16
summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 10
curiosity_strength: 0.0
curiosity_enc_size: 1
output_path: test
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(PPO_CONFIG)
VECTOR_ACTION_SPACE = [2]

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = attr.evolve(dummy_config)
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811
dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811
dummy_config["reward_signals"].update(curiosity_dummy_config)
dummy_config.reward_signals = curiosity_dummy_config
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(gail_dummy_config)
dummy_config.reward_signals = gail_dummy_config
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)

@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_trainer_increment_step(ppo_optimizer, dummy_config):
trainer_params = dummy_config
def test_trainer_increment_step(ppo_optimizer):
trainer_params = PPO_CONFIG
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer

policy_mock = mock.Mock(spec=NNPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (
5
) # 10 hacked because this function is no longer called through trainer
5 # 10 hacked because this function is no longer called through trainer
)
policy_mock.increment_step = mock.Mock(return_value=step_count)
trainer.add_policy("testbehavior", policy_mock)

@pytest.mark.parametrize("use_discrete", [True, False])
def test_trainer_update_policy(dummy_config, use_discrete):
def test_trainer_update_policy(
dummy_config, curiosity_dummy_config, use_discrete # noqa: F811
):
mock_brain = mb.setup_mock_brain(
use_discrete,
False,

)
trainer_params = dummy_config
trainer_params["use_recurrent"] = True
trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
memory_size=10, sequence_length=16
)
trainer_params["reward_signals"]["curiosity"] = {}
trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer_params.reward_signals = curiosity_dummy_config
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

trainer.add_policy(brain_params, policy)
def test_bad_config(dummy_config):
# TODO: Move this to test_settings.py
def test_bad_config():
dummy_config["sequence_length"] = 64
dummy_config["batch_size"] = 32
dummy_config["use_recurrent"] = True
with pytest.raises(UnityTrainerException):
with pytest.raises(TrainerConfigError):
TrainerSettings(
network_settings=NetworkSettings(
memory=NetworkSettings.MemorySettings(sequence_length=64)
),
hyperparameters=PPOSettings(batch_size=32),
)
_ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
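
Because the configs are now attrs classes, the PPO tests derive per-case variants with attr.evolve instead of mutating a shared dict. A sketch of that pattern with an illustrative base config (the real tests start from PPO_CONFIG in test_simple_rl):

import attr
from mlagents.trainers.settings import TrainerSettings, PPOSettings

base_config = TrainerSettings(hyperparameters=PPOSettings(batch_size=32))
# attr.evolve returns a copy with the named fields replaced, leaving the base untouched.
new_hyperparams = attr.evolve(base_config.hyperparameters, learning_rate=1.0e-3)
config = attr.evolve(base_config, hyperparameters=new_hyperparams, max_steps=500)
assert config.hyperparameters.learning_rate == 1.0e-3 and config.max_steps == 500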

141
ml-agents/mlagents/trainers/tests/test_reward_signals.py


import pytest
import yaml
import copy
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
from mlagents.trainers.settings import (
GAILSettings,
CuriositySettings,
RewardSignalSettings,
BehavioralCloningSettings,
NetworkSettings,
TrainerType,
RewardSignalType,
)
CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"

return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(PPO_CONFIG)
return yaml.safe_load(
"""
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 3.0e-4
max_steps: 5.0e4
memory_size: 256
normalize: false
steps_per_update: 1
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(SAC_CONFIG)
return {
"gail": {
"strength": 0.1,
"gamma": 0.9,
"encoding_size": 128,
"use_vail": True,
"demo_path": CONTINUOUS_PATH,
}
}
return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)}
return {"curiosity": {"strength": 0.1, "gamma": 0.9, "encoding_size": 128}}
return {RewardSignalType.CURIOSITY: CuriositySettings()}
@pytest.fixture
def extrinsic_dummy_config():
return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
VECTOR_ACTION_SPACE = [2]

vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = trainer_config
model_path = "testpath"
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = trainer_config
trainer_settings.reward_signals = reward_signal_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
if trainer_parameters["trainer"] == "sac":
optimizer = SACOptimizer(policy, trainer_parameters)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": CONTINUOUS_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
trainer_config.behavioral_cloning = BehavioralCloningSettings(
demo_path=CONTINUOUS_PATH
)
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config, False, False, False

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config):
gail_dummy_config["gail"]["demo_path"] = DISCRETE_PATH
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": DISCRETE_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
)
gail_dummy_config_discrete = {
RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH)
}
trainer_config, gail_dummy_config, False, True, True
trainer_config, gail_dummy_config_discrete, False, True, True
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")

"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config):
trainer_config.update(
{
"behavioral_cloning": {
"demo_path": CONTINUOUS_PATH,
"strength": 1.0,
"steps": 10000000,
}
}
)
policy = create_optimizer_mock(
trainer_config, gail_dummy_config, True, False, False
)

@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, curiosity_dummy_config):
def test_extrinsic(trainer_config, extrinsic_dummy_config):
trainer_config, curiosity_dummy_config, False, False, False
trainer_config, extrinsic_dummy_config, False, False, False
)
reward_signal_eval(policy, "extrinsic")
reward_signal_update(policy, "extrinsic")
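
Reward signal configuration is now a mapping from the RewardSignalType enum to a settings object, replacing the nested string-keyed dicts on the left of this hunk. A sketch using the classes imported at the top of this file; the demo path is a placeholder:

from mlagents.trainers.settings import (
    GAILSettings,
    CuriositySettings,
    RewardSignalSettings,
    RewardSignalType,
)

reward_signals = {
    RewardSignalType.EXTRINSIC: RewardSignalSettings(),           # strength/gamma defaults
    RewardSignalType.CURIOSITY: CuriositySettings(),
    RewardSignalType.GAIL: GAILSettings(demo_path="Demos/test.demo"),  # placeholder demo file
}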

18
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


import yaml
from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb

def dummy_config():
return yaml.safe_load(
"""
output_path: "test/"
summary_freq: 1000
max_steps: 100
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
from mlagents.trainers.settings import TrainerSettings
def create_mock_brain():

def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = FakeTrainer(mock_brainparams, dummy_config(), True, 0)
trainer = FakeTrainer(mock_brainparams, TrainerSettings(max_steps=100), True, 0)
trainer.set_is_policy_updating(True)
return trainer

83
ml-agents/mlagents/trainers/tests/test_sac.py


import pytest
from unittest import mock
import yaml
import copy
from mlagents.tf_utils import tf

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.tests.test_simple_rl import SAC_CONFIG
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
)
return yaml.safe_load(
"""
trainer: sac
batch_size: 8
buffer_size: 10240
buffer_init_steps: 0
hidden_units: 32
init_entcoef: 0.1
learning_rate: 3.0e-4
max_steps: 1024
memory_size: 10
normalize: true
steps_per_update: 1
num_layers: 1
time_horizon: 64
sequence_length: 16
summary_freq: 1000
tau: 0.005
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
return copy.deepcopy(SAC_CONFIG)
VECTOR_ACTION_SPACE = [2]

vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
trainer_settings = dummy_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
0, mock_brain, trainer_settings, False, False, create_tf_graph=False
optimizer = SACOptimizer(policy, trainer_parameters)
optimizer = SACOptimizer(policy, trainer_settings)
return optimizer

@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
def test_sac_update_reward_signals(dummy_config, discrete):
def test_sac_update_reward_signals(
dummy_config, curiosity_dummy_config, discrete # noqa: F811
):
dummy_config["reward_signals"]["curiosity"] = {}
dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
dummy_config.reward_signals = curiosity_dummy_config
optimizer = create_sac_optimizer_mock(
dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
)

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_params = dummy_config
trainer_params["output_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer_params.hyperparameters.save_replay_buffer = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)

mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
dummy_config["steps_per_update"] = 20
dummy_config.hyperparameters.steps_per_update = 20
dummy_config.hyperparameters.buffer_init_steps = 0
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

trainer.advance()
with pytest.raises(AgentManagerQueue.Empty):
policy_queue.get_nowait()
def test_bad_config(dummy_config):
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
# Test that we throw an error if we have sequence length greater than batch size
dummy_config["sequence_length"] = 64
dummy_config["batch_size"] = 32
dummy_config["use_recurrent"] = True
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
with pytest.raises(UnityTrainerException):
_ = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
if __name__ == "__main__":
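
The SAC tests flip individual hyperparameters on the settings object instead of writing dict keys. A sketch of an SAC-flavored TrainerSettings using only fields exercised above (steps_per_update, buffer_init_steps, save_replay_buffer); the values are illustrative:

from mlagents.trainers.settings import TrainerSettings, TrainerType, SACSettings

sac_config = TrainerSettings(
    trainer_type=TrainerType.SAC,
    hyperparameters=SACSettings(
        steps_per_update=20,
        buffer_init_steps=0,
        save_replay_buffer=True,
    ),
)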

388
ml-agents/mlagents/trainers/tests/test_simple_rl.py


import math
import tempfile
import pytest
import yaml
from typing import Dict, Any
import attr
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,

from mlagents.trainers.sampler_class import SamplerManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
SACSettings,
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
TrainerType,
RewardSignalType,
)
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)

BRAIN_NAME = "1D"
PPO_CONFIG = f"""
{BRAIN_NAME}:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 32
lambd: 0.95
learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 3000
memory_size: 16
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 500
use_recurrent: false
threaded: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
SAC_CONFIG = f"""
{BRAIN_NAME}:
trainer: sac
batch_size: 8
buffer_size: 5000
buffer_init_steps: 100
hidden_units: 16
init_entcoef: 0.01
learning_rate: 5.0e-3
max_steps: 1000
memory_size: 16
normalize: false
steps_per_update: 1
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 100
tau: 0.01
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple
threaded: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
def generate_config(
config: str, override_vals: Dict[str, Any] = None
) -> Dict[str, Any]:
trainer_config = yaml.safe_load(config)
if override_vals is not None:
trainer_config[BRAIN_NAME].update(override_vals)
return trainer_config
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
# The reward processor is passed as an argument to _check_environment_trains.

StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
# Make sure threading is turned off for determinism
trainer_config["threading"] = False
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(

keep_checkpoints=1,
train_model=True,
load_model=False,
seed=seed,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

num_vector=0,
step_size=0.2,
)
override_vals = {"learning_rate": 3.0e-4}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"max_steps": 500,
"summary_freq": 100,
}
config = generate_config(PPO_CONFIG, override_vals)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=500,
summary_freq=100,
)
_check_environment_trains(env, config, success_threshold=0.5)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
override_vals = {
"max_steps": 5000,
"batch_size": 64,
"buffer_size": 128,
"learning_rate": 1e-3,
"use_recurrent": True,
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
new_network_settings = attr.evolve(
PPO_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
config = generate_config(SAC_CONFIG)
_check_environment_trains(env, config)
config = attr.evolve(SAC_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
override_vals = {"buffer_init_steps": 2000, "max_steps": 10000}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.8)
new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])

num_vector=0,
step_size=0.2,
)
override_vals = {"batch_size": 16, "learning_rate": 3e-4}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"batch_size": 16,
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"buffer_init_steps": 0,
"max_steps": 100,
}
config = generate_config(SAC_CONFIG, override_vals)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=16,
learning_rate=3e-4,
buffer_init_steps=0,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
)
_check_environment_trains(env, config, success_threshold=0.5)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
override_vals = {
"batch_size": 64,
"use_recurrent": True,
"max_steps": 5000,
"learning_rate": 1e-3,
"buffer_init_steps": 500,
"steps_per_update": 2,
}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=32),
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=64,
learning_rate=1e-3,
buffer_init_steps=500,
steps_per_update=2,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=None)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]

env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
override_vals = {
"max_steps": 4000,
"self_play": {
"play_against_latest_model_ratio": 1.0,
"save_steps": 10000,
"swap_steps": 10000,
"team_change": 4000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,
save_steps=10000,
swap_steps=10000,
team_change=400,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_latest_model_ratio": 0.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config, success_threshold=None)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=0.0,
save_steps=5000,
swap_steps=5000,
team_change=2000,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2000)
_check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]

def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
override_vals = {
"max_steps": 500,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(trainer_config, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
config = attr.evolve(
trainer_config,
reward_signals=reward_signals,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

use_discrete=use_discrete,
step_size=0.2,
)
override_vals = {
"max_steps": 1000,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4)
config = attr.evolve(
PPO_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=1000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

use_discrete=use_discrete,
step_size=0.2,
)
override_vals = {
"max_steps": 500,
"batch_size": 16,
"learning_rate": 3.0e-4,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"reward_signals": {
"gail": {
"strength": 1.0,
"gamma": 0.99,
"encoding_size": 32,
"demo_path": demo_path,
}
},
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=0.9)
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
)
config = attr.evolve(
SAC_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
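
The visual-encoder choice also moved from a "vis_encode_type" string to an EncoderType enum on NetworkSettings, as the attr.evolve calls above show. A sketch, assuming "simple" is a valid EncoderType value as the old YAML configs suggest:

import attr
from mlagents.trainers.models import EncoderType
from mlagents.trainers.settings import NetworkSettings

network_settings = NetworkSettings(num_layers=1, hidden_units=16)
new_networksettings = attr.evolve(
    network_settings, vis_encode_type=EncoderType("simple")
)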

4
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
PPO_CONFIG,
generate_config,
DebugWriter,
)

env_manager = SubprocessEnvManager(
simple_env_factory, EngineConfig.default_config(), num_envs
)
trainer_config = generate_config(PPO_CONFIG, override_vals={"max_steps": 5000})
trainer_config,
{"1D": PPO_CONFIG},
env_manager=env_manager,
success_threshold=None,
)

274
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import pytest
import yaml
from mlagents.trainers.trainer_util import (
load_config,
_load_config,
assemble_curriculum_config,
)
from mlagents.trainers.cli_utils import load_config, _load_config
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
return yaml.safe_load(
"""
default:
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
use_curiosity: false
curiosity_strength: 0.0
curiosity_enc_size: 1
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
@pytest.fixture
def dummy_config_with_override(dummy_config):
base = dummy_config
base["testbrain"] = {}
base["testbrain"]["normalize"] = False
return base
@pytest.fixture
def dummy_bad_config():
return yaml.safe_load(
"""
default:
trainer: incorrect_trainer
brain_to_imitate: ExpertBrain
batches_per_epoch: 16
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
"""
)
@patch("mlagents.trainers.brain.BrainParameters")
def test_initialize_trainer_parameters_override_defaults(
BrainParametersMock, dummy_config_with_override
):
run_id = "testrun"
output_path = "model_dir"
keep_checkpoints = 1
train_model = True
load_model = False
seed = 11
expected_reward_buff_cap = 1
base_config = dummy_config_with_override
expected_config = base_config["default"]
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
# Override value from specific brain config
expected_config["normalize"] = False
brain_params_mock = BrainParametersMock()
BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": brain_params_mock}
def mock_constructor(
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
):
assert brain == brain_params_mock.brain_name
assert trainer_parameters == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model
assert load == load_model
assert seed == seed
assert run_id == run_id
with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,
)
trainers = {}
for _, brain_parameters in external_brains.items():
trainers["testbrain"] = trainer_factory.generate(
brain_parameters.brain_name
)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], PPOTrainer)
return RunOptions(behaviors={"testbrain": PPO_CONFIG})
@patch("mlagents.trainers.brain.BrainParameters")

external_brains = {"testbrain": BrainParametersMock()}
run_id = "testrun"
output_path = "results_dir"
keep_checkpoints = 1
base_config = dummy_config
expected_config = base_config["default"]
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
base_config = dummy_config.behaviors
expected_config = PPO_CONFIG
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
self, brain, reward_buff_cap, trainer_settings, training, load, seed, run_id
assert trainer_parameters == expected_config
assert trainer_settings == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model
assert load == load_model

trainer_config=base_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,

@patch("mlagents.trainers.brain.BrainParameters")
def test_initialize_invalid_trainer_raises_exception(
BrainParametersMock, dummy_bad_config
):
run_id = "testrun"
output_path = "results_dir"
keep_checkpoints = 1
train_model = True
load_model = False
seed = 11
bad_config = dummy_bad_config
BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": BrainParametersMock()}
with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,
)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
# Test no trainer specified
del bad_config["default"]["trainer"]
with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,
)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
# Test BC trainer specified
bad_config["default"]["trainer"] = "offline_bc"
with pytest.raises(UnityTrainerException):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
run_id=run_id,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,
seed=seed,
)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
def test_handles_no_default_section(dummy_config):
def test_handles_no_config_provided(BrainParametersMock):
Make sure the trainer setup handles a missing "default" in the config.
Make sure the trainer setup handles no configs provided at all.
no_default_config = {brain_name: dummy_config["default"]}
no_default_config = RunOptions().behaviors
brain_parameters = BrainParameters(
brain_name=brain_name,
vector_observation_space_size=1,

trainer_config=no_default_config,
run_id="testrun",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,
seed=42,

def test_raise_if_no_config_for_brain(dummy_config):
"""
Make sure the trainer setup raises a friendlier exception if both "default" and the brain name
are missing from the config.
"""
brain_name = "testbrain"
bad_config = {"some_other_brain": dummy_config["default"]}
brain_parameters = BrainParameters(
brain_name=brain_name,
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
)
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
run_id="testrun",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,
seed=42,
)
with pytest.raises(TrainerConfigError):
trainer_factory.generate(brain_parameters)
def test_load_config_missing_file():
with pytest.raises(TrainerConfigError):
load_config("thisFileDefinitelyDoesNotExist.yaml")

with pytest.raises(TrainerConfigError):
fp = io.StringIO(file_contents)
_load_config(fp)
def test_assemble_curriculum_config():
file_contents = """
behavior1:
curriculum:
foo: 5
behavior2:
curriculum:
foo: 6
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {"behavior1": {"foo": 5}, "behavior2": {"foo": 6}}
# Check that nothing is returned if no curriculum.
file_contents = """
behavior1:
foo: 3
behavior2:
foo: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
# Check that method doesn't break if 1st level entity isn't a dict.
# Note: this is a malformed configuration.
file_contents = """
behavior1: 3
behavior2: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
def test_existing_directories(tmp_path):

10
ml-agents/mlagents/trainers/trainer/rl_trainer.py


from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult
from mlagents_envs.timers import hierarchical_timer
from mlagents.trainers.agent_processor import AgentManagerQueue

def __init__(self, *args, **kwargs):
super(RLTrainer, self).__init__(*args, **kwargs)
# Make sure we have at least one reward_signal
if not self.trainer_parameters["reward_signals"]:
raise UnityTrainerException(
"No reward signals were defined. At least one must be used with {}.".format(
self.__class__.__name__
)
)
# collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
# used for reporting only. We always want to report the environment reward to Tensorboard, regardless
# of what reward signals are actually present.

}
self.update_buffer: AgentBuffer = AgentBuffer()
self._stats_reporter.add_property(
StatsPropertyType.HYPERPARAMETERS, self.trainer_parameters
StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
)
def end_episode(self) -> None:

28
ml-agents/mlagents/trainers/trainer/trainer.py


# # Unity ML-Agents Toolkit
from typing import Dict, List, Deque, Any
from typing import List, Deque
import abc
from collections import deque

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings
logger = get_logger(__name__)

def __init__(
self,
brain_name: str,
trainer_parameters: dict,
trainer_settings: TrainerSettings,
training: bool,
run_id: str,
reward_buff_cap: int = 1,

:BrainParameters brain: Brain to be trained.
:dict trainer_parameters: The parameters for the trainer (dictionary).
:TrainerSettings trainer_settings: The settings object for the trainer.
:bool training: Whether the trainer is set for training.
:str run_id: The identifier of the current run
:int reward_buff_cap:

self.run_id = run_id
self.trainer_parameters = trainer_parameters
self._threaded = trainer_parameters.get("threaded", True)
self.trainer_settings = trainer_settings
self._threaded = trainer_settings.threaded
self._stats_reporter = StatsReporter(brain_name)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)

self.summary_freq = self.trainer_parameters["summary_freq"]
self.summary_freq = self.trainer_settings.summary_freq
self.next_summary_step = self.summary_freq
@property

"""
return self._stats_reporter
def _check_param_keys(self):
for k in self.param_keys:
if k not in self.trainer_parameters:
raise UnityTrainerException(
"The hyper-parameter {0} could not be found for the {1} trainer of "
"brain {2}.".format(k, self.__class__, self.brain_name)
)
def parameters(self) -> Dict[str, Any]:
def parameters(self) -> TrainerSettings:
return self.trainer_parameters
return self.trainer_settings
@property
def get_max_steps(self) -> int:

"""
return int(float(self.trainer_parameters["max_steps"]))
return self.trainer_settings.max_steps
@property
def get_step(self) -> int:

8
ml-agents/mlagents/trainers/trainer_controller.py


"""Launches trainers for each External Brains in a Unity Environment."""
import os
import sys
import threading
from typing import Dict, Optional, Set, List
from collections import defaultdict

from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManager
from mlagents.trainers.settings import CurriculumSettings
class TrainerController(object):

# Skip brains that are in the metacurriculum but no trainer yet.
if brain_name not in self.trainers:
continue
if curriculum.measure == "progress":
if curriculum.measure == CurriculumSettings.MeasureType.PROGRESS:
elif curriculum.measure == "reward":
elif curriculum.measure == CurriculumSettings.MeasureType.REWARD:
measure_val = np.mean(self.trainers[brain_name].reward_buffer)
brain_names_to_measure_vals[brain_name] = measure_val
else:

policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
trainer.parameters.time_horizon,
threaded=trainer.threaded,
)
env_manager.set_agent_manager(name_behavior_id, agent_manager)

106
ml-agents/mlagents/trainers/trainer_util.py


import os
import yaml
from typing import Any, Dict, TextIO
from typing import Dict
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.meta_curriculum import MetaCurriculum

from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.settings import TrainerSettings, TrainerType
logger = get_logger(__name__)

def __init__(
self,
trainer_config: Any,
trainer_config: Dict[str, TrainerSettings],
keep_checkpoints: int,
train_model: bool,
load_model: bool,
seed: int,

self.run_id = run_id
self.output_path = output_path
self.init_path = init_path
self.keep_checkpoints = keep_checkpoints
self.train_model = train_model
self.load_model = load_model
self.seed = seed

def generate(self, brain_name: str) -> Trainer:
return initialize_trainer(
self.trainer_config,
self.trainer_config[brain_name],
self.keep_checkpoints,
self.train_model,
self.load_model,
self.ghost_controller,

def initialize_trainer(
trainer_config: Any,
trainer_settings: TrainerSettings,
keep_checkpoints: int,
train_model: bool,
load_model: bool,
ghost_controller: GhostController,

Initializes a trainer given a provided trainer configuration and brain parameters, as well as
some general training session options.
:param trainer_config: Original trainer configuration loaded from YAML
:param trainer_settings: Original trainer configuration loaded from YAML
:param brain_name: Name of the brain to be associated with trainer
:param run_id: Run ID to associate with this training run
:param output_path: Path to save the model and summary statistics

:param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
:return:
"""
if "default" not in trainer_config and brain_name not in trainer_config:
raise TrainerConfigError(
f'Trainer config must have either a "default" section, or a section for the brain name {brain_name}. '
"See the config/ directory for examples."
)
trainer_parameters = trainer_config.get("default", {}).copy()
trainer_parameters["output_path"] = os.path.join(output_path, brain_name)
trainer_settings.output_path = os.path.join(output_path, brain_name)
trainer_parameters["init_path"] = os.path.join(init_path, brain_name)
trainer_parameters["keep_checkpoints"] = keep_checkpoints
if brain_name in trainer_config:
_brain_key: Any = brain_name
while not isinstance(trainer_config[_brain_key], dict):
_brain_key = trainer_config[_brain_key]
trainer_parameters.update(trainer_config[_brain_key])
if init_path is not None:
trainer_parameters["init_path"] = "{basedir}/{name}".format(
basedir=init_path, name=brain_name
)
trainer_settings.init_path = os.path.join(init_path, brain_name)
min_lesson_length = 1
if meta_curriculum:

)
trainer: Trainer = None # type: ignore # will be set to one of these, or raise
if "trainer" not in trainer_parameters:
raise TrainerConfigError(
f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
)
trainer_type = trainer_parameters["trainer"]
trainer_type = trainer_settings.trainer_type
if trainer_type == "offline_bc":
raise UnityTrainerException(
"The offline_bc trainer has been removed. To train with demonstrations, "
"please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
"Behavioral Cloning feature enabled."
)
elif trainer_type == "ppo":
if trainer_type == TrainerType.PPO:
trainer_parameters,
trainer_settings,
elif trainer_type == "sac":
elif trainer_type == TrainerType.SAC:
trainer_parameters,
trainer_settings,
if "self_play" in trainer_parameters:
if trainer_settings.self_play is not None:
trainer_parameters,
trainer_settings,
def load_config(config_path: str) -> Dict[str, Any]:
try:
with open(config_path) as data_file:
return _load_config(data_file)
except IOError:
abs_path = os.path.abspath(config_path)
raise TrainerConfigError(f"Config file could not be found at {abs_path}.")
except UnicodeDecodeError:
raise TrainerConfigError(
f"There was an error decoding Config file from {config_path}. "
f"Make sure your file is save using UTF-8"
)
def _load_config(fp: TextIO) -> Dict[str, Any]:
"""
Load the yaml config from the file-like object.
"""
try:
return yaml.safe_load(fp)
except yaml.parser.ParserError as e:
raise TrainerConfigError(
"Error parsing yaml file. Please check for formatting errors. "
"A tool such as http://www.yamllint.com/ can be helpful with this."
) from e
def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]:
"""
Assembles a curriculum config Dict from a trainer config. The resulting
dictionary is a mapping of {brain_name: config}, where config is another
Dict containing that behavior's curriculum settings.
:param trainer_config: Dict of trainer configurations (keys are brain_names).
:return: Dict of curriculum configurations. Returns empty dict if none are found.
"""
curriculum_config: Dict[str, Any] = {}
for behavior_name, behavior_config in trainer_config.items():
# Don't try to iterate non-Dicts. This probably means your config is malformed.
if isinstance(behavior_config, dict) and "curriculum" in behavior_config:
curriculum_config[behavior_name] = behavior_config["curriculum"]
return curriculum_config
def handle_existing_directories(

2
ml-agents/setup.py


"protobuf>=3.6",
"pyyaml>=3.1.0",
"tensorflow>=1.7,<3.0",
"cattrs>=1.0.0",
"attrs>=19.3.0",
'pypiwin32==223;platform_system=="Windows"',
# We don't actually need six, but tensorflow does, and pip seems
# to get confused and install the wrong version.

8
ml-agents/tests/yamato/training_int_tests.py


# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.
overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
override_config_file("config/ppo/3DBall.yaml", yaml_out, **overrides)
overrides = {
"hyperparameters": {"batch_size": 10, "buffer_size": 10},
"max_steps": 100,
}
override_config_file("config/ppo/3DBall.yaml", yaml_out, overrides)
mla_learn_cmd = (
f"mlagents-learn {yaml_out} --force --env="

12
ml-agents/tests/yamato/yamato_utils.py


subprocess.check_call("rm -rf Project/Library", shell=True)
def override_config_file(src_path, dest_path, **kwargs):
def override_config_file(src_path, dest_path, overrides):
"""
Override settings in a trainer config file. For example,
override_config_file(src_path, dest_path, {"max_steps": 42})

behavior_configs = configs["behaviors"]
for config in behavior_configs.values():
config.update(**kwargs)
_override_config_dict(config, overrides)
def _override_config_dict(config, overrides):
for key, val in overrides.items():
if isinstance(val, dict):
_override_config_dict(config[key], val)
else:
config[key] = val
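A minimal sketch of how this nested override merge behaves, assuming the _override_config_dict helper shown above is in scope; the config keys and values here are illustrative, not taken from the PR:
# Only the keys named in `overrides` are replaced; sibling keys are kept as-is.
config = {
    "max_steps": 500000,
    "hyperparameters": {"batch_size": 1024, "buffer_size": 10240},
}
_override_config_dict(config, {"max_steps": 100, "hyperparameters": {"batch_size": 10}})
assert config["max_steps"] == 100
assert config["hyperparameters"] == {"batch_size": 10, "buffer_size": 10240}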
def override_legacy_config_file(python_version, src_path, dest_path, **kwargs):

110
config/upgrade_config.py


import attr
import cattr
import yaml
from typing import Dict, Any
import argparse
from mlagents.trainers.settings import TrainerSettings, NetworkSettings, TrainerType
from mlagents.trainers.cli_utils import load_config
# Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format.
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]:
all_behavior_config_dict = {}
default_config = old_trainer_config.get("default", {})
for behavior_name, config in old_trainer_config.items():
if behavior_name != "default":
config = default_config.copy()
config.update(old_trainer_config[behavior_name])
# Convert to split TrainerSettings, Hyperparameters, NetworkSettings
# Set trainer_type and get appropriate hyperparameter settings
trainer_type = config["trainer"]
new_config = {}
new_config["trainer_type"] = trainer_type
hyperparam_cls = TrainerType(trainer_type).to_settings()
# Try to absorb as much as possible into the hyperparam_cls
new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls)
# Try to absorb as much as possible into the network settings
new_config["network_settings"] = cattr.structure(config, NetworkSettings)
# Deal with recurrent
if config["use_recurrent"]:
new_config["network_settings"].memory = NetworkSettings.MemorySettings(
sequence_length=config["sequence_length"],
memory_size=config["memory_size"],
)
# Absorb the rest into the base TrainerSettings
for key, val in config.items():
if key in attr.fields_dict(TrainerSettings):
new_config[key] = val
# Structure the whole thing
all_behavior_config_dict[behavior_name] = cattr.structure(
new_config, TrainerSettings
)
return all_behavior_config_dict
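A hedged sketch of what convert_behaviors might produce for a small old-format config. The behavior name and values are placeholders, and it assumes convert_behaviors plus the settings classes shown elsewhere in this diff are in scope; the script itself is invoked roughly as `python config/upgrade_config.py <old_config.yaml> <new_config.yaml>` per the argparse definitions below.
# Hypothetical old-format config; keys mirror the legacy trainer_config.yaml style.
old_config = {
    "default": {"trainer": "ppo", "batch_size": 1024, "normalize": True, "use_recurrent": False},
    "3DBall": {"normalize": False},
}
new_behaviors = convert_behaviors(old_config)
ball = new_behaviors["3DBall"]                    # a TrainerSettings instance
assert ball.trainer_type == TrainerType.PPO       # from the old "trainer" key
assert ball.hyperparameters.batch_size == 1024    # absorbed into PPOSettings
assert ball.network_settings.normalize is False   # absorbed into NetworkSettings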
def write_to_yaml_file(config: Dict[str, Any], output_config: str):
unstructed_config = cattr.unstructure(config)
unstructed_config = remove_nones(unstructed_config)
with open(output_config, "w") as f:
try:
yaml.dump(unstructed_config, f, sort_keys=False)
except TypeError: # Older versions of pyyaml don't support sort_keys
yaml.dump(unstructed_config, f)
def remove_nones(config: Dict[Any, Any]):
new_config = {}
for key, val in config.items():
if isinstance(val, dict):
new_config[key] = remove_nones(val)
elif val is not None:
new_config[key] = val
return new_config
if __name__ == "__main__":
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
argparser.add_argument(
"trainer_config_path",
help="Path to old format (<=0.16.X) trainer configuration YAML.",
)
argparser.add_argument(
"--curriculum",
help="Path to old format (<=0.16.X) curriculum configuration YAML.",
default=None,
)
argparser.add_argument(
"--sampler",
help="Path to old format (<=0.16.X) parameter randomization configuration YAML.",
default=None,
)
argparser.add_argument(
"output_config_path", help="Path to write converted YAML file."
)
args = argparser.parse_args()
print(
f"Converting {args.trainer_config_path} and saving to {args.output_config_path}."
)
old_config = load_config(args.trainer_config_path)
behavior_config_dict = convert_behaviors(old_config)
full_config = {"behaviors": behavior_config_dict}
# Convert curriculum and sampler. Note that we don't validate these; if they were
# correct before, they should be correct now.
if args.curriculum is not None:
curriculum_config_dict = load_config(args.curriculum)
full_config["curriculum"] = curriculum_config_dict
if args.sampler is not None:
sampler_config_dict = load_config(args.sampler)
full_config["parameter_randomization"] = sampler_config_dict
write_to_yaml_file(full_config, args.output_config_path)

373
ml-agents/mlagents/trainers/settings.py


import attr
import cattr
from typing import Dict, Optional, List, Any, DefaultDict, Mapping
from enum import Enum
import collections
import argparse
from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser
from mlagents.trainers.cli_utils import load_config
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.models import ScheduleType, EncoderType
def check_and_structure(key: str, value: Any, class_type: type) -> Any:
attr_fields_dict = attr.fields_dict(class_type)
if key not in attr_fields_dict:
raise TrainerConfigError(
f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid."
)
# Apply cattr structure to the values
return cattr.structure(value, attr_fields_dict[key].type)
def strict_to_cls(d: Mapping, t: type) -> Any:
if not isinstance(d, Mapping):
raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.")
d_copy: Dict[str, Any] = {}
d_copy.update(d)
for key, val in d_copy.items():
d_copy[key] = check_and_structure(key, val, t)
return t(**d_copy)
def defaultdict_to_dict(d: DefaultDict) -> Dict:
return {key: cattr.unstructure(val) for key, val in d.items()}
@attr.s(auto_attribs=True)
class ExportableSettings:
def as_dict(self):
return cattr.unstructure(self)
@attr.s(auto_attribs=True)
class NetworkSettings:
@attr.s(auto_attribs=True)
class MemorySettings:
sequence_length: int = 64
memory_size: int = 128
normalize: bool = False
hidden_units: int = 128
num_layers: int = 2
vis_encode_type: EncoderType = EncoderType.SIMPLE
memory: Optional[MemorySettings] = None
@attr.s(auto_attribs=True)
class BehavioralCloningSettings:
demo_path: str
steps: int = 0
strength: float = 1.0
samples_per_update: int = 0
# Setting either of these to None will allow the Optimizer
# to decide these parameters, based on Trainer hyperparams
num_epoch: Optional[int] = None
batch_size: Optional[int] = None
@attr.s(auto_attribs=True)
class HyperparamSettings:
batch_size: int = 1024
buffer_size: int = 10240
learning_rate: float = 3.0e-4
learning_rate_schedule: ScheduleType = ScheduleType.CONSTANT
@attr.s(auto_attribs=True)
class PPOSettings(HyperparamSettings):
beta: float = 5.0e-3
epsilon: float = 0.2
lambd: float = 0.95
num_epoch: int = 3
learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
@attr.s(auto_attribs=True)
class SACSettings(HyperparamSettings):
batch_size: int = 128
buffer_size: int = 50000
buffer_init_steps: int = 0
tau: float = 0.005
steps_per_update: float = 1
save_replay_buffer: bool = False
init_entcoef: float = 1.0
reward_signal_steps_per_update: float = attr.ib()
@reward_signal_steps_per_update.default
def _reward_signal_steps_per_update_default(self):
return self.steps_per_update
class RewardSignalType(Enum):
EXTRINSIC: str = "extrinsic"
GAIL: str = "gail"
CURIOSITY: str = "curiosity"
def to_settings(self) -> type:
_mapping = {
RewardSignalType.EXTRINSIC: RewardSignalSettings,
RewardSignalType.GAIL: GAILSettings,
RewardSignalType.CURIOSITY: CuriositySettings,
}
return _mapping[self]
@attr.s(auto_attribs=True)
class RewardSignalSettings:
gamma: float = 0.99
strength: float = 1.0
@staticmethod
def structure(d: Mapping, t: type) -> Any:
"""
Helper method to structure a Dict of RewardSignalSettings classes. Meant to be registered with
cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle
the special Enum selection of RewardSignalSettings classes.
"""
if not isinstance(d, Mapping):
raise TrainerConfigError(f"Unsupported reward signal configuration {d}.")
d_final: Dict[RewardSignalType, RewardSignalSettings] = {}
for key, val in d.items():
enum_key = RewardSignalType(key)
t = enum_key.to_settings()
d_final[enum_key] = strict_to_cls(val, t)
return d_final
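As a rough illustration of the structure hook above (the values and demo path are placeholders, and typing.Dict is assumed imported as in settings.py): a reward_signals mapping loaded from YAML becomes a dict keyed by RewardSignalType, with each value structured into the matching settings class.
raw = {
    "extrinsic": {"gamma": 0.99},
    "gail": {"strength": 0.1, "demo_path": "Project/demos/Expert.demo"},  # placeholder path
}
structured = RewardSignalSettings.structure(raw, Dict[RewardSignalType, RewardSignalSettings])
assert isinstance(structured[RewardSignalType.GAIL], GAILSettings)
assert structured[RewardSignalType.EXTRINSIC].gamma == 0.99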
@attr.s(auto_attribs=True)
class GAILSettings(RewardSignalSettings):
encoding_size: int = 64
learning_rate: float = 3e-4
use_actions: bool = False
use_vail: bool = False
demo_path: str = attr.ib(kw_only=True)
@attr.s(auto_attribs=True)
class CuriositySettings(RewardSignalSettings):
encoding_size: int = 64
learning_rate: float = 3e-4
@attr.s(auto_attribs=True)
class SelfPlaySettings:
save_steps: int = 20000
team_change: int = attr.ib()
@team_change.default
def _team_change_default(self):
# Assign team_change to 5x save_steps by default
return self.save_steps * 5
swap_steps: int = 10000
window: int = 10
play_against_latest_model_ratio: float = 0.5
initial_elo: float = 1200.0
class TrainerType(Enum):
PPO: str = "ppo"
SAC: str = "sac"
def to_settings(self) -> type:
_mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings}
return _mapping[self]
@attr.s(auto_attribs=True)
class TrainerSettings(ExportableSettings):
trainer_type: TrainerType = TrainerType.PPO
hyperparameters: HyperparamSettings = attr.ib()
@hyperparameters.default
def _set_default_hyperparameters(self):
return self.trainer_type.to_settings()()
network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
)
init_path: Optional[str] = None
output_path: str = "default"
keep_checkpoints: int = 5
max_steps: int = 500000
time_horizon: int = 64
summary_freq: int = 50000
threaded: bool = True
self_play: Optional[SelfPlaySettings] = None
behavioral_cloning: Optional[BehavioralCloningSettings] = None
cattr.register_structure_hook(
Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure
)
@network_settings.validator
def _check_batch_size_seq_length(self, attribute, value):
if self.network_settings.memory is not None:
if (
self.network_settings.memory.sequence_length
> self.hyperparameters.batch_size
):
raise TrainerConfigError(
"When using memory, sequence length must be less than or equal to batch size. "
)
@staticmethod
def dict_to_defaultdict(d: Dict, t: type) -> DefaultDict:
return collections.defaultdict(
TrainerSettings, cattr.structure(d, Dict[str, TrainerSettings])
)
@staticmethod
def structure(d: Mapping, t: type) -> Any:
"""
Helper method to structure a TrainerSettings class. Meant to be registered with
cattr.register_structure_hook() and called with cattr.structure().
"""
if not isinstance(d, Mapping):
raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.")
d_copy: Dict[str, Any] = {}
d_copy.update(d)
for key, val in d_copy.items():
if attr.has(type(val)):
# Don't convert already-converted attrs classes.
continue
if key == "hyperparameters":
if "trainer_type" not in d_copy:
raise TrainerConfigError(
"Hyperparameters were specified but no trainer_type was given."
)
else:
d_copy[key] = strict_to_cls(
d_copy[key], TrainerType(d_copy["trainer_type"]).to_settings()
)
elif key == "max_steps":
d_copy[key] = int(float(val))
# In some legacy configs, max steps was specified as a float
else:
d_copy[key] = check_and_structure(key, val, t)
return t(**d_copy)
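A brief sketch of the resulting behavior (also exercised by the tests further down): hyperparameters defaults to the settings class matching trainer_type, and structuring a raw dict picks the corresponding hyperparameter class. Values here are illustrative.
defaults = TrainerSettings()                      # trainer_type defaults to PPO
assert isinstance(defaults.hyperparameters, PPOSettings)
sac = TrainerSettings.structure(
    {"trainer_type": "sac", "hyperparameters": {"buffer_size": 100000}}, TrainerSettings
)
assert isinstance(sac.hyperparameters, SACSettings)
assert sac.hyperparameters.buffer_size == 100000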
@attr.s(auto_attribs=True)
class CurriculumSettings:
class MeasureType:
PROGRESS: str = "progress"
REWARD: str = "reward"
measure: str = attr.ib(default=MeasureType.REWARD)
thresholds: List[int] = attr.ib(factory=list)
min_lesson_length: int = 0
signal_smoothing: bool = True
parameters: Dict[str, List[float]] = attr.ib(kw_only=True)
@attr.s(auto_attribs=True)
class CheckpointSettings:
save_freq: int = parser.get_default("save_freq")
run_id: str = parser.get_default("run_id")
initialize_from: str = parser.get_default("initialize_from")
load_model: bool = parser.get_default("load_model")
resume: bool = parser.get_default("resume")
force: bool = parser.get_default("force")
train_model: bool = parser.get_default("train_model")
inference: bool = parser.get_default("inference")
lesson: int = parser.get_default("lesson")
@attr.s(auto_attribs=True)
class EnvironmentSettings:
env_path: Optional[str] = parser.get_default("env_path")
env_args: Optional[List[str]] = parser.get_default("env_args")
base_port: int = parser.get_default("base_port")
num_envs: int = parser.get_default("num_envs")
seed: int = parser.get_default("seed")
@attr.s(auto_attribs=True)
class EngineSettings:
width: int = parser.get_default("width")
height: int = parser.get_default("height")
quality_level: int = parser.get_default("quality_level")
time_scale: float = parser.get_default("time_scale")
target_frame_rate: int = parser.get_default("target_frame_rate")
capture_frame_rate: int = parser.get_default("capture_frame_rate")
no_graphics: bool = parser.get_default("no_graphics")
@attr.s(auto_attribs=True)
class RunOptions(ExportableSettings):
behaviors: DefaultDict[str, TrainerSettings] = attr.ib(
factory=lambda: collections.defaultdict(TrainerSettings)
)
env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings)
engine_settings: EngineSettings = attr.ib(factory=EngineSettings)
parameter_randomization: Optional[Dict] = None
curriculum: Optional[Dict[str, CurriculumSettings]] = None
checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings)
# These are options that are relevant to the run itself, and not the engine or environment.
# They will be left here.
debug: bool = parser.get_default("debug")
# Strict conversion
cattr.register_structure_hook(EnvironmentSettings, strict_to_cls)
cattr.register_structure_hook(EngineSettings, strict_to_cls)
cattr.register_structure_hook(CheckpointSettings, strict_to_cls)
cattr.register_structure_hook(CurriculumSettings, strict_to_cls)
cattr.register_structure_hook(TrainerSettings, TrainerSettings.structure)
cattr.register_structure_hook(
DefaultDict[str, TrainerSettings], TrainerSettings.dict_to_defaultdict
)
cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict)
@staticmethod
def from_argparse(args: argparse.Namespace) -> "RunOptions":
"""
Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files
from file paths, and converts to a RunOptions instance.
:param args: collection of command-line parameters passed to mlagents-learn
:return: RunOptions representing the passed in arguments, with trainer config, curriculum and sampler
configs loaded from files.
"""
argparse_args = vars(args)
config_path = StoreConfigFile.trainer_config_path
# Load YAML
configured_dict: Dict[str, Any] = {
"checkpoint_settings": {},
"env_settings": {},
"engine_settings": {},
}
if config_path is not None:
configured_dict.update(load_config(config_path))
# Use the YAML file values for all values not specified in the CLI.
for key in configured_dict.keys():
# Detect bad config options
if key not in attr.fields_dict(RunOptions):
raise TrainerConfigError(
"The option {} was specified in your YAML file, but is invalid.".format(
key
)
)
# Override with CLI args
# Keep deprecated --load working, TODO: remove
argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]
for key, val in argparse_args.items():
if key in DetectDefault.non_default_args:
if key in attr.fields_dict(CheckpointSettings):
configured_dict["checkpoint_settings"][key] = val
elif key in attr.fields_dict(EnvironmentSettings):
configured_dict["env_settings"][key] = val
elif key in attr.fields_dict(EngineSettings):
configured_dict["engine_settings"][key] = val
else: # Base options
configured_dict[key] = val
return RunOptions.from_dict(configured_dict)
@staticmethod
def from_dict(options_dict: Dict[str, Any]) -> "RunOptions":
return cattr.structure(options_dict, RunOptions)
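For illustration only (the behavior names and values are hypothetical): structuring a loaded YAML dict with the new top-level sections yields a RunOptions whose behaviors mapping is a defaultdict, so behaviors missing from the config fall back to default TrainerSettings.
opts = RunOptions.from_dict(
    {
        "behaviors": {"3DBall": {"trainer_type": "ppo", "max_steps": 5.0e5}},
        "engine_settings": {"time_scale": 20},
    }
)
assert opts.behaviors["3DBall"].max_steps == 500000        # legacy float converted to int
assert opts.behaviors["SomeOtherBehavior"].trainer_type == TrainerType.PPO  # defaultdict fallback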

151
ml-agents/mlagents/trainers/tests/test_settings.py


import attr
import pytest
from typing import Dict
from mlagents.trainers.settings import (
RunOptions,
TrainerSettings,
PPOSettings,
SACSettings,
RewardSignalType,
RewardSignalSettings,
CuriositySettings,
TrainerType,
strict_to_cls,
)
from mlagents.trainers.exception import TrainerConfigError
def check_if_different(testobj1: object, testobj2: object) -> None:
assert testobj1 is not testobj2
if attr.has(testobj1.__class__) and attr.has(testobj2.__class__):
for key, val in attr.asdict(testobj1, recurse=False).items():
if isinstance(val, dict) or isinstance(val, list) or attr.has(val):
# Note: this check doesn't check the contents of mutables.
check_if_different(val, attr.asdict(testobj2, recurse=False)[key])
def test_is_new_instance():
"""
Verify that every instance of RunOptions() and its subclasses
is a new instance (i.e. all factory methods are used properly.)
"""
check_if_different(RunOptions(), RunOptions())
check_if_different(TrainerSettings(), TrainerSettings())
def test_no_configuration():
"""
Verify that a new config will have a PPO trainer with extrinsic rewards.
"""
blank_runoptions = RunOptions()
assert isinstance(blank_runoptions.behaviors["test"], TrainerSettings)
assert isinstance(blank_runoptions.behaviors["test"].hyperparameters, PPOSettings)
assert (
RewardSignalType.EXTRINSIC in blank_runoptions.behaviors["test"].reward_signals
)
def test_strict_to_cls():
"""
Test strict structuring method.
"""
@attr.s(auto_attribs=True)
class TestAttrsClass:
field1: int = 0
field2: str = "test"
correct_dict = {"field1": 1, "field2": "test2"}
assert strict_to_cls(correct_dict, TestAttrsClass) == TestAttrsClass(**correct_dict)
incorrect_dict = {"field3": 1, "field2": "test2"}
with pytest.raises(TrainerConfigError):
strict_to_cls(incorrect_dict, TestAttrsClass)
with pytest.raises(TrainerConfigError):
strict_to_cls("non_dict_input", TestAttrsClass)
def test_trainersettings_structure():
"""
Test structuring method for TrainerSettings
"""
trainersettings_dict = {
"trainer_type": "sac",
"hyperparameters": {"batch_size": 1024},
"max_steps": 1.0,
"reward_signals": {"curiosity": {"encoding_size": 64}},
}
trainer_settings = TrainerSettings.structure(trainersettings_dict, TrainerSettings)
assert isinstance(trainer_settings.hyperparameters, SACSettings)
assert trainer_settings.trainer_type == TrainerType.SAC
assert isinstance(trainer_settings.max_steps, int)
assert RewardSignalType.CURIOSITY in trainer_settings.reward_signals
# Check invalid trainer type
with pytest.raises(ValueError):
trainersettings_dict = {
"trainer_type": "puppo",
"hyperparameters": {"batch_size": 1024},
"max_steps": 1.0,
}
TrainerSettings.structure(trainersettings_dict, TrainerSettings)
# Check invalid hyperparameter
with pytest.raises(TrainerConfigError):
trainersettings_dict = {
"trainer_type": "ppo",
"hyperparameters": {"notahyperparam": 1024},
"max_steps": 1.0,
}
TrainerSettings.structure(trainersettings_dict, TrainerSettings)
# Check non-dict
with pytest.raises(TrainerConfigError):
TrainerSettings.structure("notadict", TrainerSettings)
# Check hyperparameters specified but trainer type left as default.
# This shouldn't work as you could specify non-PPO hyperparameters.
with pytest.raises(TrainerConfigError):
trainersettings_dict = {"hyperparameters": {"batch_size": 1024}}
TrainerSettings.structure(trainersettings_dict, TrainerSettings)
def test_reward_signal_structure():
"""
Tests the RewardSignalSettings structure method. This one is special because
it takes in a Dict[RewardSignalType, RewardSignalSettings].
"""
reward_signals_dict = {
"extrinsic": {"strength": 1.0},
"curiosity": {"strength": 1.0},
}
reward_signals = RewardSignalSettings.structure(
reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings]
)
assert isinstance(reward_signals[RewardSignalType.EXTRINSIC], RewardSignalSettings)
assert isinstance(reward_signals[RewardSignalType.CURIOSITY], CuriositySettings)
# Check invalid reward signal type
reward_signals_dict = {"puppo": {"strength": 1.0}}
with pytest.raises(ValueError):
RewardSignalSettings.structure(
reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings]
)
# Check missing GAIL demo path
reward_signals_dict = {"gail": {"strength": 1.0}}
with pytest.raises(TypeError):
RewardSignalSettings.structure(
reward_signals_dict, Dict[RewardSignalType, RewardSignalSettings]
)
# Check non-Dict input
with pytest.raises(TrainerConfigError):
RewardSignalSettings.structure(
"notadict", Dict[RewardSignalType, RewardSignalSettings]
)