[config] Disable `threading` by default (#5221)

* Remove threading as default
* New description
* Remove threaded option from YAML configs
* Remove from Match3
4 年前 · 45e75e01
--- a/config/imitation/Crawler.yaml
+++ b/config/imitation/Crawler.yaml
    max_steps: 10000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
    behavioral_cloning:
      demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawler.demo
      steps: 50000
--- a/config/imitation/Hallway.yaml
+++ b/config/imitation/Hallway.yaml
    max_steps: 10000000
    time_horizon: 64
    summary_freq: 10000
-    threaded: true
--- a/config/imitation/PushBlock.yaml
+++ b/config/imitation/PushBlock.yaml
    max_steps: 100000
    time_horizon: 64
    summary_freq: 60000
-    threaded: true
    behavioral_cloning:
      demo_path: Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPushBlock.demo
      steps: 50000
--- a/config/poca/DungeonEscape.yaml
+++ b/config/poca/DungeonEscape.yaml
    max_steps: 20000000
    time_horizon: 64
    summary_freq: 60000
-    threaded: true
--- a/config/poca/PushBlockCollab.yaml
+++ b/config/poca/PushBlockCollab.yaml
    max_steps: 15000000
    time_horizon: 64
    summary_freq: 60000
-    threaded: true
--- a/config/poca/SoccerTwos.yaml
+++ b/config/poca/SoccerTwos.yaml
    max_steps: 50000000
    time_horizon: 1000
    summary_freq: 10000
-    threaded: false
    self_play:
      save_steps: 50000
      team_change: 200000
--- a/config/poca/StrikersVsGoalie.yaml
+++ b/config/poca/StrikersVsGoalie.yaml
    max_steps: 30000000
    time_horizon: 1000
    summary_freq: 10000
-    threaded: false
    self_play:
      save_steps: 50000
      team_change: 200000
    max_steps: 30000000
    time_horizon: 1000
    summary_freq: 10000
-    threaded: false
    self_play:
      save_steps: 50000
      team_change: 200000
--- a/config/ppo/3DBall.yaml
+++ b/config/ppo/3DBall.yaml
    max_steps: 500000
    time_horizon: 1000
    summary_freq: 12000
-    threaded: true
--- a/config/ppo/3DBallHard.yaml
+++ b/config/ppo/3DBallHard.yaml
    max_steps: 500000
    time_horizon: 1000
    summary_freq: 12000
-    threaded: true
--- a/config/ppo/3DBall_randomize.yaml
+++ b/config/ppo/3DBall_randomize.yaml
    max_steps: 500000
    time_horizon: 1000
    summary_freq: 12000
-    threaded: true
 environment_parameters:
  mass:
    sampler_type: uniform
--- a/config/ppo/Basic.yaml
+++ b/config/ppo/Basic.yaml
    max_steps: 500000
    time_horizon: 3
    summary_freq: 2000
-    threaded: true
--- a/config/ppo/Crawler.yaml
+++ b/config/ppo/Crawler.yaml
    max_steps: 10000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/config/ppo/FoodCollector.yaml
+++ b/config/ppo/FoodCollector.yaml
    max_steps: 2000000
    time_horizon: 64
    summary_freq: 10000
-    threaded: true
--- a/config/ppo/GridWorld.yaml
+++ b/config/ppo/GridWorld.yaml
    max_steps: 500000
    time_horizon: 5
    summary_freq: 20000
-    threaded: true
--- a/config/ppo/Hallway.yaml
+++ b/config/ppo/Hallway.yaml
    max_steps: 10000000
    time_horizon: 64
    summary_freq: 10000
-    threaded: true
--- a/config/ppo/Match3.yaml
+++ b/config/ppo/Match3.yaml
  max_steps: 5000000
  time_horizon: 128
  summary_freq: 10000
-  threaded: true

 behaviors:
  Match3SimpleHeuristic:
      num_layers: 1
    max_steps: 5000000
    summary_freq: 10000
-    threaded: true
  Match3SmartHeuristic:
    # Settings can be very simple since we don't care about actually training the model
    trainer_type: ppo
      num_layers: 1
    max_steps: 5000000
    summary_freq: 10000
-    threaded: true
--- a/config/ppo/PushBlock.yaml
+++ b/config/ppo/PushBlock.yaml
    max_steps: 2000000
    time_horizon: 64
    summary_freq: 60000
-    threaded: true
--- a/config/ppo/Pyramids.yaml
+++ b/config/ppo/Pyramids.yaml
    max_steps: 10000000
    time_horizon: 128
    summary_freq: 30000
-    threaded: true
--- a/config/ppo/PyramidsRND.yaml
+++ b/config/ppo/PyramidsRND.yaml
    max_steps: 3000000
    time_horizon: 128
    summary_freq: 30000
-    threaded: true
--- a/config/ppo/Sorter_curriculum.yaml
+++ b/config/ppo/Sorter_curriculum.yaml
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
-      learning_rate_schedule: constant 
+      learning_rate_schedule: constant
    network_settings:
      normalize: False
      hidden_units: 128
    max_steps: 5000000
    time_horizon: 256
    summary_freq: 10000
-    threaded: true
 environment_parameters:
  num_tiles:
    curriculum:
--- a/config/ppo/Visual3DBall.yaml
+++ b/config/ppo/Visual3DBall.yaml
    max_steps: 400000
    time_horizon: 64
    summary_freq: 20000
-    threaded: true
--- a/config/ppo/VisualFoodCollector.yaml
+++ b/config/ppo/VisualFoodCollector.yaml
    max_steps: 3000000
    time_horizon: 100
    summary_freq: 40000
-    threaded: true
--- a/config/ppo/Walker.yaml
+++ b/config/ppo/Walker.yaml
    max_steps: 30000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/config/ppo/WallJump.yaml
+++ b/config/ppo/WallJump.yaml
    max_steps: 20000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
  SmallWallJump:
    trainer_type: ppo
    hyperparameters:
    max_steps: 5000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
--- a/config/ppo/WallJump_curriculum.yaml
+++ b/config/ppo/WallJump_curriculum.yaml
    max_steps: 20000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
  SmallWallJump:
    trainer_type: ppo
    hyperparameters:
    max_steps: 5000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
 environment_parameters:
  big_wall_height:
    curriculum:
--- a/config/ppo/Worm.yaml
+++ b/config/ppo/Worm.yaml
    max_steps: 7000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/config/sac/3DBall.yaml
+++ b/config/sac/3DBall.yaml
    max_steps: 200000
    time_horizon: 1000
    summary_freq: 12000
-    threaded: true
--- a/config/sac/3DBallHard.yaml
+++ b/config/sac/3DBallHard.yaml
    max_steps: 500000
    time_horizon: 1000
    summary_freq: 12000
-    threaded: true
--- a/config/sac/Basic.yaml
+++ b/config/sac/Basic.yaml
    max_steps: 500000
    time_horizon: 10
    summary_freq: 2000
-    threaded: true
--- a/config/sac/Crawler.yaml
+++ b/config/sac/Crawler.yaml
    max_steps: 5000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/config/sac/FoodCollector.yaml
+++ b/config/sac/FoodCollector.yaml
    max_steps: 2000000
    time_horizon: 64
    summary_freq: 60000
-    threaded: true
+    threaded: false
--- a/config/sac/GridWorld.yaml
+++ b/config/sac/GridWorld.yaml
    max_steps: 500000
    time_horizon: 5
    summary_freq: 20000
-    threaded: true
--- a/config/sac/Hallway.yaml
+++ b/config/sac/Hallway.yaml
    max_steps: 5000000
    time_horizon: 64
    summary_freq: 10000
-    threaded: true
--- a/config/sac/PushBlock.yaml
+++ b/config/sac/PushBlock.yaml
    max_steps: 2000000
    time_horizon: 64
    summary_freq: 100000
-    threaded: true
--- a/config/sac/Pyramids.yaml
+++ b/config/sac/Pyramids.yaml
    max_steps: 3000000
    time_horizon: 128
    summary_freq: 30000
-    threaded: true
--- a/config/sac/Walker.yaml
+++ b/config/sac/Walker.yaml
    max_steps: 15000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/config/sac/WallJump.yaml
+++ b/config/sac/WallJump.yaml
    max_steps: 15000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
  SmallWallJump:
    trainer_type: sac
    hyperparameters:
    max_steps: 5000000
    time_horizon: 128
    summary_freq: 20000
-    threaded: true
--- a/config/sac/Worm.yaml
+++ b/config/sac/Worm.yaml
    max_steps: 5000000
    time_horizon: 1000
    summary_freq: 30000
-    threaded: true
--- a/docs/Training-Configuration-File.md
+++ b/docs/Training-Configuration-File.md
 | `keep_checkpoints`         | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the checkpoint_interval option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
 | `checkpoint_interval`         | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` files in the `results/` folder. |
 | `init_path`              | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run.                                                                                                                                  |
-| `threaded`               | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC.                                                                                                                                                                                                                                                       |
+| `threaded`               | (default = `false`) Allow environments to step while updating the model. This might result in a training speedup, especially when using SAC. For best performance, leave setting to `false` when using self-play.                                                                                                                                                                                                                      |
 | `hyperparameters -> learning_rate`          | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3`                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
 | `hyperparameters -> batch_size`             | Number of experiences in each iteration of gradient descent. **This should always be multiple times smaller than `buffer_size`**. If you are using continuous actions, this value should be large (on the order of 1000s). If you are using only discrete actions, this value should be smaller (on the order of 10s). <br><br> Typical range: (Continuous - PPO): `512` - `5120`; (Continuous - SAC): `128` - `1024`; (Discrete, PPO & SAC): `32` - `512`.                                                                                                                                                                                                                                                               |
 | `hyperparameters -> buffer_size`            | (default = `10240` for PPO and `50000` for SAC)<br> **PPO:** Number of experiences to collect before updating the policy model. Corresponds to how many experiences should be collected before we do any learning or updating of the model. **This should be multiple times larger than `batch_size`**. Typically a larger `buffer_size` corresponds to more stable training updates. <br> **SAC:** The max size of the experience buffer - on the order of thousands of times longer than your episodes, so that SAC can learn from old as well as new experiences. <br><br>Typical range: PPO: `2048` - `409600`; SAC: `50000` - `1000000`                                                                                                                                                      |
--- a/docs/Training-ML-Agents.md
+++ b/docs/Training-ML-Agents.md
    summary_freq: 10000
    keep_checkpoints: 5
    checkpoint_interval: 50000
-    threaded: true
+    threaded: false
    init_path: null

    # behavior cloning
--- a/ml-agents/mlagents/trainers/settings.py
+++ b/ml-agents/mlagents/trainers/settings.py
    max_steps: int = 500000
    time_horizon: int = 64
    summary_freq: int = 50000
-    threaded: bool = True
+    threaded: bool = False
    self_play: Optional[SelfPlaySettings] = None
    behavioral_cloning: Optional[BehavioralCloningSettings] = None