
Merge remote-tracking branch 'origin/master' into release_1_to_master

Branch: release_1_branch
Chris Elion, 5 years ago
Current commit: 68b68396
107 files changed, with 1,826 insertions and 293 deletions
  1. .gitignore (4 changes)
  2. com.unity.ml-agents/CHANGELOG.md (20 changes)
  3. com.unity.ml-agents/Editor/BehaviorParametersEditor.cs (4 changes)
  4. com.unity.ml-agents/Runtime/Agent.cs (2 changes)
  5. com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs (2 changes)
  6. com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs (4 changes)
  7. com.unity.ml-agents/Runtime/Inference/TensorApplier.cs (2 changes)
  8. com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs (2 changes)
  9. com.unity.ml-agents/Runtime/Inference/TensorProxy.cs (2 changes)
  10. com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs (2 changes)
  11. com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs (2 changes)
  12. com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs (4 changes)
  13. com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs (4 changes)
  14. com.unity.ml-agents/Tests/Editor/Sensor/ObservationWriterTests.cs (2 changes)
  15. com.unity.ml-agents/Tests/Editor/TensorUtilsTest.cs (4 changes)
  16. docs/Getting-Started.md (15 changes)
  17. docs/Learning-Environment-Create-New.md (43 changes)
  18. docs/Learning-Environment-Examples.md (4 changes)
  19. docs/Learning-Environment-Executable.md (13 changes)
  20. docs/Migrating.md (19 changes)
  21. docs/Using-Tensorboard.md (2 changes)
  22. gym-unity/README.md (3 changes)
  23. gym-unity/gym_unity/__init__.py (4 changes)
  24. ml-agents-envs/mlagents_envs/__init__.py (4 changes)
  25. ml-agents-envs/mlagents_envs/environment.py (30 changes)
  26. ml-agents-envs/mlagents_envs/tests/test_envs.py (12 changes)
  27. ml-agents/mlagents/trainers/__init__.py (4 changes)
  28. ml-agents/mlagents/trainers/learn.py (169 changes)
  29. ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)
  30. ml-agents/mlagents/trainers/ppo/trainer.py (3 changes)
  31. ml-agents/mlagents/trainers/sac/trainer.py (7 changes)
  32. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (6 changes)
  33. ml-agents/mlagents/trainers/tests/test_bcmodule.py (2 changes)
  34. ml-agents/mlagents/trainers/tests/test_ghost.py (9 changes)
  35. ml-agents/mlagents/trainers/tests/test_learn.py (133 changes)
  36. ml-agents/mlagents/trainers/tests/test_nn_policy.py (8 changes)
  37. ml-agents/mlagents/trainers/tests/test_policy.py (2 changes)
  38. ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)
  39. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)
  40. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)
  41. ml-agents/mlagents/trainers/tests/test_sac.py (14 changes)
  42. ml-agents/mlagents/trainers/tests/test_simple_rl.py (6 changes)
  43. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (6 changes)
  44. ml-agents/mlagents/trainers/tests/test_trainer_util.py (101 changes)
  45. ml-agents/mlagents/trainers/trainer/trainer.py (3 changes)
  46. ml-agents/mlagents/trainers/trainer_controller.py (20 changes)
  47. ml-agents/mlagents/trainers/trainer_util.py (55 changes)
  48. ml-agents/tests/yamato/scripts/run_llapi.py (17 changes)
  49. ml-agents/tests/yamato/training_int_tests.py (4 changes)
  50. ml-agents/tests/yamato/yamato_utils.py (3 changes)
  51. ml-agents/mlagents/trainers/cli_utils.py (41 changes)
  52. config/imitation/CrawlerStatic.yaml (29 changes)
  53. config/imitation/FoodCollector.yaml (29 changes)
  54. config/imitation/Hallway.yaml (28 changes)
  55. config/imitation/PushBlock.yaml (25 changes)
  56. config/imitation/Pyramids.yaml (36 changes)
  57. config/ppo/3DBall.yaml (25 changes)
  58. config/ppo/3DBallHard.yaml (25 changes)
  59. config/ppo/3DBall_randomize.yaml (40 changes)
  60. config/ppo/Basic.yaml (25 changes)
  61. config/ppo/Bouncer.yaml (25 changes)
  62. config/ppo/CrawlerDynamic.yaml (25 changes)
  63. config/ppo/CrawlerStatic.yaml (25 changes)
  64. config/ppo/FoodCollector.yaml (25 changes)
  65. config/ppo/GridWorld.yaml (25 changes)
  66. config/ppo/Hallway.yaml (25 changes)
  67. config/ppo/PushBlock.yaml (25 changes)
  68. config/ppo/Pyramids.yaml (29 changes)
  69. config/ppo/Reacher.yaml (25 changes)
  70. config/ppo/SoccerTwos.yaml (38 changes)
  71. config/ppo/StrikersVsGoalie.yaml (62 changes)
  72. config/ppo/Tennis.yaml (31 changes)
  73. config/ppo/VisualHallway.yaml (25 changes)
  74. config/ppo/VisualPushBlock.yaml (25 changes)
  75. config/ppo/VisualPyramids.yaml (29 changes)
  76. config/ppo/Walker.yaml (25 changes)
  77. config/ppo/WallJump.yaml (50 changes)
  78. config/ppo/WallJump_curriculum.yaml (65 changes)
  79. config/ppo/WormDynamic.yaml (25 changes)
  80. config/ppo/WormStatic.yaml (25 changes)
  81. config/sac/3DBall.yaml (25 changes)
  82. config/sac/3DBallHard.yaml (25 changes)
  83. config/sac/Basic.yaml (25 changes)
  84. config/sac/Bouncer.yaml (25 changes)
  85. config/sac/CrawlerDynamic.yaml (25 changes)
  86. config/sac/CrawlerStatic.yaml (25 changes)
  87. config/sac/FoodCollector.yaml (25 changes)
  88. config/sac/GridWorld.yaml (25 changes)
  89. config/sac/Hallway.yaml (25 changes)
  90. config/sac/PushBlock.yaml (25 changes)
  91. config/sac/Pyramids.yaml (31 changes)
  92. config/sac/Reacher.yaml (25 changes)
  93. config/sac/Tennis.yaml (30 changes)
  94. config/sac/VisualHallway.yaml (26 changes)
  95. config/sac/VisualPushBlock.yaml (26 changes)

.gitignore (4 changes)


# Tensorflow Model Info
# Output Artifacts (Legacy)
# Output Artifacts
/results
# Training environments
/envs

com.unity.ml-agents/CHANGELOG.md (20 changes)


and this project adheres to
[Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [1.0.0-preview] - 2020-05-06
## [Unreleased]
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Curriculum and Parameter Randomization configurations have been merged
into the main training configuration file. Note that this means training
configuration files are now environment-specific; a sketch of the merged layout follows this list. (#3791)
- Training artifacts (trained models, summaries) are now found in the `results/`
directory. (#3829)
- Unity Player logs are now written out to the results directory. (#3877)
- Run configuration YAML files are written out to the results directory at the end of the run. (#3815)
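A minimal sketch of what the merged, environment-specific configuration might look like after #3791. The `RollerBall` behavior name and the curriculum/randomization values are illustrative only, not taken from this commit; the snippet just parses the structure to show where the formerly separate files now live:

```python
import yaml

# Hypothetical merged training configuration (post-#3791 layout): trainer
# settings, curriculum, and parameter randomization live in a single file.
merged = yaml.safe_load(
    """
behaviors:
  RollerBall:
    trainer: ppo
    max_steps: 5.0e4
    curriculum:              # formerly a separate --curriculum file
      measure: progress
      thresholds: [0.5]
parameter_randomization:     # formerly a separate --sampler file
  mass:
    sampler-type: uniform
    min_value: 0.5
    max_value: 10.0
"""
)
assert "behaviors" in merged  # learn.py now requires this section
```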
### Bug Fixes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
## [1.0.0-preview] - 2020-04-30
### Major Changes

com.unity.ml-agents/Editor/BehaviorParametersEditor.cs (4 changes)


using Unity.MLAgents.Sensors;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Sensors;
using UnityEngine;
namespace Unity.MLAgents.Editor

com.unity.ml-agents/Runtime/Agent.cs (2 changes)


using System.Collections.Generic;
using System.Collections.ObjectModel;
using UnityEngine;
using Unity.Barracuda;
using Unity.Barracuda;
using UnityEngine.Serialization;
namespace Unity.MLAgents

com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs (2 changes)


using System;
using System.Collections.Generic;
using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs (4 changes)


using System.Collections.Generic;
using System;
using Unity.MLAgents.Inference.Utils;
using Unity.Barracuda;
using Unity.Barracuda;
using Unity.MLAgents.Inference.Utils;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Inference

com.unity.ml-agents/Runtime/Inference/TensorApplier.cs (2 changes)


using System.Collections.Generic;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs (2 changes)


using System.Collections.Generic;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Inference/TensorProxy.cs (2 changes)


using System;
using System.Collections.Generic;
using Unity.MLAgents.Inference.Utils;
using Unity.MLAgents.Inference.Utils;
namespace Unity.MLAgents.Inference
{

com.unity.ml-agents/Runtime/Sensors/ObservationWriter.cs (2 changes)


using System;
using System.Collections.Generic;
using Unity.MLAgents.Inference;
using Unity.MLAgents.Inference;
namespace Unity.MLAgents.Sensors
{

com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs (2 changes)


using System.Collections.Generic;
using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Tests
{

com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs (4 changes)


using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
using System.Linq;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs (4 changes)


using System.Linq;
using Unity.Barracuda;
using Unity.Barracuda;
using System.Linq;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/Sensor/ObservationWriterTests.cs (2 changes)


using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
namespace Unity.MLAgents.Tests

com.unity.ml-agents/Tests/Editor/TensorUtilsTest.cs (4 changes)


using System;
using NUnit.Framework;
using Unity.Barracuda;
using Unity.Barracuda;
using NUnit.Framework;
namespace Unity.MLAgents.Tests
{

docs/Getting-Started.md (15 changes)


1. Navigate to the folder where you cloned the `ml-agents` repository. **Note**:
If you followed the default [installation](Installation.md), then you should
be able to run `mlagents-learn` from any directory.
1. Run `mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun`.
- `config/trainer_config.yaml` is the path to a default training
configuration file that we provide. It includes training configurations for
all our example environments, including 3DBall.
1. Run `mlagents-learn config/ppo/3DBall.yaml --run-id=first3DBallRun`.
- `config/ppo/3DBall.yaml` is the path to a default training
configuration file that we provide. The `config/ppo` folder includes training configuration
files for all our example environments, including 3DBall.
- `run-id` is a unique name for this training session.
1. When the message _"Start training by pressing the Play button in the Unity
Editor"_ is displayed on the screen, you can press the **Play** button in

sequence_length: 64
summary_freq: 1000
use_recurrent: False
summary_path: ./summaries/first3DBallRun
model_path: ./models/first3DBallRun/3DBallLearning
output_path: ./results/first3DBallRun/3DBallLearning
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first3DBallRun: 3DBallLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.

run the same command again, appending the `--resume` flag:
```sh
mlagents-learn config/trainer_config.yaml --run-id=first3DBallRun --resume
mlagents-learn config/ppo/3DBall.yaml --run-id=firstRun --resume
```
Your trained model will be at `models/<run-identifier>/<behavior_name>.nn` where
Your trained model will be at `results/<run-identifier>/<behavior_name>.nn` where
`<behavior_name>` is the name of the `Behavior Name` of the agents corresponding
to the model. This file corresponds to your model's latest checkpoint. You can
now embed this trained model into your Agents by following the steps below,

docs/Learning-Environment-Create-New.md (43 changes)


and include the following hyperparameter values:
```yml
# Removed (old flat layout):
RollerBall:
    trainer: ppo
    batch_size: 10
    beta: 5.0e-3
    buffer_size: 100
    epsilon: 0.2
    hidden_units: 128
    lambd: 0.95
    learning_rate: 3.0e-4
    learning_rate_schedule: linear
    max_steps: 5.0e4
    normalize: false
    num_epoch: 3
    num_layers: 2
    time_horizon: 64
    summary_freq: 10000
    use_recurrent: false
    reward_signals:
        extrinsic:
            strength: 1.0
            gamma: 0.99

# Added (new layout, nested under a top-level `behaviors` section):
behaviors:
    RollerBall:
        trainer: ppo
        batch_size: 10
        beta: 5.0e-3
        buffer_size: 100
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        learning_rate_schedule: linear
        max_steps: 5.0e4
        normalize: false
        num_epoch: 3
        num_layers: 2
        time_horizon: 64
        summary_freq: 10000
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
```
Since this example creates a very simple training environment with only a few

docs/Learning-Environment-Examples.md (4 changes)


does not train with the provided default training parameters.**
- Float Properties: None
- Benchmark Mean Reward: 0.7
- To speed up training, you can enable curiosity by adding the `curiosity`
reward signal in `config/trainer_config.yaml`
- To train this environment, you can enable curiosity by adding the `curiosity` reward signal
in `config/ppo/Hallway.yaml`
## Bouncer

docs/Learning-Environment-Executable.md (13 changes)


the directory where you installed the ML-Agents Toolkit, run:
```sh
mlagents-learn ../config/trainer_config.yaml --env=3DBall --run-id=firstRun
mlagents-learn ../config/ppo/3DBall.yaml --env=3DBall --run-id=firstRun
ml-agents$ mlagents-learn config/trainer_config.yaml --env=3DBall --run-id=first-run
ml-agents$ mlagents-learn config/ppo/3DBall.yaml --env=3DBall --run-id=first-run
```
▄▄▄▓▓▓▓

```
sequence_length: 64
summary_freq: 1000
use_recurrent: False
summary_path: ./summaries/first-run-0
model_path: ./models/first-run-0/Ball3DLearning
output_path: ./results/first-run-0/Ball3DLearning
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 1000. Mean Reward: 1.242. Std of Reward: 0.746. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 2000. Mean Reward: 1.319. Std of Reward: 0.693. Training.
INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 3000. Mean Reward: 1.804. Std of Reward: 1.056. Training.

INFO:mlagents.trainers: first-run-0: Ball3DLearning: Step: 10000. Mean Reward: 27.284. Std of Reward: 28.667. Training.
```
You can press `Ctrl+C` to stop the training, and your trained model will be at
`models/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
You can press Ctrl+C to stop the training, and your trained model will be at
`results/<run-identifier>/<behavior_name>.nn`, which corresponds to your model's
trainer_config.yaml.) You can now embed this trained model into your Agent by
your config YAML.) You can now embed this trained model into your Agent by
following the steps below:
1. Move your model file into

docs/Migrating.md (19 changes)


## Migrating from Release 1 to latest
### Important changes
- Training artifacts (trained models, summaries) are now found under `results/`
instead of `summaries/` and `models/`.
- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into
a separate trainer configuration file, under a `behaviors` section. You can move the `default` section too
if it's being used. This file should be specific to your environment, and not contain configurations for
multiple environments (unless they have the same Behavior Names).
- If your training uses [curriculum](Training-Curriculum-Learning.md), move those configurations under
the `Behavior Name` section.
- If your training uses [parameter randomization](Training-Environment-Parameter-Randomization.md), move
the contents of the sampler config to `parameter_randomization` in the main trainer configuration; a sketch of this merge follows this list.
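As a rough illustration of the mechanical part of this migration (the file names are placeholders, and the old curriculum file is assumed to map behavior names to lesson configs, matching `assemble_curriculum_config` later in this diff):

```python
import yaml

def merge_legacy_configs(trainer_cfg_path, curriculum_path=None,
                         sampler_path=None, out_path="MyEnvironment.yaml"):
    """Illustrative only: fold the old three-file layout into the new
    single per-environment file with a top-level `behaviors` section."""
    with open(trainer_cfg_path) as f:
        behaviors = yaml.safe_load(f)      # old Behavior Name sections
    merged = {"behaviors": behaviors}
    if curriculum_path:
        with open(curriculum_path) as f:   # {behavior_name: curriculum}
            for name, curr in yaml.safe_load(f).items():
                merged["behaviors"].setdefault(name, {})["curriculum"] = curr
    if sampler_path:
        with open(sampler_path) as f:      # sampler config moves wholesale
            merged["parameter_randomization"] = yaml.safe_load(f)
    with open(out_path, "w") as f:
        yaml.dump(merged, f)
```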
## Migrating from 0.15 to Release 1

longer takes a file name as input but a fully constructed `UnityEnvironment`
instead.
- Update uses of "camelCase" fields and properties to "PascalCase".
- If you have a custom `ISensor` implementation, you will need to change the
signature of its `Write()` method to use `ObservationWriter` instead of
`WriteAdapter`.
## Migrating from 0.14 to 0.15

- Multiply `max_steps` and `summary_freq` in your `trainer_config.yaml` by the
number of Agents in the scene.
- Combine curriculum configs into a single file. See
[the WallJump curricula](../config/curricula/wall_jump.yaml) for an example of
[the WallJump curricula](https://github.com/Unity-Technologies/ml-agents/blob/0.14.1/config/curricula/wall_jump.yaml) for an example of
the new curriculum config format. A tool like https://www.json2yaml.com may be
useful to help with the conversion.
- If you have a model trained which uses RayPerceptionSensor and has non-1.0

- It is now required to specify the path to the yaml trainer configuration file
when running `mlagents-learn`. For an example trainer configuration file, see
[trainer_config.yaml](../config/trainer_config.yaml). An example of passing a
[trainer_config.yaml](https://github.com/Unity-Technologies/ml-agents/blob/0.5.0a/config/trainer_config.yaml). An example of passing a
trainer configuration to `mlagents-learn` is shown above.
- The environment name is now passed through the `--env` option.
- Curriculum learning has been changed. In summary:

docs/Using-Tensorboard.md (2 changes)


1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=summaries --port=6006`
1. From the command line run: `tensorboard --logdir=results --port=6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).

gym-unity/README.md (3 changes)


We provide results from our PPO implementation and the DQN from Baselines as
reference. Note that all runs used the same greyscale GridWorld as Dopamine. For
PPO, `num_layers` was set to 2, and all other hyperparameters are the default
for GridWorld in `trainer_config.yaml`. For Baselines DQN, the provided
for GridWorld in `config/ppo/GridWorld.yaml`. For Baselines DQN, the provided
![Dopamine on GridWorld](images/dopamine_gridworld_plot.png)

gym-unity/gym_unity/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents-envs/mlagents_envs/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents-envs/mlagents_envs/environment.py (30 changes)


seed: int = 0,
no_graphics: bool = False,
timeout_wait: int = 60,
args: Optional[List[str]] = None,
additional_args: Optional[List[str]] = None,
log_folder: Optional[str] = None,
):
"""
Starts a new unity environment and establishes a connection with the environment.

:int timeout_wait: Time (in seconds) to wait for connection from environment.
:list args: Additional Unity command line arguments
:list side_channels: Additional side channels for non-RL communication with Unity
:str log_folder: Optional folder to write the Unity Player log file into. Requires absolute path.
args = args or []
self.additional_args = additional_args or []
self.no_graphics = no_graphics
# If base port is not specified, use BASE_ENVIRONMENT_PORT if we have
# an environment, otherwise DEFAULT_EDITOR_PORT
if base_port is None:

)
)
self.side_channels[_sc.channel_id] = _sc
self.log_folder = log_folder
# If the environment name is None, a new environment will not be launched
# and the communicator will directly try to connect to an existing unity environment.

"the worker-id must be 0 in order to connect with the Editor."
)
if file_name is not None:
self.executable_launcher(file_name, no_graphics, args)
self.executable_launcher(file_name, no_graphics, additional_args)
else:
logger.info(
f"Listening on port {self.port}. "

launch_string = candidates[0]
return launch_string
def executable_args(self) -> List[str]:
args: List[str] = []
if self.no_graphics:
args += ["-nographics", "-batchmode"]
args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
if self.log_folder:
log_file_path = os.path.join(
self.log_folder, f"Player-{self.worker_id}.log"
)
args += ["-logFile", log_file_path]
# Add in arguments passed explicitly by the user.
args += self.additional_args
return args
def executable_launcher(self, file_name, no_graphics, args):
launch_string = self.validate_environment_path(file_name)
if launch_string is None:

else:
logger.debug("This is the launch string {}".format(launch_string))
# Launch Unity environment
subprocess_args = [launch_string]
if no_graphics:
subprocess_args += ["-nographics", "-batchmode"]
subprocess_args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
subprocess_args += args
subprocess_args = [launch_string] + self.executable_args()
try:
self.proc1 = subprocess.Popen(
subprocess_args,
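Putting the renamed `additional_args` and the new `log_folder` parameter together, a usage sketch (the executable name and log directory are placeholders; the docstring above notes that `log_folder` must be an absolute path):

```python
import os
from mlagents_envs.environment import UnityEnvironment

os.makedirs("logs", exist_ok=True)
env = UnityEnvironment(
    file_name="3DBall",                   # placeholder executable name
    worker_id=0,
    no_graphics=True,                     # adds -nographics -batchmode
    log_folder=os.path.abspath("logs"),   # Player-0.log is written here
    # extra engine flags go in additional_args=[...] (renamed from `args`)
)
try:
    env.reset()
finally:
    env.close()
```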

ml-agents-envs/mlagents_envs/tests/test_envs.py (12 changes)


@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_log_file_path_is_set(mock_communicator, mock_launcher):
mock_communicator.return_value = MockCommunicator()
env = UnityEnvironment(
file_name="myfile", worker_id=0, log_folder="./some-log-folder-path"
)
args = env.executable_args()
log_file_index = args.index("-logFile")
assert args[log_file_index + 1] == "./some-log-folder-path/Player-0.log"
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_reset(mock_communicator, mock_launcher):
mock_communicator.return_value = MockCommunicator(
discrete_action=False, visual_inputs=0

ml-agents/mlagents/trainers/__init__.py (4 changes)


# Version of the library that will be used to upload to pypi
__version__ = "0.16.0"
__version__ = "0.17.0.dev0"
__release_tag__ = "release_1"
__release_tag__ = None

ml-agents/mlagents/trainers/learn.py (169 changes)


# # Unity ML-Agents Toolkit
import argparse
import yaml
import os
import numpy as np

load_config,
TrainerFactory,
handle_existing_directories,
assemble_curriculum_config,
)
from mlagents.trainers.stats import (
TensorboardWriter,

ConsoleWriter,
)
from mlagents.trainers.cli_utils import (
StoreConfigFile,
DetectDefault,
DetectDefaultStoreTrue,
)
from mlagents.trainers.exception import SamplerException
from mlagents.trainers.exception import SamplerException, TrainerConfigError
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel

argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
argparser.add_argument("trainer_config_path")
argparser.add_argument("trainer_config_path", action=StoreConfigFile)
)
argparser.add_argument(
"--curriculum",
default=None,
dest="curriculum_config_path",
help="YAML file for defining the lessons for curriculum training",
action=DetectDefault,
)
argparser.add_argument(
"--lesson",

)
argparser.add_argument(
"--sampler",
default=None,
dest="sampler_file_path",
help="YAML file for defining the sampler for environment parameter randomization",
action=DetectDefault,
)
argparser.add_argument(
"--keep-checkpoints",

"number of steps specified by the save-freq option. Once the maximum number of checkpoints"
"has been reached, the oldest checkpoint is deleted when saving a new checkpoint.",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS, # Deprecated but still usable for now.
)
argparser.add_argument(

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. "
"If set, the training code loads an already trained model to initialize the neural network "
"before resuming training. This option is only valid when the models exist, and have the same "

"--force",
default=False,
dest="force",
action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to force-overwrite this run-id's existing summary and model data. (Without "
"this flag, attempting to train a model with a run-id that has been used before will throw "
"an error.",

"as the saved model itself. If you use TensorBoard to view the training statistics, "
"always set a unique run-id for each training run. (The statistics for all runs with the "
"same id are combined as if they were produced by a the same session.)",
action=DetectDefault,
)
argparser.add_argument(
"--initialize-from",

"This can be used, for instance, to fine-tune an existing model on a new environment. "
"Note that the previously saved models must have the same behavior parameters as your "
"current environment.",
action=DetectDefault,
)
argparser.add_argument(
"--save-freq",

action=DetectDefault,
)
argparser.add_argument(
"--seed",

action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help=argparse.SUPPRESS,
)
argparser.add_argument(

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load "
"a model trained with an existing run ID.",
)

"will use the port (base_port + worker_id), where the worker_id is sequential IDs given to "
"each instance from 0 to (num_envs - 1). Note that when training using the Editor rather "
"than an executable, the base port will be ignored.",
action=DetectDefault,
)
argparser.add_argument(
"--num-envs",

"from when training",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing "
"the graphics driver. Use this only if your agents don't use visual observations.",
)

action="store_true",
action=DetectDefaultStoreTrue,
help="Whether to enable debug-level logging for some parts of the code",
)
argparser.add_argument(

"process these as Unity Command Line Arguments. You should choose different argument names if "
"you want to create environment-specific arguments. All arguments after this flag will be "
"passed to the executable.",
action=DetectDefault,
action="store_true",
action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)

type=int,
help="The width of the executable window of the environment(s) in pixels "
"(ignored for editor training).",
action=DetectDefault,
)
eng_conf.add_argument(
"--height",

"(ignored for editor training)",
action=DetectDefault,
)
eng_conf.add_argument(
"--quality-level",

"QualitySettings.SetQualityLevel in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--time-scale",

"Time.timeScale in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--target-frame-rate",

"Application.targetFrameRate in Unity.",
action=DetectDefault,
)
eng_conf.add_argument(
"--capture-frame-rate",

"Time.captureFramerate in Unity.",
action=DetectDefault,
)
return argparser

class RunOptions(NamedTuple):
trainer_config: Dict
behaviors: Dict
debug: bool = parser.get_default("debug")
seed: int = parser.get_default("seed")
env_path: Optional[str] = parser.get_default("env_path")

lesson: int = parser.get_default("lesson")
no_graphics: bool = parser.get_default("no_graphics")
multi_gpu: bool = parser.get_default("multi_gpu")
sampler_config: Optional[Dict] = None
parameter_randomization: Optional[Dict] = None
env_args: Optional[List[str]] = parser.get_default("env_args")
cpu: bool = parser.get_default("cpu")
width: int = parser.get_default("width")

configs loaded from files.
"""
argparse_args = vars(args)
trainer_config_path = argparse_args["trainer_config_path"]
curriculum_config_path = argparse_args["curriculum_config_path"]
argparse_args["trainer_config"] = load_config(trainer_config_path)
if curriculum_config_path is not None:
argparse_args["curriculum_config"] = load_config(curriculum_config_path)
if argparse_args["sampler_file_path"] is not None:
argparse_args["sampler_config"] = load_config(
argparse_args["sampler_file_path"]
run_options_dict = {}
run_options_dict.update(argparse_args)
config_path = StoreConfigFile.trainer_config_path
# Load YAML
yaml_config = load_config(config_path)
# This is the only option that is not optional and has no defaults.
if "behaviors" not in yaml_config:
raise TrainerConfigError(
"Trainer configurations not found. Make sure your YAML file has a section for behaviors."
# Use the YAML file values for all values not specified in the CLI.
for key, val in yaml_config.items():
# Detect bad config options
if not hasattr(RunOptions, key):
raise TrainerConfigError(
"The option {} was specified in your YAML file, but is invalid.".format(
key
)
)
if key not in DetectDefault.non_default_args:
run_options_dict[key] = val
argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"]
# Since argparse accepts file paths in the config options which don't exist in CommandLineOptions,
# these keys will need to be deleted to use the **/splat operator below.
argparse_args.pop("sampler_file_path")
argparse_args.pop("curriculum_config_path")
argparse_args.pop("trainer_config_path")
return RunOptions(**vars(args))
run_options_dict["resume"] = (
run_options_dict["resume"] or run_options_dict["load_model"]
)
return RunOptions(**run_options_dict)
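The net effect of the block above: explicit CLI flags win over YAML values, which in turn win over argparse defaults. A simplified restatement of that precedence rule (a sketch, not the actual implementation):

```python
def merge_options(argparse_values: dict, yaml_values: dict,
                  cli_specified: set) -> dict:
    """Sketch: YAML fills in every option the user did not pass explicitly
    on the command line (cli_specified mirrors DetectDefault.non_default_args)."""
    merged = dict(argparse_values)
    for key, val in yaml_values.items():
        if key not in cli_specified:
            merged[key] = val
    return merged

# YAML sets keep_checkpoints: 34, but an explicit --keep-checkpoints=42 wins:
opts = merge_options({"keep_checkpoints": 42}, {"keep_checkpoints": 34},
                     cli_specified={"keep_checkpoints"})
assert opts["keep_checkpoints"] == 42
```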
def get_version_string() -> str:

:param run_options: Command line arguments for training.
"""
with hierarchical_timer("run_training.setup"):
model_path = f"./models/{options.run_id}"
base_path = "results"
write_path = os.path.join(base_path, options.run_id)
f"./models/{options.initialize_from}" if options.initialize_from else None
os.path.join(base_path, options.run_id) if options.initialize_from else None
summaries_dir = "./summaries"
run_logs_dir = os.path.join(write_path, "run_logs")
# Check if directory exists
handle_existing_directories(
write_path, options.resume, options.force, maybe_init_path
)
# Make run logs directory
os.makedirs(run_logs_dir, exist_ok=True)
summaries_dir,
write_path,
handle_existing_directories(
model_path, summaries_dir, options.resume, options.force, maybe_init_path
)
tb_writer = TensorboardWriter(summaries_dir, clear_past_data=not options.resume)
tb_writer = TensorboardWriter(write_path, clear_past_data=not options.resume)
gauge_write = GaugeWriter()
console_writer = ConsoleWriter()
StatsReporter.add_writer(tb_writer)

if options.env_path is None:
port = UnityEnvironment.DEFAULT_EDITOR_PORT
env_factory = create_environment_factory(
options.env_path, options.no_graphics, run_seed, port, options.env_args
options.env_path,
options.no_graphics,
run_seed,
port,
options.env_args,
os.path.abspath(run_logs_dir), # Unity environment requires absolute path
)
engine_config = EngineConfig(
width=options.width,

capture_frame_rate=options.capture_frame_rate,
)
env_manager = SubprocessEnvManager(env_factory, engine_config, options.num_envs)
curriculum_config = assemble_curriculum_config(options.behaviors)
options.curriculum_config, env_manager, options.lesson
curriculum_config, env_manager, options.lesson
options.sampler_config, run_seed
options.parameter_randomization, run_seed
options.trainer_config,
summaries_dir,
options.behaviors,
model_path,
write_path,
options.keep_checkpoints,
not options.inference,
options.resume,

# Create controller and begin training.
tc = TrainerController(
trainer_factory,
model_path,
summaries_dir,
write_path,
options.run_id,
options.save_freq,
maybe_meta_curriculum,

tc.start_learning(env_manager)
finally:
env_manager.close()
write_timing_tree(summaries_dir, options.run_id)
write_run_options(write_path, options)
write_timing_tree(run_logs_dir)
def write_timing_tree(summaries_dir: str, run_id: str) -> None:
timing_path = f"{summaries_dir}/{run_id}_timers.json"
def write_run_options(output_dir: str, run_options: RunOptions) -> None:
run_options_path = os.path.join(output_dir, "configuration.yaml")
try:
with open(run_options_path, "w") as f:
try:
yaml.dump(dict(run_options._asdict()), f, sort_keys=False)
except TypeError: # Older versions of pyyaml don't support sort_keys
yaml.dump(dict(run_options._asdict()), f)
except FileNotFoundError:
logger.warning(
f"Unable to save configuration to {run_options_path}. Make sure the directory exists"
)
def write_timing_tree(output_dir: str) -> None:
timing_path = os.path.join(output_dir, "timers.json")
try:
with open(timing_path, "w") as f:
json.dump(get_timer_tree(), f, indent=4)
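Based on the paths constructed in this file, a run's artifacts now land in a single tree, roughly as follows for `--run-id=first3DBallRun` (the behavior name is illustrative):

```python
# Approximate on-disk layout produced by the code above:
#
# results/
#   first3DBallRun/
#     configuration.yaml   # written by write_run_options(write_path, options)
#     3DBallLearning.nn    # exported model; name comes from the behavior
#     run_logs/
#       timers.json        # written by write_timing_tree(run_logs_dir)
#       Player-0.log       # Unity Player log (run_logs_dir is the log_folder)
```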

def try_create_meta_curriculum(
curriculum_config: Optional[Dict], env: SubprocessEnvManager, lesson: int
) -> Optional[MetaCurriculum]:
if curriculum_config is None:
if curriculum_config is None or len(curriculum_config) <= 0:
return None
else:
meta_curriculum = MetaCurriculum(curriculum_config)

seed: int,
start_port: int,
env_args: Optional[List[str]],
log_folder: str,
) -> Callable[[int, List[SideChannel]], BaseEnv]:
if env_path is not None:
launch_string = UnityEnvironment.validate_environment_path(env_path)

seed=env_seed,
no_graphics=no_graphics,
base_port=start_port,
args=env_args,
additional_args=env_args,
log_folder=log_folder,
)
return create_unity_environment

ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)


from typing import Any, Dict, List, Optional
import abc
import os
import numpy as np
from mlagents.tf_utils import tf
from mlagents import tf_utils

self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.model_path = trainer_parameters["model_path"]
self.model_path = trainer_parameters["output_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()

:return:
"""
with self.graph.as_default():
last_checkpoint = self.model_path + "/model-" + str(steps) + ".ckpt"
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False

ml-agents/mlagents/trainers/ppo/trainer.py (3 changes)


"sequence_length",
"summary_freq",
"use_recurrent",
"summary_path",
"model_path",
"output_path",
"reward_signals",
]
self._check_param_keys()

ml-agents/mlagents/trainers/sac/trainer.py (7 changes)


"summary_freq",
"tau",
"use_recurrent",
"summary_path",
"model_path",
"output_path",
"reward_signals",
]

Save the training buffer's update buffer to a pickle file.
"""
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
)
logger.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:

Loads the last saved replay buffer from a file.
"""
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
self.trainer_parameters["output_path"], "last_replay_buffer.hdf5"
)
logger.info("Loading Experience Replay Buffer from {}".format(filename))
with open(filename, "rb+") as file_object:

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (6 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
tf.reset_default_graph()
dummy_config["summary_path"] = str(tmpdir)
dummy_config["model_path"] = os.path.join(tmpdir, "test")
dummy_config["output_path"] = os.path.join(tmpdir, "test")
policy = create_policy_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)

ml-agents/mlagents/trainers/tests/test_bcmodule.py (2 changes)


def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
# model_path = env.external_brain_names[0]
trainer_config["model_path"] = "testpath"
trainer_config["output_path"] = "testpath"
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["behavioral_cloning"]["demo_path"] = (

ml-agents/mlagents/trainers/tests/test_ghost.py (9 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

ml-agents/mlagents/trainers/tests/test_learn.py (133 changes)


import pytest
import yaml
from mlagents.trainers.learn import parse_command_line
from mlagents.trainers.learn import parse_command_line, DetectDefault
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.stats import StatsReporter

return parse_command_line(args)
MOCK_YAML = """
behaviors:
{}
"""
MOCK_PARAMETER_YAML = """
behaviors:
{}
env_path: "./oldenvfile"
keep_checkpoints: 34
lesson: 2
run_id: uselessrun
save_freq: 654321
seed: 9870
base_port: 4001
num_envs: 4
debug: false
"""
MOCK_SAMPLER_CURRICULUM_YAML = """
behaviors:
behavior1:
curriculum:
curriculum1
behavior2:
curriculum:
curriculum2
parameter_randomization:
sampler1
"""
@patch("mlagents.trainers.learn.write_timing_tree")
@patch("mlagents.trainers.learn.write_run_options")
@patch("mlagents.trainers.learn.handle_existing_directories")
@patch("mlagents.trainers.learn.TrainerFactory")
@patch("mlagents.trainers.learn.SamplerManager")

sampler_manager_mock,
trainer_factory_mock,
handle_dir_mock,
write_run_options_mock,
write_timing_tree_mock,
trainer_config_mock = MagicMock()
load_config.return_value = trainer_config_mock
load_config.return_value = yaml.safe_load(MOCK_YAML)
learn.run_training(0, basic_options())
options = basic_options()
learn.run_training(0, options)
"./models/ppo",
"./summaries",
"results/ppo",
"ppo",
50000,
None,

None,
)
handle_dir_mock.assert_called_once_with(
"./models/ppo", "./summaries", False, False, None
)
handle_dir_mock.assert_called_once_with("results/ppo", False, False, None)
write_timing_tree_mock.assert_called_once_with("results/ppo/run_logs")
write_run_options_mock.assert_called_once_with("results/ppo", options)
StatsReporter.writers.clear() # make sure there aren't any writers as added by learn.py

seed=None,
start_port=8000,
env_args=None,
log_folder="results/log_folder",
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
assert opt.trainer_config == {}
assert opt.behaviors == {}
assert opt.curriculum_config is None
assert opt.sampler_config is None
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 5
assert opt.lesson == 0
assert opt.resume is False

full_args = [
"mytrainerpath",
"--env=./myenvfile",
"--curriculum=./mycurriculum",
"--sampler=./mysample",
"--keep-checkpoints=42",
"--lesson=3",
"--resume",
"--inference",
"--run-id=myawesomerun",
"--save-freq=123456",
"--seed=7890",
"--train",
"--base-port=4004",
"--num-envs=2",
"--no-graphics",
"--debug",
]
opt = parse_command_line(full_args)
assert opt.behaviors == {}
assert opt.env_path == "./myenvfile"
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"
assert opt.save_freq == 123456
assert opt.seed == 7890
assert opt.base_port == 4004
assert opt.num_envs == 2
assert opt.no_graphics is True
assert opt.debug is True
assert opt.inference is True
assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_PARAMETER_YAML)
def test_yaml_args(mock_file):
# Test with opts loaded from YAML
DetectDefault.non_default_args.clear()
opt = parse_command_line(["mytrainerpath"])
assert opt.behaviors == {}
assert opt.env_path == "./oldenvfile"
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 34
assert opt.lesson == 2
assert opt.run_id == "uselessrun"
assert opt.save_freq == 654321
assert opt.seed == 9870
assert opt.base_port == 4001
assert opt.num_envs == 4
assert opt.no_graphics is False
assert opt.debug is False
assert opt.env_args is None
# Test that CLI overrides YAML
full_args = [
"mytrainerpath",
"--env=./myenvfile",
"--keep-checkpoints=42",
"--lesson=3",
"--resume",

]
opt = parse_command_line(full_args)
assert opt.trainer_config == {}
assert opt.behaviors == {}
assert opt.curriculum_config == {}
assert opt.sampler_config == {}
assert opt.parameter_randomization is None
assert opt.keep_checkpoints == 42
assert opt.lesson == 3
assert opt.run_id == "myawesomerun"

assert opt.resume is True
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_SAMPLER_CURRICULUM_YAML)
def test_sampler_configs(mock_file):
opt = parse_command_line(["mytrainerpath"])
assert opt.parameter_randomization == "sampler1"
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)
def test_env_args(mock_file):
full_args = [
"mytrainerpath",

ml-agents/mlagents/trainers/tests/test_nn_policy.py (8 changes)


memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

path1 = os.path.join(tmp_path, "runid1")
path2 = os.path.join(tmp_path, "runid2")
trainer_params = dummy_config
trainer_params["model_path"] = path1
trainer_params["output_path"] = path1
policy = create_policy_mock(trainer_params)
policy.initialize_or_load()
policy.save_model(2000)

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
time_horizon = 6
trajectory = make_fake_trajectory(

ml-agents/mlagents/trainers/tests/test_policy.py (2 changes)


def basic_params():
return {"use_recurrent": False, "model_path": "my/path"}
return {"use_recurrent": False, "output_path": "my/path"}
class FakePolicy(TFPolicy):

ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)


memory_size: 10
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test
model_path: test
output_path: test
reward_signals:
extrinsic:
strength: 1.0

vector_action_descriptions=[],
vector_action_space_type=0,
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)

mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)


)
trainer_parameters = trainer_config
model_path = "testpath"
trainer_parameters["model_path"] = model_path
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)


def dummy_config():
return yaml.safe_load(
"""
summary_path: "test/"
output_path: "test/"
summary_freq: 1000
max_steps: 100
reward_signals:

ml-agents/mlagents/trainers/tests/test_sac.py (14 changes)


trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["model_path"] = model_path
trainer_parameters["output_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(

discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_params = dummy_config
trainer_params["summary_path"] = str(tmpdir)
trainer_params["model_path"] = str(tmpdir)
trainer_params["output_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)

mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000

brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
dummy_config["steps_per_update"] = 20
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)

dummy_config["sequence_length"] = 64
dummy_config["batch_size"] = 32
dummy_config["use_recurrent"] = True
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
with pytest.raises(UnityTrainerException):
_ = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")

ml-agents/mlagents/trainers/tests/test_simple_rl.py (6 changes)


env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
summaries_dir=dir,
model_path=dir,
output_path=dir,
keep_checkpoints=1,
train_model=True,
load_model=False,

tc = TrainerController(
trainer_factory=trainer_factory,
summaries_dir=dir,
model_path=dir,
output_path=dir,
run_id=run_id,
meta_curriculum=meta_curriculum,
train=True,

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (6 changes)


trainer_factory_mock.ghost_controller = GhostController()
return TrainerController(
trainer_factory=trainer_factory_mock,
model_path="test_model_path",
summaries_dir="test_summaries_dir",
output_path="test_model_path",
run_id="test_run_id",
save_freq=100,
meta_curriculum=None,

trainer_factory_mock.ghost_controller = GhostController()
TrainerController(
trainer_factory=trainer_factory_mock,
model_path="",
summaries_dir="",
output_path="",
run_id="1",
save_freq=1,
meta_curriculum=None,

ml-agents/mlagents/trainers/tests/test_trainer_util.py (101 changes)


from unittest.mock import patch
from mlagents.trainers import trainer_util
from mlagents.trainers.trainer_util import load_config, _load_config
from mlagents.trainers.trainer_util import (
load_config,
_load_config,
assemble_curriculum_config,
)
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException
from mlagents.trainers.brain import BrainParameters

def test_initialize_trainer_parameters_override_defaults(
BrainParametersMock, dummy_config_with_override
):
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "model_dir"
keep_checkpoints = 1
train_model = True
load_model = False

base_config = dummy_config_with_override
expected_config = base_config["default"]
expected_config["summary_path"] = f"{run_id}_testbrain"
expected_config["model_path"] = model_path + "/testbrain"
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
# Override value from specific brain config

with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

brain_params_mock = BrainParametersMock()
BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": BrainParametersMock()}
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "results_dir"
keep_checkpoints = 1
train_model = True
load_model = False

base_config = dummy_config
expected_config = base_config["default"]
expected_config["summary_path"] = f"{run_id}_testbrain"
expected_config["model_path"] = model_path + "/testbrain"
expected_config["output_path"] = output_path + "/testbrain"
expected_config["keep_checkpoints"] = keep_checkpoints
def mock_constructor(

with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

def test_initialize_invalid_trainer_raises_exception(
BrainParametersMock, dummy_bad_config
):
summaries_dir = "test_dir"
model_path = "model_dir"
output_path = "results_dir"
keep_checkpoints = 1
train_model = True
load_model = False

with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

with pytest.raises(TrainerConfigError):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

with pytest.raises(UnityTrainerException):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir=summaries_dir,
model_path=model_path,
output_path=output_path,
keep_checkpoints=keep_checkpoints,
train_model=train_model,
load_model=load_model,

trainer_factory = trainer_util.TrainerFactory(
trainer_config=no_default_config,
summaries_dir="test_dir",
model_path="model_dir",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,

trainer_factory = trainer_util.TrainerFactory(
trainer_config=bad_config,
summaries_dir="test_dir",
model_path="model_dir",
output_path="output_path",
keep_checkpoints=1,
train_model=True,
load_model=False,

_load_config(fp)
def test_assemble_curriculum_config():
file_contents = """
behavior1:
curriculum:
foo: 5
behavior2:
curriculum:
foo: 6
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {"behavior1": {"foo": 5}, "behavior2": {"foo": 6}}
# Check that nothing is returned if no curriculum.
file_contents = """
behavior1:
foo: 3
behavior2:
foo: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
# Check that method doesn't break if 1st level entity isn't a dict.
# Note: this is a malformed configuration.
file_contents = """
behavior1: 3
behavior2: 4
"""
trainer_config = _load_config(file_contents)
curriculum_config = assemble_curriculum_config(trainer_config)
assert curriculum_config == {}
model_path = os.path.join(tmp_path, "runid")
# Unused summary path
summary_path = os.path.join(tmp_path, "runid")
output_path = os.path.join(tmp_path, "runid")
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
trainer_util.handle_existing_directories(output_path, False, False)
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
trainer_util.handle_existing_directories(output_path, True, False)
os.mkdir(model_path)
os.mkdir(output_path)
trainer_util.handle_existing_directories(model_path, summary_path, False, False)
trainer_util.handle_existing_directories(output_path, False, False)
trainer_util.handle_existing_directories(model_path, summary_path, True, False)
trainer_util.handle_existing_directories(output_path, True, False)
trainer_util.handle_existing_directories(model_path, summary_path, False, True)
trainer_util.handle_existing_directories(output_path, False, True)
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)
trainer_util.handle_existing_directories(output_path, False, True, init_path)
trainer_util.handle_existing_directories(
model_path, summary_path, False, True, init_path
)
trainer_util.handle_existing_directories(output_path, False, True, init_path)

ml-agents/mlagents/trainers/trainer/trainer.py (3 changes)


self.brain_name = brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters["summary_path"]
self._stats_reporter = StatsReporter(self.summary_path)
self._stats_reporter = StatsReporter(brain_name)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[Policy]] = []

ml-agents/mlagents/trainers/trainer_controller.py (20 changes)


def __init__(
self,
trainer_factory: TrainerFactory,
model_path: str,
summaries_dir: str,
output_path: str,
run_id: str,
save_freq: int,
meta_curriculum: Optional[MetaCurriculum],

resampling_interval: Optional[int],
):
"""
:param model_path: Path to save the model.
:param output_path: Path to save the model.
:param summaries_dir: Folder to save training summaries.
:param run_id: The sub-directory name for model and summary statistics
:param save_freq: Frequency at which to save model

self.trainers: Dict[str, Trainer] = {}
self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set)
self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir
self.output_path = output_path
self.logger = get_logger(__name__)
self.run_id = run_id
self.save_freq = save_freq

self.trainers[brain_name].export_model(name_behavior_id)
@staticmethod
def _create_model_path(model_path):
def _create_output_path(output_path):
if not os.path.exists(model_path):
os.makedirs(model_path)
if not os.path.exists(output_path):
os.makedirs(output_path)
"The folder {} containing the "
f"The folder {output_path} containing the "
"permissions are set correctly.".format(model_path)
"permissions are set correctly."
)
@timed

@timed
def start_learning(self, env_manager: EnvManager) -> None:
self._create_model_path(self.model_path)
self._create_output_path(self.output_path)
tf.reset_default_graph()
global_step = 0
last_brain_behavior_ids: Set[str] = set()

ml-agents/mlagents/trainers/trainer_util.py (55 changes)


def __init__(
    self,
    trainer_config: Any,
-   summaries_dir: str,
-   model_path: str,
+   output_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,

    multi_gpu: bool = False,
):
    self.trainer_config = trainer_config
-   self.summaries_dir = summaries_dir
-   self.model_path = model_path
+   self.output_path = output_path
+   self.init_path = init_path
    self.keep_checkpoints = keep_checkpoints
    self.train_model = train_model

    return initialize_trainer(
        self.trainer_config,
        brain_name,
-       self.summaries_dir,
-       self.model_path,
+       self.output_path,
        self.keep_checkpoints,
        self.train_model,
        self.load_model,

def initialize_trainer(
    trainer_config: Any,
    brain_name: str,
-   summaries_dir: str,
-   model_path: str,
+   output_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_name: Name of the brain to be associated with trainer
-   :param summaries_dir: Directory to store trainer summary statistics
-   :param model_path: Path to save the model
+   :param output_path: Path to save the model and summary statistics
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize

    """
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
-           f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
-           "See config/trainer_config.yaml for an example."
+           f'Trainer config must have either a "default" section, or a section for the brain name {brain_name}. '
+           "See the config/ directory for examples."
-   trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
-   trainer_parameters["model_path"] = "{basedir}/{name}".format(
-       basedir=model_path, name=brain_name
-   )
+   trainer_parameters["output_path"] = os.path.join(output_path, brain_name)
-   trainer_parameters["init_path"] = "{basedir}/{name}".format(
-       basedir=init_path, name=brain_name
-   )
+   trainer_parameters["init_path"] = os.path.join(init_path, brain_name)
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name

    if init_path is not None:
-       trainer_parameters["init_path"] = "{basedir}/{name}".format(
-           basedir=init_path, name=brain_name
-       )
    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curricula:

    ) from e
def assemble_curriculum_config(trainer_config: Dict[str, Any]) -> Dict[str, Any]:
"""
Assembles a curriculum config Dict from a trainer config. The resulting
dictionary is a mapping of {brain_name: config}, where config is that brain's
curriculum configuration Dict.
:param trainer_config: Dict of trainer configurations (keys are brain_names).
:return: Dict of curriculum configurations. Returns empty dict if none are found.
"""
curriculum_config: Dict[str, Any] = {}
for behavior_name, behavior_config in trainer_config.items():
# Don't try to iterate non-Dicts. This probably means your config is malformed.
if isinstance(behavior_config, dict) and "curriculum" in behavior_config:
curriculum_config[behavior_name] = behavior_config["curriculum"]
return curriculum_config
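A worked example of what this function extracts (the behavior names are illustrative):

trainer_config = {
    "BigWallJump": {
        "trainer": "ppo",
        "curriculum": {"measure": "progress", "thresholds": [0.1, 0.3, 0.5]},
    },
    "SmallWallJump": {"trainer": "ppo"},  # no curriculum section
}
assert assemble_curriculum_config(trainer_config) == {
    "BigWallJump": {"measure": "progress", "thresholds": [0.1, 0.3, 0.5]}
}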
-   model_path: str, summary_path: str, resume: bool, force: bool, init_path: str = None
+   output_path: str, resume: bool, force: bool, init_path: str = None
) -> None:
    """
    Validates that if the run_id model exists, we do not overwrite it unless --force is specified.

    :param force: Whether or not the --force flag was passed.
    """
-   model_path_exists = os.path.isdir(model_path)
+   output_path_exists = os.path.isdir(output_path)
-   if model_path_exists:
+   if output_path_exists:
        if not resume and not force:
            raise UnityTrainerException(
                "Previous data from this run ID was found. "

17
ml-agents/tests/yamato/scripts/run_llapi.py


    file_name=env_name,
    side_channels=[engine_configuration_channel],
    no_graphics=True,
-   args=["-logFile", "-"],
+   additional_args=["-logFile", "-"],
)
try:

    """
    try:
        env1 = UnityEnvironment(
-           file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5006,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
-           file_name=env_name, base_port=5006, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5006,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
-           file_name=env_name, base_port=5007, no_graphics=True, args=["-logFile", "-"]
+           file_name=env_name,
+           base_port=5007,
+           no_graphics=True,
+           additional_args=["-logFile", "-"],
        )
        env2.reset()
    finally:

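The rename from args to additional_args applies wherever a UnityEnvironment is constructed; a minimal usage sketch (the executable name is hypothetical):

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(
    file_name="3DBall",  # hypothetical executable name
    base_port=5006,
    no_graphics=True,
    additional_args=["-logFile", "-"],  # forwarded to the Unity player
)
try:
    env.reset()
finally:
    env.close()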
4
ml-agents/tests/yamato/training_int_tests.py


print(
    f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
-nn_file_expected = f"./models/{run_id}/3DBall.nn"
+nn_file_expected = f"./results/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
    # Should never happen - make sure nothing leftover from an old test.
    print("Artifacts from previous build found!")

# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.
override_config_file(
-   "config/trainer_config.yaml",
+   "config/ppo/3DBall.yaml",
    "override.yaml",
    max_steps=100,
    batch_size=10,
3
ml-agents/tests/yamato/yamato_utils.py


"""
with open(src_path) as f:
configs = yaml.safe_load(f)
behavior_configs = configs["behaviors"]
for config in configs.values():
for config in behavior_configs.values():
config.update(**kwargs)
with open(dest_path, "w") as f:

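Completed, override_config_file plausibly reads as follows; the final yaml.dump call is an assumption about the unchanged tail of the function:

import yaml

def override_config_file(src_path, dest_path, **kwargs):
    # Configs now nest per-behavior settings under a top-level "behaviors"
    # key, so the overrides (e.g. max_steps=100) go into each behavior section.
    with open(src_path) as f:
        configs = yaml.safe_load(f)
    behavior_configs = configs["behaviors"]
    for config in behavior_configs.values():
        config.update(**kwargs)
    with open(dest_path, "w") as f:
        yaml.dump(configs, f)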
41
ml-agents/mlagents/trainers/cli_utils.py


from typing import Set
import argparse


class DetectDefault(argparse.Action):
    """
    Internal custom Action to help detect arguments that aren't default.
    """

    non_default_args: Set[str] = set()

    def __call__(self, arg_parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values)
        DetectDefault.non_default_args.add(self.dest)


class DetectDefaultStoreTrue(DetectDefault):
    """
    Internal class to help detect arguments that aren't default.
    Used for store_true arguments.
    """

    def __init__(self, nargs=0, **kwargs):
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, arg_parser, namespace, values, option_string=None):
        super().__call__(arg_parser, namespace, True, option_string)


class StoreConfigFile(argparse.Action):
    """
    Custom Action to store the config file location not as part of the CLI args.
    This is because we want to maintain an equivalence between the config file's
    contents and the args themselves.
    """

    trainer_config_path: str

    def __call__(self, arg_parser, namespace, values, option_string=None):
        delattr(namespace, self.dest)
        StoreConfigFile.trainer_config_path = values

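A minimal sketch of wiring these Actions into a parser (the option names below are illustrative, not the full ML-Agents CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("trainer_config_path", action=StoreConfigFile, nargs="?", default=None)
parser.add_argument("--keep-checkpoints", default=5, type=int, action=DetectDefault)
parser.add_argument("--force", default=False, action=DetectDefaultStoreTrue)

args = parser.parse_args(["config/ppo/3DBall.yaml", "--force"])
print(StoreConfigFile.trainer_config_path)   # config/ppo/3DBall.yaml
print(DetectDefault.non_default_args)        # {'force'}
print(hasattr(args, "trainer_config_path"))  # False - removed from the namespace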
29
config/imitation/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1e7
memory_size: 256
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
reward_signals:
gail:
strength: 1.0
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Crawler/Demos/ExpertCrawlerSta.demo
strength: 0.5
steps: 50000
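Each of these files is passed directly to the trainer CLI, for example (run from the repo root, assuming the referenced .demo file exists):

mlagents-learn config/imitation/CrawlerStatic.yaml --run-id=CrawlerStaticIL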

29
config/imitation/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 64
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 2.0e6
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
use_recurrent: false
reward_signals:
gail:
strength: 0.1
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/FoodCollector/Demos/ExpertFood.demo
strength: 1.0
steps: 0

28
config/imitation/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
gail:
strength: 0.1
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Hallway/Demos/ExpertHallway.demo

25
config/imitation/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.5e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
reward_signals:
gail:
strength: 1.0
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo

36
config/imitation/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
max_steps: 1.0e7
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256
gail:
strength: 0.01
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 0.5
steps: 150000

25
config/ppo/3DBall.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: ppo
batch_size: 1200
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

40
config/ppo/3DBall_randomize.yaml


behaviors:
3DBall:
trainer: ppo
batch_size: 64
beta: 0.001
buffer_size: 12000
epsilon: 0.2
hidden_units: 128
lambd: 0.99
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
parameter_randomization:
resampling-interval: 500
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
gravity:
sampler-type: "uniform"
min_value: 7
max_value: 12
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
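With these settings, a new value for each parameter is drawn every resampling-interval steps; a hedged sketch of what the uniform samplers produce:

import random

def resample_parameters():
    # Ranges mirror the YAML above; real sampling is driven by the trainer.
    return {
        "mass": random.uniform(0.5, 10),
        "gravity": random.uniform(7, 12),
        "scale": random.uniform(0.75, 3),
    }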

25
config/ppo/Basic.yaml


behaviors:
Basic:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 20
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5.0e5
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 3
sequence_length: 64
summary_freq: 2000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/ppo/Bouncer.yaml


behaviors:
Bouncer:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 64
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 4.0e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/GridWorld.yaml


behaviors:
GridWorld:
trainer: ppo
batch_size: 32
beta: 0.005
buffer_size: 256
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 500000
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/ppo/Hallway.yaml


behaviors:
Hallway:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/PushBlock.yaml


behaviors:
PushBlock:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

29
config/ppo/Pyramids.yaml


behaviors:
Pyramids:
trainer: ppo
batch_size: 128
beta: 0.01
buffer_size: 2048
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256

25
config/ppo/Reacher.yaml


behaviors:
Reacher:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

38
config/ppo/SoccerTwos.yaml


behaviors:
SoccerTwos:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 200000
curriculum:
measure: progress
thresholds: [0.05, 0.1]
min_lesson_length: 100
signal_smoothing: true
parameters:
ball_touch: [1.0, 0.5, 0.0]
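Reading the curriculum block: with measure: progress and thresholds [0.05, 0.1], ball_touch starts at 1.0, drops to 0.5 once roughly 5% of max_steps has elapsed, and reaches 0.0 after 10%. A simplified sketch of the lesson lookup (ignoring min_lesson_length and signal smoothing):

def current_value(progress, thresholds, values):
    # Lesson i stays active until training progress passes thresholds[i].
    lesson = sum(1 for t in thresholds if progress > t)
    return values[lesson]

assert current_value(0.03, [0.05, 0.1], [1.0, 0.5, 0.0]) == 1.0
assert current_value(0.20, [0.05, 0.1], [1.0, 0.5, 0.0]) == 0.0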

62
config/ppo/StrikersVsGoalie.yaml


behaviors:
Goalie:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 25000
team_change: 200000
Striker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 100000
team_change: 200000

31
config/ppo/Tennis.yaml


behaviors:
Tennis:
trainer: ppo
batch_size: 1024
beta: 0.005
buffer_size: 10240
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 100000

25
config/ppo/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 10000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/ppo/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 1024
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.0e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

29
config/ppo/VisualPyramids.yaml


behaviors:
VisualPyramids:
trainer: ppo
batch_size: 64
beta: 0.01
buffer_size: 2024
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 1.0e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 128
sequence_length: 64
summary_freq: 10000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curiosity:
strength: 0.01
gamma: 0.99
encoding_size: 256

25
config/ppo/Walker.yaml


behaviors:
Walker:
trainer: ppo
batch_size: 2048
beta: 0.005
buffer_size: 20480
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

50
config/ppo/WallJump.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

65
config/ppo/WallJump_curriculum.yaml


behaviors:
BigWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 2e7
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
trainer: ppo
batch_size: 128
beta: 0.005
buffer_size: 2048
epsilon: 0.2
hidden_units: 256
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 5e6
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2
time_horizon: 128
sequence_length: 64
summary_freq: 20000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]

25
config/ppo/WormDynamic.yaml


behaviors:
WormDynamic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/ppo/WormStatic.yaml


behaviors:
WormStatic:
trainer: ppo
batch_size: 2024
beta: 0.005
buffer_size: 20240
epsilon: 0.2
hidden_units: 512
lambd: 0.95
learning_rate: 0.0003
learning_rate_schedule: linear
max_steps: 3.5e6
memory_size: 128
normalize: true
num_epoch: 3
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/3DBall.yaml


behaviors:
3DBall:
trainer: sac
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/3DBallHard.yaml


behaviors:
3DBallHard:
trainer: sac
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 12000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/Basic.yaml


behaviors:
Basic:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 20
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e5
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 10
sequence_length: 64
summary_freq: 2000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/Bouncer.yaml


behaviors:
Bouncer:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 64
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e6
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/CrawlerDynamic.yaml


behaviors:
CrawlerDynamic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/CrawlerStatic.yaml


behaviors:
CrawlerStatic:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 2000
hidden_units: 512
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3e6
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 3
time_horizon: 1000
sequence_length: 64
summary_freq: 30000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.995

25
config/sac/FoodCollector.yaml


behaviors:
FoodCollector:
trainer: sac
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/GridWorld.yaml


behaviors:
GridWorld:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 1000
hidden_units: 128
init_entcoef: 0.5
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 500000
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 5
sequence_length: 64
summary_freq: 20000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.9

25
config/sac/Hallway.yaml


behaviors:
Hallway:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 0.1
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 5.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

25
config/sac/PushBlock.yaml


behaviors:
PushBlock:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 0.05
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 100000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

31
config/sac/Pyramids.yaml


behaviors:
Pyramids:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 10000
hidden_units: 256
init_entcoef: 0.01
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 2
time_horizon: 128
sequence_length: 16
summary_freq: 30000
tau: 0.01
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 2.0
gamma: 0.99
gail:
strength: 0.02
gamma: 0.99
encoding_size: 128
use_actions: true
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo

25
config/sac/Reacher.yaml


behaviors:
Reacher:
trainer: sac
batch_size: 128
buffer_size: 500000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 20
num_layers: 2
time_horizon: 1000
sequence_length: 64
summary_freq: 60000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

30
config/sac/Tennis.yaml


behaviors:
Tennis:
trainer: sac
batch_size: 128
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 256
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 2e7
memory_size: 128
normalize: true
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 10000
tau: 0.005
use_recurrent: false
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
window: 10
play_against_current_self_ratio: 0.5
save_steps: 50000
swap_steps: 50000

26
config/sac/VisualHallway.yaml


behaviors:
VisualHallway:
trainer: sac
batch_size: 64
buffer_size: 50000
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 1.0e7
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 10000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

26
config/sac/VisualPushBlock.yaml


behaviors:
VisualPushBlock:
trainer: sac
batch_size: 64
buffer_size: 1024
buffer_init_steps: 0
hidden_units: 128
init_entcoef: 1.0
learning_rate: 0.0003
learning_rate_schedule: constant
max_steps: 3.0e6
memory_size: 128
normalize: false
steps_per_update: 10
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 60000
tau: 0.005
use_recurrent: true
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99

Some files were not shown because too many files changed in this diff.
