
Merge branch 'master' into merge-release11-master

/release_11_branch
Andrew Cohen 4 years ago
Current commit
c0d01baf
74 files changed, with 472 insertions and 3,280 deletions
  1. .github/workflows/pytest.yml (17)
  2. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs (2)
  3. Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity (18)
  4. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs (3)
  5. com.unity.ml-agents/CHANGELOG.md (17)
  6. com.unity.ml-agents/Editor/DemonstrationDrawer.cs (1)
  7. com.unity.ml-agents/Runtime/Agent.cs (12)
  8. com.unity.ml-agents/Runtime/Agent.deprecated.cs (2)
  9. com.unity.ml-agents/Runtime/Analytics/InferenceAnalytics.cs (5)
  10. com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (4)
  11. docs/ML-Agents-Overview.md (4)
  12. docs/Training-Configuration-File.md (2)
  13. docs/Training-ML-Agents.md (3)
  14. docs/Unity-Inference-Engine.md (17)
  15. gym-unity/gym_unity/__init__.py (4)
  16. ml-agents-envs/mlagents_envs/__init__.py (4)
  17. ml-agents-envs/setup.py (2)
  18. ml-agents/mlagents/trainers/__init__.py (4)
  19. ml-agents/mlagents/trainers/cli_utils.py (25)
  20. ml-agents/mlagents/trainers/demo_loader.py (9)
  21. ml-agents/mlagents/trainers/learn.py (7)
  22. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (31)
  23. ml-agents/mlagents/trainers/policy/policy.py (3)
  24. ml-agents/mlagents/trainers/policy/torch_policy.py (41)
  25. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (20)
  26. ml-agents/mlagents/trainers/ppo/trainer.py (77)
  27. ml-agents/mlagents/trainers/sac/optimizer_torch.py (51)
  28. ml-agents/mlagents/trainers/sac/trainer.py (120)
  29. ml-agents/mlagents/trainers/settings.py (10)
  30. ml-agents/mlagents/trainers/stats.py (2)
  31. ml-agents/mlagents/trainers/tests/__init__.py (2)
  32. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (17)
  33. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (19)
  34. ml-agents/mlagents/trainers/tests/test_training_status.py (4)
  35. ml-agents/mlagents/trainers/tests/test_trajectory.py (31)
  36. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (11)
  37. ml-agents/mlagents/trainers/tests/torch/test_encoders.py (2)
  38. ml-agents/mlagents/trainers/tests/torch/test_ghost.py (6)
  39. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (6)
  40. ml-agents/mlagents/trainers/tests/torch/test_networks.py (23)
  41. ml-agents/mlagents/trainers/tests/torch/test_policy.py (24)
  42. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (9)
  43. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (20)
  44. ml-agents/mlagents/trainers/tests/torch/test_sac.py (5)
  45. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (5)
  46. ml-agents/mlagents/trainers/tests/torch/test_utils.py (18)
  47. ml-agents/mlagents/trainers/torch/components/bc/module.py (26)
  48. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (39)
  49. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (72)
  50. ml-agents/mlagents/trainers/torch/components/reward_providers/rnd_reward_provider.py (21)
  51. ml-agents/mlagents/trainers/torch/encoders.py (9)
  52. ml-agents/mlagents/trainers/torch/networks.py (144)
  53. ml-agents/mlagents/trainers/torch/utils.py (70)
  54. ml-agents/mlagents/trainers/trainer/rl_trainer.py (56)
  55. ml-agents/mlagents/trainers/trainer/trainer_factory.py (24)
  56. ml-agents/mlagents/trainers/trainer_controller.py (14)
  57. ml-agents/mlagents/trainers/training_status.py (6)
  58. ml-agents/mlagents/trainers/trajectory.py (96)
  59. ml-agents/setup.py (1)
  60. ml-agents/tests/yamato/yamato_utils.py (11)
  61. test_requirements.txt (4)
  62. ml-agents/mlagents/trainers/model_saver/tf_model_saver.py (175)
  63. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (168)
  64. ml-agents/mlagents/trainers/policy/tf_policy.py (630)
  65. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (361)
  66. ml-agents/mlagents/trainers/sac/network.py (444)
  67. ml-agents/mlagents/trainers/sac/optimizer_tf.py (641)
  68. test_constraints_min_version.txt (8)
  69. test_constraints_max_tf2_version.txt (6)
  70. test_constraints_max_tf1_version.txt (7)
  71. /ml-agents/mlagents/torch_utils/globals.py (0)

17
.github/workflows/pytest.yml


strategy:
matrix:
python-version: [3.6.x, 3.7.x, 3.8.x]
- include:
- - python-version: 3.6.x
- pip_constraints: test_constraints_min_version.txt
- - python-version: 3.7.x
- pip_constraints: test_constraints_max_tf1_version.txt
- - python-version: 3.8.x
- pip_constraints: test_constraints_max_tf2_version.txt
steps:
- uses: actions/checkout@v2
- name: Set up Python

# This path is specific to Ubuntu
path: ~/.cache/pip
# Look to see if there is a cache hit for the corresponding requirements file
- key: ${{ runner.os }}-pip-${{ hashFiles('ml-agents/setup.py', 'ml-agents-envs/setup.py', 'gym-unity/setup.py', 'test_requirements.txt', matrix.pip_constraints) }}
+ key: ${{ runner.os }}-pip-${{ hashFiles('ml-agents/setup.py', 'ml-agents-envs/setup.py', 'gym-unity/setup.py', 'test_requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-

# pin pip to workaround https://github.com/pypa/pip/issues/9180
python -m pip install pip==20.2
python -m pip install --upgrade setuptools
- python -m pip install --progress-bar=off -e ./ml-agents-envs -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -e ./ml-agents -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -r test_requirements.txt -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -e ./gym-unity -c ${{ matrix.pip_constraints }}
+ python -m pip install --progress-bar=off -e ./ml-agents-envs
+ python -m pip install --progress-bar=off -e ./ml-agents
+ python -m pip install --progress-bar=off -r test_requirements.txt
+ python -m pip install --progress-bar=off -e ./gym-unity
- name: Save python dependencies
run: |
pip freeze > pip_versions-${{ matrix.python-version }}.txt

2
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


dirToGo += transform.right * right;
rotateDir = -transform.up * rotate;
- var shootCommand = discreteActions[0] > 0;
+ var shootCommand = (int)discreteActions[0] > 0;
if (shootCommand)
{
m_Shoot = true;

18
Project/Assets/ML-Agents/Examples/GridWorld/Scenes/GridWorld.unity


m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
- m_IndirectSpecularColor: {r: 0.44971168, g: 0.4997775, b: 0.57563686, a: 1}
+ m_IndirectSpecularColor: {r: 0.44971228, g: 0.49977815, b: 0.57563734, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:

agentParameters:
maxStep: 100
hasUpgradedFromAgentParameters: 1
- maxStep: 100
+ MaxStep: 100
area: {fileID: 1795599557}
timeBetweenDecisionsAtInference: 0.15
renderCamera: {fileID: 797520692}

m_Name:
m_EditorClassIdentifier:
m_BrainParameters:
- vectorObservationSize: 0
- numStackedVectorObservations: 1
- vectorActionSize: 05000000
- vectorActionDescriptions: []
- vectorActionSpaceType: 0
+ VectorObservationSize: 0
+ NumStackedVectorObservations: 1
+ VectorActionSize: 05000000
+ VectorActionDescriptions: []
+ VectorActionSpaceType: 0
m_Model: {fileID: 11400000, guid: a812f1ce7763a4a0c912717f3594fe20, type: 3}
m_InferenceDevice: 0
m_BehaviorType: 0

m_UseChildActuators: 1
m_ObservableAttributeHandling: 0
--- !u!114 &125487791
MonoBehaviour:
m_ObjectHideFlags: 0

m_RenderTexture: {fileID: 8400000, guid: 114608d5384404f89bff4b6f88432958, type: 2}
m_SensorName: RenderTextureSensor
m_Grayscale: 0
m_ObservationStacks: 1
m_Compression: 1
--- !u!1 &260425459
GameObject:

trueAgent: {fileID: 125487785}
goalPref: {fileID: 1508142483324970, guid: 1ec4e4e96e7514d45b7ebc3ba5a9a481, type: 3}
pitPref: {fileID: 1811317785436014, guid: d13ee2db77b3a4dcc8664d2fe2a0f219, type: 3}
numberOfObstacles: 1
--- !u!4 &1795599558
Transform:
m_ObjectHideFlags: 0

3
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridArea.cs


public GameObject goalPref;
public GameObject pitPref;
GameObject[] m_Objects;
+ public int numberOfObstacles = 1;
GameObject m_Plane;
GameObject m_Sn;

transform.position = m_InitialPosition * (m_ResetParams.GetWithDefault("gridSize", 5f) + 1);
var playersList = new List<int>();
- for (var i = 0; i < (int)m_ResetParams.GetWithDefault("numObstacles", 1); i++)
+ for (var i = 0; i < (int)m_ResetParams.GetWithDefault("numObstacles", numberOfObstacles); i++)
{
playersList.Add(1);
}

17
com.unity.ml-agents/CHANGELOG.md


### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
+ - TensorFlow trainers have been removed, please use the Torch trainers instead. (#4707)
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
### Bug Fixes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
## [1.7.0-preview] - 2020-12-21
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
The `.onnx` models generated by the trainers of this release are incompatible with versions of Barracuda before `1.2.1-preview`. If you upgrade the trainers, you must upgrade the version of the Barracuda package as well (which can be done by upgrading the `com.unity.ml-agents` package).
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
- Agents with both continuous and discrete actions are now supported. You can specify

1
com.unity.ml-agents/Editor/DemonstrationDrawer.cs


using System.Text;
using UnityEditor;
using Unity.MLAgents.Demonstrations;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Editor

12
com.unity.ml-agents/Runtime/Agent.cs


/// <seealso cref="IActionReceiver.OnActionReceived"/>
public virtual void Heuristic(in ActionBuffers actionsOut)
{
var brainParams = m_PolicyFactory.BrainParameters;
var actionSpec = brainParams.ActionSpec;
// For continuous and discrete actions together, we don't need to fall back to the legacy method
if (actionSpec.NumContinuousActions > 0 && actionSpec.NumDiscreteActions > 0)
{
Debug.LogWarning("Heuristic method called but not implemented. Clearing ActionBuffers.");
actionsOut.Clear();
return;
}
switch (brainParams.VectorActionSpaceType)
switch (m_PolicyFactory.BrainParameters.VectorActionSpaceType)
{
case SpaceType.Continuous:
Heuristic(actionsOut.ContinuousActions.Array);

2
com.unity.ml-agents/Runtime/Agent.deprecated.cs


/// <returns>
/// The last action that was decided by the Agent (or null if no decision has been made).
/// </returns>
- /// <seealso cref="OnActionReceived(Actuators.ActionBuffers)"/>
+ /// <seealso cref="OnActionReceived(ActionBuffers)"/>
[Obsolete("GetAction has been deprecated, please use GetStoredActionBuffers instead.")]
public float[] GetAction()
{

5
com.unity.ml-agents/Runtime/Analytics/InferenceAnalytics.cs


using System;
using System.Collections.Generic;
using Unity.Barracuda;
using Unity.MLAgents.Actuators;

/// <summary>
/// Whether or not we've registered this particular event yet
/// </summary>
- static bool s_EventRegistered;
+ static bool s_EventRegistered = false;
/// <summary>
/// Hourly limit for this event name

/// <param name="behaviorName">The BehaviorName of the Agent using the model</param>
/// <param name="inferenceDevice">Whether inference is being performed on the CPU or GPU</param>
/// <param name="sensors">List of ISensors for the Agent. Used to generate information about the observation space.</param>
- /// <param name="actionSpec">ActionSpec for the Agent. Used to generate information about the actions.</param>
+ /// <param name="actionSpec">ActionSpec for the Agent. Used to generate information about the action space.</param>
/// <returns></returns>
public static void InferenceModelSet(
NNModel nnModel,

4
com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs


ActionSpec actionSpec;
if (bpp.ActionSpec == null)
{
- var spaceType = bpp.VectorActionSpaceTypeDeprecated;
- if (spaceType == SpaceTypeProto.Continuous)
+ var spaceType = (SpaceType)bpp.VectorActionSpaceTypeDeprecated;
+ if (spaceType == SpaceType.Continuous)
{
actionSpec = ActionSpec.MakeContinuous(bpp.VectorActionSizeDeprecated.ToArray()[0]);
}

4
docs/ML-Agents-Overview.md


below).
- `rnd`: represents an intrinsic reward signal that encourages exploration
in sparse-reward environments that is defined by the Curiosity module (see
- below). (Not available for TensorFlow trainers)
+ below).
### Deep Reinforcement Learning

of the trained model is used as intrinsic reward. The more an Agent visits a state, the
more accurate the predictions and the lower the rewards which encourages the Agent to
explore new states with higher prediction errors.
- __Note:__ RND is not available for TensorFlow trainers (only PyTorch trainers)
### Imitation Learning

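The `rnd` bullet above describes Random Network Distillation: a predictor network is trained to match a fixed, randomly initialized network, and the prediction error is paid out as intrinsic reward. A schematic PyTorch sketch of that idea (illustrative only, not the repository's `rnd_reward_provider.py`; layer sizes are arbitrary):

```python
import torch


class RNDSketch(torch.nn.Module):
    """Random Network Distillation, schematically: reward = predictor error vs. a frozen random target."""

    def __init__(self, obs_size: int, embed_size: int = 64):
        super().__init__()
        self.target = torch.nn.Sequential(
            torch.nn.Linear(obs_size, 128), torch.nn.ReLU(), torch.nn.Linear(128, embed_size)
        )
        self.predictor = torch.nn.Sequential(
            torch.nn.Linear(obs_size, 128), torch.nn.ReLU(), torch.nn.Linear(128, embed_size)
        )
        for p in self.target.parameters():
            p.requires_grad_(False)  # the target network is never trained

    def intrinsic_reward(self, obs: torch.Tensor) -> torch.Tensor:
        # Rarely visited observations are hard to predict, so they earn a large reward;
        # frequently visited ones become predictable and the reward decays toward zero.
        return (self.predictor(obs) - self.target(obs)).pow(2).mean(dim=-1)

    def update_loss(self, obs: torch.Tensor) -> torch.Tensor:
        # Minimizing this on the observations the agent actually visits is what
        # makes familiar states cheap and novel states rewarding.
        return self.intrinsic_reward(obs).mean()
```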
2
docs/Training-Configuration-File.md


| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions. <br><br> Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count. <br><br>Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the checkpoint_interval option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
- | `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` (and `.nn` if using TensorFlow) files in `results/` folder.|
+ | `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` files in `results/` folder.|
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents. <br><br>You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase. <br><br>Typical range: `1e-5` - `1e-3` |

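The defaults quoted in this table slot into an ordinary trainer configuration file. A small sketch that round-trips such a file with PyYAML (the `behaviors:` nesting follows the usual ML-Agents config layout; the behavior name is made up):

```python
import yaml  # PyYAML, already a dependency of the Python packages in this repo

CONFIG = """
behaviors:
  MyBehavior:              # hypothetical behavior name
    trainer_type: ppo
    time_horizon: 64
    max_steps: 500000
    keep_checkpoints: 5
    checkpoint_interval: 500000
    threaded: true
    hyperparameters:
      learning_rate: 3.0e-4
"""

settings = yaml.safe_load(CONFIG)["behaviors"]["MyBehavior"]
# Prints the documented defaults: 64 and 0.0003
print(settings["time_horizon"], settings["hyperparameters"]["learning_rate"])
```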
3
docs/Training-ML-Agents.md


save_steps: 50000
swap_steps: 2000
team_change: 100000
- # use TensorFlow backend
- framework: tensorflow
```
Here is an equivalent file if we use an SAC trainer instead. Notice that the

17
docs/Unity-Inference-Engine.md


might be non-fatal build time errors when target platform includes Graphics API
that does not support **Unity Compute Shaders**.
## Supported formats
There are currently two supported model formats:
- - Barracuda (`.nn`) files use a proprietary format produced by the
- [`tensorflow_to_barracuda.py`]() script.
- ONNX (`.onnx`) files use an
[industry-standard open format](https://onnx.ai/about.html) produced by the
[tf2onnx package](https://github.com/onnx/tensorflow-onnx).
- Export to ONNX is used if using PyTorch (the default). To enable it
- while using TensorFlow, make sure `tf2onnx>=1.6.1` is installed in pip.
## Using the Unity Inference Engine
When using a model, drag the model file into the **Model** field in the

Barracuda directly, instead of trying to run it through ML-Agents.
## Model inference outside of Unity
- We do not provide support for inference anywhere outside of Unity. The
- `frozen_graph_def.pb` and `.onnx` files produced by training are open formats
- for TensorFlow and ONNX respectively; if you wish to convert these to another
+ We do not provide support for inference anywhere outside of Unity. The `.onnx` files produced by training use the open format ONNX; if you wish to convert a `.onnx` file to another
format or run inference with them, refer to their documentation.

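Since `.onnx` is an open format, an exported model can at least be inspected with third-party tooling even though inference outside Unity is unsupported. A hedged sketch using onnxruntime (the model path is hypothetical; input and output names depend on the exported behavior):

```python
import onnxruntime as ort  # third-party runtime, not part of ML-Agents

# Hypothetical path to a model exported by mlagents-learn.
session = ort.InferenceSession("results/run-id/MyBehavior.onnx")

# List the graph's inputs and outputs instead of assuming their names;
# they vary with the sensors and action spec of the trained behavior.
for inp in session.get_inputs():
    print("input: ", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)
```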
4
gym-unity/gym_unity/__init__.py


# Version of the library that will be used to upload to pypi
- __version__ = "0.23.0"
+ __version__ = "0.24.0.dev0"
- __release_tag__ = "release_11"
+ __release_tag__ = None

4
ml-agents-envs/mlagents_envs/__init__.py


# Version of the library that will be used to upload to pypi
- __version__ = "0.23.0"
+ __version__ = "0.24.0.dev0"
- __release_tag__ = "release_11"
+ __release_tag__ = None

2
ml-agents-envs/setup.py


install_requires=[
"cloudpickle",
"grpcio>=1.11.0",
- "numpy>=1.14.1,<1.19.0",
+ "numpy>=1.14.1",
"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",

4
ml-agents/mlagents/trainers/__init__.py


# Version of the library that will be used to upload to pypi
- __version__ = "0.23.0"
+ __version__ = "0.24.0.dev0"
- __release_tag__ = "release_11"
+ __release_tag__ = None

25
ml-agents/mlagents/trainers/cli_utils.py


from mlagents.trainers.exception import TrainerConfigError
from mlagents_envs.environment import UnityEnvironment
import argparse
+ from mlagents_envs import logging_util
+ logger = logging_util.get_logger(__name__)
+ class RaiseRemovedWarning(argparse.Action):
+ """
+ Internal custom Action to raise warning when argument is called.
+ """
+ def __init__(self, nargs=0, **kwargs):
+ super().__init__(nargs=nargs, **kwargs)
+ def __call__(self, arg_parser, namespace, values, option_string=None):
+ logger.warning(f"The command line argument {option_string} was removed.")
class DetectDefault(argparse.Action):

argparser.add_argument(
"--torch",
default=False,
- action=DetectDefaultStoreTrue,
- help="Use the PyTorch framework. Note that this option is not required anymore as PyTorch is the"
- "default framework, and will be removed in the next release.",
+ action=RaiseRemovedWarning,
+ help="(Removed) Use the PyTorch framework.",
- action=DetectDefaultStoreTrue,
- help="(Deprecated) Use the TensorFlow framework instead of PyTorch. Install TensorFlow "
- "before using this option.",
+ action=RaiseRemovedWarning,
+ help="(Removed) Use the TensorFlow framework.",
)
eng_conf = argparser.add_argument_group(title="Engine Configuration")

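The `RaiseRemovedWarning` action added above keeps `--torch` and `--tensorflow` parseable while making them inert. A self-contained sketch of the same pattern outside of mlagents (names here are illustrative, not the actual CLI wiring):

```python
import argparse
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


class RemovedWarning(argparse.Action):
    """Accept a removed flag, store nothing, and warn that it no longer does anything."""

    def __init__(self, nargs=0, **kwargs):
        super().__init__(nargs=nargs, **kwargs)

    def __call__(self, arg_parser, namespace, values, option_string=None):
        logger.warning(f"The command line argument {option_string} was removed.")


parser = argparse.ArgumentParser()
parser.add_argument("--torch", action=RemovedWarning, help="(Removed) Use the PyTorch framework.")

# Parsing "--torch" only emits a warning; nothing beyond the default (None) is stored.
args = parser.parse_args(["--torch"])
print(vars(args))  # {'torch': None}
```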
9
ml-agents/mlagents/trainers/demo_loader.py


from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
- from mlagents.trainers.trajectory import SplitObservations
+ from mlagents.trainers.trajectory import ObsUtil
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
- split_obs = SplitObservations.from_observations(current_obs)
- for i, obs in enumerate(split_obs.visual_observations):
- demo_raw_buffer["visual_obs%d" % i].append(obs)
- demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
+ for i, obs in enumerate(current_obs):
+ demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
if (
len(current_pair_info.action_info.continuous_actions) == 0
and len(current_pair_info.action_info.discrete_actions) == 0

demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.discrete_actions
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(

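The demo loader now writes each observation to an index-based buffer key via `ObsUtil.get_name_at` instead of the old `vector_obs` / `visual_obs%d` split. Judging from the `obs_0` / `next_obs_0` keys in the updated `test_trajectory.py` further down, the convention is per-sensor indexing; a rough stand-alone sketch of that scheme (assumed naming, not the real `ObsUtil` implementation):

```python
from typing import Dict, List

import numpy as np


def obs_key(index: int) -> str:
    # Assumed convention: one key per sensor position ("obs_0", "obs_1", ...),
    # with "next_obs_<i>" as the analogous key for next-step observations.
    return f"obs_{index}"


def store_observations(buffer: Dict[str, list], current_obs: List[np.ndarray]) -> None:
    # Keying by position in the observation list lets the trainers handle any
    # mix of vector and visual sensors without special-casing either kind.
    for i, obs in enumerate(current_obs):
        buffer.setdefault(obs_key(i), []).append(obs)


demo_buffer: Dict[str, list] = {}
store_observations(demo_buffer, [np.zeros(8, np.float32), np.zeros((84, 84, 3), np.float32)])
print(sorted(demo_buffer))  # ['obs_0', 'obs_1']
```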
7
ml-agents/mlagents/trainers/learn.py


import mlagents.trainers
import mlagents_envs
- from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.trainer import TrainerFactory

GaugeWriter,
ConsoleWriter,
)
- from mlagents.trainers.cli_utils import parser, DetectDefault
+ from mlagents.trainers.cli_utils import parser
from mlagents_envs.environment import UnityEnvironment
from mlagents.trainers.settings import RunOptions

param_manager=env_parameter_manager,
init_path=maybe_init_path,
multi_gpu=False,
- force_torch="torch" in DetectDefault.non_default_args,
- force_tensorflow="tensorflow" in DetectDefault.non_default_args,
)
# Create controller and begin training.
tc = TrainerController(

log_level = logging_util.DEBUG
else:
log_level = logging_util.INFO
- # disable noisy warnings from tensorflow
- tf_utils.set_warnings_enabled(False)
logging_util.set_log_level(log_level)

31
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
- from mlagents.trainers.trajectory import SplitObservations
+ from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
- vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
- if self.policy.use_vis_obs:
- visual_obs = []
- for idx, _ in enumerate(
- self.policy.actor_critic.network_body.visual_processors
- ):
- visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
- visual_obs.append(visual_ob)
- else:
- visual_obs = []
+ n_obs = len(self.policy.behavior_spec.observation_shapes)
+ current_obs = ObsUtil.from_buffer(batch, n_obs)
+ # Convert to tensors
+ current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
+ next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
- vec_vis_obs = SplitObservations.from_observations(next_obs)
- next_vec_obs = [
- ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
- ]
- next_vis_obs = [
- ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
- for _vis_ob in vec_vis_obs.visual_observations
- ]
+ next_obs = [obs.unsqueeze(0) for obs in next_obs]
- vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
+ current_obs, memory, sequence_length=batch.num_experiences
- next_vec_obs, next_vis_obs, next_memory, sequence_length=1
+ next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():

3
ml-agents/mlagents/trainers/policy/policy.py


from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
+ from mlagents.trainers.buffer import AgentBuffer
class UnityPolicyException(UnityException):

raise RuntimeError("Continuous NaN action detected.")
@abstractmethod
- def update_normalization(self, vector_obs: np.ndarray) -> None:
+ def update_normalization(self, buffer: AgentBuffer) -> None:
pass
@abstractmethod

41
ml-agents/mlagents/trainers/policy/torch_policy.py


from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
- from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,

from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

"""
return self._export_m_size
- def _split_decision_step(
- self, decision_requests: DecisionSteps
- ) -> Tuple[SplitObservations, np.ndarray]:
- vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
+ def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray:
mask = None
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

)
- return vec_vis_obs, mask
+ return mask
- def update_normalization(self, vector_obs: np.ndarray) -> None:
+ def update_normalization(self, buffer: AgentBuffer) -> None:
- :param vector_obs: The vector observations to add to the running estimate of the distribution.
+ :param buffer: The buffer with the observations to add to the running estimate
+ of the distribution.
- vector_obs = [torch.as_tensor(vector_obs)]
- self.actor_critic.update_normalization(vector_obs)
+ self.actor_critic.update_normalization(buffer)
- vec_obs: List[torch.Tensor],
- vis_obs: List[torch.Tensor],
+ obs: List[torch.Tensor],
- :param vec_obs: List of vector observations.
- :param vis_obs: List of visual observations.
+ :param obs: List of observations.
:param masks: Loss masks for RNN, else None.
:param memories: Input memories when using RNN, else None.
:param seq_len: Sequence length when using RNN.

- vec_obs, vis_obs, masks, memories, seq_len
+ obs, masks, memories, seq_len
- vec_obs: List[torch.Tensor],
- vis_obs: List[torch.Tensor],
+ obs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

- vec_obs, vis_obs, actions, masks, memories, seq_len
+ obs, actions, masks, memories, seq_len
)
return log_probs, entropies, value_heads

:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
- vec_vis_obs, masks = self._split_decision_step(decision_requests)
- vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
- vis_obs = [
- torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
- ]
+ obs = decision_requests.obs
+ masks = self._extract_masks(decision_requests)
+ tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs]
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)

action, log_probs, entropy, memories = self.sample_actions(
- vec_obs, vis_obs, masks=masks, memories=memories
+ tensor_obs, masks=masks, memories=memories
)
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple

20
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
- vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
+ n_obs = len(self.policy.behavior_spec.observation_shapes)
+ current_obs = ObsUtil.from_buffer(batch, n_obs)
+ # Convert to tensors
+ current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
- if self.policy.use_vis_obs:
- vis_obs = []
- for idx, _ in enumerate(
- self.policy.actor_critic.network_body.visual_processors
- ):
- vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
- vis_obs.append(vis_ob)
- else:
- vis_obs = []
- vec_obs,
- vis_obs,
+ current_obs,
masks=act_masks,
actions=actions,
memories=memories,

77
ml-agents/mlagents/trainers/ppo/trainer.py


from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
- from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
- from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
- BaseRewardProvider,
- )
- from mlagents import tf_utils
- if tf_utils.is_available():
- from mlagents.trainers.policy.tf_policy import TFPolicy
- from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
- else:
- TFPolicy = None # type: ignore
- PPOOptimizer = None # type: ignore
+ from mlagents.trainers.settings import TrainerSettings, PPOSettings
logger = get_logger(__name__)

agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
- self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
+ self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
- if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
- self._stats_reporter.add_stat(
- f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
- np.mean(v),
- )
- else:
- self._stats_reporter.add_stat(
- self.optimizer.reward_signals[name].value_name, np.mean(v)
- )
+ self._stats_reporter.add_stat(
+ f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
+ np.mean(v),
+ )
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(

- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(reward_signal, BaseRewardProvider):
- evaluate_result = (
- reward_signal.evaluate(agent_buffer_trajectory)
- * reward_signal.strength
- )
- else: # reward_signal is a TensorFlow-based RewardSignal class
- evaluate_result = reward_signal.evaluate_batch(
- agent_buffer_trajectory
- ).scaled_reward
+ evaluate_result = (
+ reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
+ )
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

self._clear_update_buffer()
return True
- def create_tf_policy(
- self,
- parsed_behavior_id: BehaviorIdentifiers,
- behavior_spec: BehaviorSpec,
- create_graph: bool = False,
- ) -> TFPolicy:
- """
- Creates a policy with a Tensorflow backend and PPO hyperparameters
- :param parsed_behavior_id:
- :param behavior_spec: specifications for policy construction
- :param create_graph: whether to create the Tensorflow graph on construction
- :return policy
- """
- policy = TFPolicy(
- self.seed,
- behavior_spec,
- self.trainer_settings,
- condition_sigma_on_obs=False, # Faster training for PPO
- create_tf_graph=create_graph,
- )
- return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:

)
return policy
- def create_ppo_optimizer(self) -> PPOOptimizer:
- if self.framework == FrameworkType.PYTORCH:
- return TorchPPOOptimizer( # type: ignore
- cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
- else:
- return PPOOptimizer( # type: ignore
- cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
+ def create_ppo_optimizer(self) -> TorchPPOOptimizer:
+ return TorchPPOOptimizer( # type: ignore
+ cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
+ ) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy

51
ml-agents/mlagents/trainers/sac/optimizer_torch.py


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack
+ from mlagents.trainers.trajectory import ObsUtil
EPSILON = 1e-6 # Small value to avoid divide by zero

def forward(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
- :param vec_inputs: List of vector observation tensors.
- :param vis_input: List of visual observation tensors.
+ :param inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.

if not q1_grad:
stack.enter_context(torch.no_grad())
q1_out, _ = self.q1_network(
- vec_inputs,
- vis_inputs,
+ inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

stack.enter_context(torch.no_grad())
q2_out, _ = self.q2_network(
- vec_inputs,
- vis_inputs,
+ inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
- vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
- next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
+ n_obs = len(self.policy.behavior_spec.observation_shapes)
+ current_obs = ObsUtil.from_buffer(batch, n_obs)
+ # Convert to tensors
+ current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
+ next_obs = ObsUtil.from_buffer_next(batch, n_obs)
+ # Convert to tensors
+ next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)

torch.zeros_like(next_memories) if next_memories is not None else None
)
- vis_obs: List[torch.Tensor] = []
- next_vis_obs: List[torch.Tensor] = []
- if self.policy.use_vis_obs:
- vis_obs = []
- for idx, _ in enumerate(
- self.policy.actor_critic.network_body.visual_processors
- ):
- vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
- vis_obs.append(vis_ob)
- next_vis_ob = ModelUtils.list_to_tensor(
- batch["next_visual_obs%d" % idx]
- )
- next_vis_obs.append(next_vis_ob)
# Copy normalizers from policy
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body

value_estimates,
_,
) = self.policy.actor_critic.get_action_stats_and_value(
- vec_obs,
- vis_obs,
+ current_obs,
masks=act_masks,
memories=memories,
sequence_length=self.policy.sequence_length,

- vec_obs,
- vis_obs,
+ current_obs,
cont_sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

- vec_obs,
- vis_obs,
+ current_obs,
cont_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

with torch.no_grad():
target_values, _ = self.target_network(
- next_vec_obs,
- next_vis_obs,
+ next_obs,
memories=next_memories,
sequence_length=self.policy.sequence_length,
)

120
ml-agents/mlagents/trainers/sac/trainer.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
- from mlagents.trainers.trajectory import Trajectory, SplitObservations
+ from mlagents.trainers.trajectory import Trajectory, ObsUtil
- from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
- from mlagents.trainers.torch.components.reward_providers import BaseRewardProvider
- from mlagents import tf_utils
- if tf_utils.is_available():
- from mlagents.trainers.policy.tf_policy import TFPolicy
- from mlagents.trainers.sac.optimizer_tf import SACOptimizer
- else:
- TFPolicy = None # type: ignore
- SACOptimizer = None # type: ignore
+ from mlagents.trainers.settings import TrainerSettings, SACSettings
logger = get_logger(__name__)

# Update the normalization
if self.is_training:
- self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
+ self.policy.update_normalization(agent_buffer_trajectory)
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(

- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(reward_signal, BaseRewardProvider):
- evaluate_result = (
- reward_signal.evaluate(agent_buffer_trajectory)
- * reward_signal.strength
- )
- else: # reward_signal uses TensorFlow
- evaluate_result = reward_signal.evaluate_batch(
- agent_buffer_trajectory
- ).scaled_reward
+ evaluate_result = (
+ reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
+ )
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
)
for name, v in value_estimates.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
- self._stats_reporter.add_stat(
- f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
- np.mean(v),
- )
- else: # TensorFlow reward signal
- self._stats_reporter.add_stat(
- self.optimizer.reward_signals[name].value_name, np.mean(v)
- )
+ self._stats_reporter.add_stat(
+ f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
+ np.mean(v),
+ )
- vec_vis_obs = SplitObservations.from_observations(last_step.obs)
- for i, obs in enumerate(vec_vis_obs.visual_observations):
- agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
- if vec_vis_obs.vector_observations.size > 1:
- agent_buffer_trajectory["next_vector_in"][
- -1
- ] = vec_vis_obs.vector_observations
+ last_step_obs = last_step.obs
+ for i, obs in enumerate(last_step_obs):
+ agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer

)
)
- def create_tf_policy(
- self,
- parsed_behavior_id: BehaviorIdentifiers,
- behavior_spec: BehaviorSpec,
- create_graph: bool = False,
- ) -> TFPolicy:
- """
- Creates a policy with a Tensorflow backend and SAC hyperparameters
- :param parsed_behavior_id:
- :param behavior_spec: specifications for policy construction
- :param create_graph: whether to create the Tensorflow graph on construction
- :return policy
- """
- policy = TFPolicy(
- self.seed,
- behavior_spec,
- self.trainer_settings,
- tanh_squash=True,
- reparameterize=True,
- create_tf_graph=create_graph,
- )
- self.maybe_load_replay_buffer()
- return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:

)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(signal, BaseRewardProvider):
- sampled_minibatch[f"{name}_rewards"] = (
- signal.evaluate(sampled_minibatch) * signal.strength
- )
- else: # reward_signal is a TensorFlow-based RewardSignal class
- sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
- sampled_minibatch
- ).scaled_reward
+ sampled_minibatch[f"{name}_rewards"] = (
+ signal.evaluate(sampled_minibatch) * signal.strength
+ )
update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
for stat_name, value in update_stats.items():

) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
- for name, signal in self.optimizer.reward_signals.items():
+ for name in self.optimizer.reward_signals.keys():
- # BaseRewardProvider is a PyTorch-based reward signal
- if not isinstance(signal, BaseRewardProvider):
- # Some signals don't need a minibatch to be sampled - so we don't!
- if signal.update_dict:
- reward_signal_minibatches[name] = buffer.sample_mini_batch(
- self.hyperparameters.batch_size,
- sequence_length=self.policy.sequence_length,
- )
- else: # TensorFlow reward signal
- if name != "extrinsic":
- reward_signal_minibatches[name] = buffer.sample_mini_batch(
- self.hyperparameters.batch_size,
- sequence_length=self.policy.sequence_length,
- )
+ if name != "extrinsic":
+ reward_signal_minibatches[name] = buffer.sample_mini_batch(
+ self.hyperparameters.batch_size,
+ sequence_length=self.policy.sequence_length,
+ )
update_stats = self.optimizer.update_reward_signals(
reward_signal_minibatches, n_sequences
)

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def create_sac_optimizer(self) -> TorchSACOptimizer:
- if self.framework == FrameworkType.PYTORCH:
- return TorchSACOptimizer( # type: ignore
- cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
- else:
- return SACOptimizer( # type: ignore
- cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
+ return TorchSACOptimizer( # type: ignore
+ cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
+ ) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy

10
ml-agents/mlagents/trainers/settings.py


return _mapping[self]
- class FrameworkType(Enum):
- TENSORFLOW: str = "tensorflow"
- PYTORCH: str = "pytorch"
@attr.s(auto_attribs=True)
class TrainerSettings(ExportableSettings):
default_override: ClassVar[Optional["TrainerSettings"]] = None

threaded: bool = True
self_play: Optional[SelfPlaySettings] = None
behavioral_cloning: Optional[BehavioralCloningSettings] = None
- framework: FrameworkType = FrameworkType.PYTORCH
cattr.register_structure_hook(
Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure

d_copy.update(cattr.unstructure(TrainerSettings.default_override))
deep_update_dict(d_copy, d)
+ if "framework" in d_copy:
+ logger.warning("Framework option was deprecated but was specified")
+ d_copy.pop("framework", None)
for key, val in d_copy.items():
if attr.has(type(val)):

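The settings hunk above keeps old YAML files loadable by warning about and discarding a leftover `framework:` entry before the rest of the dict is structured. A generic stand-alone sketch of that pattern (not the actual `TrainerSettings` structuring code):

```python
import logging
from typing import Any, Dict, Tuple

logger = logging.getLogger(__name__)


def drop_removed_keys(config: Dict[str, Any], removed: Tuple[str, ...] = ("framework",)) -> Dict[str, Any]:
    """Warn about and strip configuration keys that no longer have any effect."""
    cleaned = dict(config)
    for key in removed:
        if key in cleaned:
            logger.warning(f"The '{key}' option was removed and will be ignored.")
            cleaned.pop(key, None)
    return cleaned


old_style = {"trainer_type": "ppo", "framework": "tensorflow", "threaded": True}
print(drop_removed_keys(old_style))  # 'framework' is dropped, everything else is untouched
```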
2
ml-agents/mlagents/trainers/stats.py


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
from torch.utils.tensorboard import SummaryWriter
- from mlagents.tf_utils.globals import get_rank
+ from mlagents.torch_utils.globals import get_rank
logger = get_logger(__name__)

2
ml-agents/mlagents/trainers/tests/__init__.py


# tb[-2] is the wrapper function, e.g. np_array_no_float64
# we want the calling function, so use tb[-3]
filename = tb[-3].filename
- # Only raise if this came from mlagents code, not tensorflow
+ # Only raise if this came from mlagents code
if (
"ml-agents/mlagents" in filename
or "ml-agents-envs/mlagents" in filename

17
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer
from mlagents.trainers.agent_processor import AgentManagerQueue
- from mlagents.trainers.settings import TrainerSettings, FrameworkType
+ from mlagents.trainers.settings import TrainerSettings
from mlagents_envs.base_env import ActionSpec

super()._process_trajectory(trajectory)
- def create_rl_trainer(framework=FrameworkType.TENSORFLOW):
+ def create_rl_trainer():
- TrainerSettings(
- max_steps=100, checkpoint_interval=10, summary_freq=20, framework=framework
- ),
+ TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
True,
False,
"mock_model_path",

assert mocked_save_model.call_count == 0
- @pytest.mark.parametrize(
- "framework", [FrameworkType.TENSORFLOW, FrameworkType.PYTORCH], ids=["tf", "torch"]
- )
- def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary, framework):
- trainer = create_rl_trainer(framework)
+ def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
+ trainer = create_rl_trainer()
mock_policy = mock.Mock()
trainer.add_policy("TestBrain", mock_policy)
trajectory_queue = AgentManagerQueue("testbrain")

calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
- export_ext = "nn" if trainer.framework == FrameworkType.TENSORFLOW else "onnx"
+ export_ext = "onnx"
add_checkpoint_calls = [
mock.call(

19
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


from unittest.mock import MagicMock, patch
import pytest
+ from mlagents.torch_utils import torch
- from mlagents.tf_utils import tf
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.ghost.controller import GhostController

@patch("numpy.random.seed")
- @patch.object(tf, "set_random_seed")
- def test_initialization_seed(numpy_random_seed, tensorflow_set_seed):
+ @patch.object(torch, "manual_seed")
+ def test_initialization_seed(numpy_random_seed, torch_set_seed):
seed = 27
trainer_factory_mock = MagicMock()
trainer_factory_mock.ghost_controller = GhostController()

training_seed=seed,
)
numpy_random_seed.assert_called_with(seed)
- tensorflow_set_seed.assert_called_with(seed)
+ torch_set_seed.assert_called_with(seed)
@pytest.fixture

return tc, trainer_mock
- @patch.object(tf, "reset_default_graph")
- tf_reset_graph, trainer_controller_with_start_learning_mocks
+ trainer_controller_with_start_learning_mocks
- tf_reset_graph.return_value = None
env_mock = MagicMock()
env_mock.close = MagicMock()

tc.start_learning(env_mock)
- tf_reset_graph.assert_called_once()
- @patch.object(tf, "reset_default_graph")
- tf_reset_graph, trainer_controller_with_start_learning_mocks
+ trainer_controller_with_start_learning_mocks
- tf_reset_graph.return_value = None
brain_info_mock = MagicMock()
env_mock = MagicMock()

tc.start_learning(env_mock)
- tf_reset_graph.assert_called_once()
env_mock.reset.assert_called_once()
assert tc.advance.call_count == trainer_mock.get_max_steps + 1
tc._save_models.assert_called_once()

4
ml-agents/mlagents/trainers/tests/test_training_status.py


version_statsmetadata = StatusMetaData(mlagents_version="test")
default_metadata.check_compatibility(version_statsmetadata)
- tf_version_statsmetadata = StatusMetaData(tensorflow_version="test")
- default_metadata.check_compatibility(tf_version_statsmetadata)
+ torch_version_statsmetadata = StatusMetaData(torch_version="test")
+ default_metadata.check_compatibility(torch_version_statsmetadata)
# Assert that 2 warnings have been thrown
assert len(cm.output) == 2

31
ml-agents/mlagents/trainers/tests/test_trajectory.py


import numpy as np
import pytest
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents_envs.base_env import ActionSpec

@pytest.mark.parametrize("num_visual_obs", [0, 1, 2])
@pytest.mark.parametrize("num_vec_obs", [0, 1])
def test_split_obs(num_visual_obs, num_vec_obs):
obs = []
for _ in range(num_visual_obs):
obs.append(np.ones((84, 84, 3), dtype=np.float32))
for _ in range(num_vec_obs):
obs.append(np.ones(VEC_OBS_SIZE, dtype=np.float32))
split_observations = SplitObservations.from_observations(obs)
if num_vec_obs == 1:
assert len(split_observations.vector_observations) == VEC_OBS_SIZE
else:
assert len(split_observations.vector_observations) == 0
# Assert the number of vector observations.
assert len(split_observations.visual_observations) == num_visual_obs
- "next_visual_obs0",
- "visual_obs0",
- "vector_obs",
- "next_vector_in",
+ "next_obs_0",
+ "next_obs_1",
+ "obs_0",
+ "obs_1",
"memory",
"masks",
"done",