
Merge branch 'master' into develop-magic-string

/develop/magic-string
Andrew Cohen 5 years ago
Current commit
082789ea
49 files changed, with 299 additions and 572 deletions
  1. .pre-commit-config.yaml (12)
  2. .pylintrc (7)
  3. UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (2)
  4. config/gail_config.yaml (6)
  5. docs/Migrating.md (2)
  6. docs/Reward-Signals.md (9)
  7. docs/Training-Imitation-Learning.md (34)
  8. docs/Training-ML-Agents.md (35)
  9. docs/Training-PPO.md (18)
  10. docs/Training-SAC.md (18)
  11. docs/images/mlagents-ImitationAndRL.png (142)
  12. ml-agents-envs/mlagents/envs/base_env.py (6)
  13. ml-agents-envs/mlagents/envs/environment.py (13)
  14. ml-agents-envs/mlagents/envs/mock_communicator.py (3)
  15. ml-agents-envs/mlagents/envs/rpc_utils.py (25)
  16. ml-agents-envs/mlagents/envs/side_channel/engine_configuration_channel.py (2)
  17. ml-agents-envs/mlagents/envs/side_channel/float_properties_channel.py (19)
  18. ml-agents-envs/mlagents/envs/side_channel/raw_bytes_channel.py (2)
  19. ml-agents-envs/mlagents/envs/side_channel/side_channel.py (2)
  20. ml-agents-envs/mlagents/envs/tests/test_rpc_utils.py (10)
  21. ml-agents-envs/mlagents/envs/tests/test_side_channel.py (3)
  22. ml-agents/mlagents/trainers/action_info.py (4)
  23. ml-agents/mlagents/trainers/agent_processor.py (4)
  24. ml-agents/mlagents/trainers/brain.py (16)
  25. ml-agents/mlagents/trainers/components/bc/module.py (6)
  26. ml-agents/mlagents/trainers/demo_loader.py (4)
  27. ml-agents/mlagents/trainers/env_manager.py (9)
  28. ml-agents/mlagents/trainers/models.py (4)
  29. ml-agents/mlagents/trainers/ppo/policy.py (8)
  30. ml-agents/mlagents/trainers/ppo/trainer.py (2)
  31. ml-agents/mlagents/trainers/rl_trainer.py (6)
  32. ml-agents/mlagents/trainers/sac/models.py (6)
  33. ml-agents/mlagents/trainers/sac/policy.py (8)
  34. ml-agents/mlagents/trainers/sac/trainer.py (7)
  35. ml-agents/mlagents/trainers/simple_env_manager.py (9)
  36. ml-agents/mlagents/trainers/subprocess_env_manager.py (9)
  37. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (29)
  38. ml-agents/mlagents/trainers/tests/test_bcmodule.py (14)
  39. ml-agents/mlagents/trainers/tests/test_policy.py (2)
  40. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2)
  41. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2)
  42. ml-agents/mlagents/trainers/tests/test_trainer_util.py (62)
  43. ml-agents/mlagents/trainers/tf_policy.py (3)
  44. ml-agents/mlagents/trainers/trainer.py (3)
  45. ml-agents/mlagents/trainers/trainer_controller.py (5)
  46. ml-agents/mlagents/trainers/trainer_util.py (11)
  47. docs/Training-Behavioral-Cloning.md (30)
  48. ml-agents/mlagents/trainers/tests/test_bc.py (236)

12
.pre-commit-config.yaml


- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.740
rev: v0.750
# Currently mypy may assert after logging one message. To get all the messages at once, change repo and rev to
# repo: https://github.com/chriselion/mypy
# rev: 3d0b6164a9487a6c5cf9d144110b86600fd85e25
# This is a fork with the assert disabled, although precommit has trouble installing it sometimes.
args: [--ignore-missing-imports, --disallow-incomplete-defs]
args: [--ignore-missing-imports, --disallow-incomplete-defs, --namespace-packages]
args: [--ignore-missing-imports, --disallow-incomplete-defs]
args: [--ignore-missing-imports, --disallow-incomplete-defs, --namespace-packages]
args: [--ignore-missing-imports, --disallow-incomplete-defs]
args: [--ignore-missing-imports, --disallow-incomplete-defs, --namespace-packages]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0

7
.pylintrc


# E0401: Unable to import...
# E0611: No name '...' in module '...'
# need to look into these, probably namespace packages
E0401, E0611
E0401, E0611,
# This was causing false positives
# Appears to be https://github.com/PyCQA/pylint/issues/2981
W0201,

2
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs


academy.AgentSendState -= SendInfo;
academy.DecideAction -= DecideAction;
academy.AgentAct -= AgentStep;
academy.AgentForceReset -= ForceReset;
academy.AgentForceReset -= _AgentReset;
}
m_Brain?.Dispose();
}

6
config/gail_config.yaml


beta: 1.0e-2
max_steps: 5.0e5
num_epoch: 3
pretraining:
behavioral_cloning:
demo_path: ./demos/ExpertPyramid.demo
strength: 0.5
steps: 10000

summary_freq: 3000
num_layers: 3
hidden_units: 512
behavioral_cloning:
demo_path: ./demos/ExpertCrawlerSta.demo
strength: 0.5
steps: 5000
reward_signals:
gail:
strength: 1.0
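Because the diff above flattens the YAML indentation, here is a hedged reconstruction of how the renamed `behavioral_cloning` block nests under a behavior section such as `CrawlerStaticLearning`, loaded the same way the test fixtures elsewhere in this commit load config. The values are illustrative, not authoritative.

```python
import yaml

# A sketch of the nesting implied by config/gail_config.yaml after the rename.
config = yaml.safe_load(
    """
    CrawlerStaticLearning:
      summary_freq: 3000
      num_layers: 3
      hidden_units: 512
      behavioral_cloning:   # formerly the `pretraining` section
        demo_path: ./demos/ExpertCrawlerSta.demo
        strength: 0.5
        steps: 5000
      reward_signals:
        gail:
          strength: 1.0
    """
)

assert "pretraining" not in config["CrawlerStaticLearning"]
assert config["CrawlerStaticLearning"]["behavioral_cloning"]["steps"] == 5000
```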

2
docs/Migrating.md


* `reset()` on the Low-Level Python API no longer takes a `config` argument. `UnityEnvironment` no longer has a `reset_parameters` field. To modify float properties in the environment, you must use a `FloatPropertiesChannel`. For more information, refer to the [Low Level Python API documentation](Python-API.md)
* The Academy no longer has a `Training Configuration` nor `Inference Configuration` field in the inspector. To modify the configuration from the Low-Level Python API, use an `EngineConfigurationChannel`. To modify it during training, use the new command line arguments `--width`, `--height`, `--quality-level`, `--time-scale` and `--target-frame-rate` in `mlagents-learn`.
* The Academy no longer has a `Default Reset Parameters` field in the inspector. The Academy class no longer has a `ResetParameters`. To access shared float properties with Python, use the new `FloatProperties` field on the Academy.
* Offline Behavioral Cloning has been removed. To learn from demonstrations, use the GAIL and
Behavioral Cloning features with either PPO or SAC. See [Imitation Learning](Training-Imitation-Learning.md) for more information.
### Steps to Migrate
* If you had a custom `Training Configuration` in the Academy inspector, you will need to pass your custom configuration at every training run using the new command line arguments `--width`, `--height`, `--quality-level`, `--time-scale` and `--target-frame-rate`.
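To make the migration concrete, here is a minimal, hedged sketch of the replacement workflow described above, assuming a built environment at a hypothetical path and that the `UnityEnvironment` constructor accepts a `side_channels` list as in this release:

```python
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents.envs.side_channel.float_properties_channel import FloatPropertiesChannel

float_props = FloatPropertiesChannel()
engine_config = EngineConfigurationChannel()
# "builds/3DBall" is a placeholder; pass file_name=None to connect to the Editor.
env = UnityEnvironment(
    file_name="builds/3DBall", side_channels=[float_props, engine_config]
)

# Replaces the old reset(config=...) / reset_parameters workflow.
float_props.set_property("gravity", -9.81)
# Replaces the old Training/Inference Configuration fields on the Academy.
engine_config.set_configuration_parameters(width=84, height=84, time_scale=20.0)
env.reset()
env.close()
```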

9
docs/Reward-Signals.md


In this way, while the agent gets better and better at mimicking the demonstrations, the
discriminator keeps getting stricter and stricter and the agent must try harder to "fool" it.
This approach, when compared to [Behavioral Cloning](Training-Behavioral-Cloning.md), requires
far fewer demonstrations to be provided. After all, we are still learning a policy that happens
to be similar to the demonstrations, not directly copying the behavior of the demonstrations. It
is especially effective when combined with an Extrinsic signal. However, the GAIL reward signal can
also be used independently to purely learn from demonstrations.
This approach learns a _policy_ that produces states and actions similar to the demonstrations,
requiring fewer demonstrations than direct cloning of the actions. In addition to learning purely
from demonstrations, the GAIL reward signal can be mixed with an extrinsic reward signal to guide
the learning process.
Using GAIL requires recorded demonstrations from your Unity environment. See the
[imitation learning guide](Training-Imitation-Learning.md) to learn more about recording demonstrations.
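For example, a hedged sketch of what mixing a low-strength GAIL signal with an extrinsic signal might look like in a trainer config; key names follow the surrounding docs, while the values and demo path are illustrative only:

```python
import yaml

# Illustrative only: GAIL kept weak relative to the extrinsic signal, as the
# docs above recommend when demonstrations merely guide RL training.
reward_signal_config = yaml.safe_load(
    """
    reward_signals:
      extrinsic:
        strength: 1.0
        gamma: 0.99
      gail:
        strength: 0.01
        demo_path: ./demos/ExpertPyramid.demo
    """
)
```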

34
docs/Training-Imitation-Learning.md


reduce the time the agent takes to solve the environment.
For instance, on the [Pyramids environment](Learning-Environment-Examples.md#pyramids),
using 6 episodes of demonstrations can reduce training steps by more than 4 times.
See PreTraining + GAIL + Curiosity + RL below.
See Behavioral Cloning + GAIL + Curiosity + RL below.
<p align="center">
<img src="images/mlagents-ImitationAndRL.png"

The ML-Agents toolkit provides several ways to learn from demonstrations.
The ML-Agents toolkit provides two features that enable your agent to learn from demonstrations.
In most scenarios, you should combine these two features
* To train using GAIL (Generative Adversarial Imitation Learning) you can add the
* GAIL (Generative Adversarial Imitation Learning) uses an adversarial approach to
reward your Agent for behaving similar to a set of demonstrations. To use GAIL, you can add the
* To help bootstrap reinforcement learning, you can enable
[pretraining](Training-PPO.md#optional-pretraining-using-demonstrations)
on the PPO trainer, in addition to using a small GAIL reward signal.
* To train an agent to exactly mimic demonstrations, you can use the
[Behavioral Cloning](Training-Behavioral-Cloning.md) trainer. Behavioral Cloning can be
used with demonstrations (in-editor), and learns very quickly. However, it usually is ineffective
on more complex environments without a large number of demonstrations.
* Behavioral Cloning (BC) trains the Agent's neural network to exactly mimic the actions
shown in a set of demonstrations.
[The BC feature](Training-PPO.md#optional-behavioral-cloning-using-demonstrations)
can be enabled on the PPO or SAC trainer. BC tends to work best when
there are a lot of demonstrations, or in conjunction with GAIL and/or an extrinsic reward.
using pre-recorded demonstrations, you can generally enable both GAIL and Pretraining.
using pre-recorded demonstrations, you can generally enable both GAIL and Behavioral Cloning
at low strengths in addition to having an extrinsic reward.
If you want to train purely from demonstrations, GAIL is generally the preferred approach, especially
if you have few (<10) episodes of demonstrations. An example of this is provided for the Crawler example
environment under `CrawlerStaticLearning` in `config/gail_config.yaml`.
If you have plenty of demonstrations and/or a very simple environment, Offline Behavioral Cloning can be effective and quick. However, it cannot be combined with RL.
If you want to train purely from demonstrations, GAIL and BC _without_ an
extrinsic reward signal is the preferred approach. An example of this is provided for the Crawler
example environment under `CrawlerStaticLearning` in `config/gail_config.yaml`.
## Recording Demonstrations

They can be managed from the Editor, as well as used for training with Offline
Behavioral Cloning and GAIL.
They can be managed from the Editor, as well as used for training with BC and GAIL.
In order to record demonstrations from an agent, add the `Demonstration Recorder`
component to a GameObject in the scene which contains an `Agent` component.

35
docs/Training-ML-Agents.md


`config/gail_config.yaml` and `config/offline_bc_config.yaml` specifies the training method,
the hyperparameters, and a few additional values to use when training with Proximal Policy
Optimization(PPO), Soft Actor-Critic(SAC), GAIL (Generative Adversarial Imitation Learning)
with PPO, and online and offline Behavioral Cloning(BC)/Imitation. These files are divided
with PPO/SAC, and Behavioral Cloning(BC)/Imitation with PPO/SAC. These files are divided
training with PPO, SAC, GAIL (with PPO), and offline BC. These files are divided into sections.
training with PPO, SAC, GAIL (with PPO), and BC. These files are divided into sections.
The **default** section defines the default values for all the available settings. You can
also add new sections to override these defaults to train specific Behaviors. Name each of these
override sections after the appropriate `Behavior Name`. Sections for the

| :------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- |
| batch_size | The number of experiences in each iteration of gradient descent. | PPO, SAC, BC |
| batches_per_epoch | In imitation learning, the number of batches of training examples to collect before training the model. | BC |
| batch_size | The number of experiences in each iteration of gradient descent. | PPO, SAC |
| batches_per_epoch | In imitation learning, the number of batches of training examples to collect before training the model. | |
| demo_path | For offline imitation learning, the file path of the recorded demonstration file | (offline)BC |
| hidden_units | The number of units in the hidden layers of the neural network. | PPO, SAC, BC |
| hidden_units | The number of units in the hidden layers of the neural network. | PPO, SAC |
| learning_rate | The initial learning rate for gradient descent. | PPO, SAC, BC |
| max_steps | The maximum number of simulation steps to run during a training session. | PPO, SAC, BC |
| memory_size | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC, BC |
| learning_rate | The initial learning rate for gradient descent. | PPO, SAC |
| max_steps | The maximum number of simulation steps to run during a training session. | PPO, SAC |
| memory_size | The size of the memory an agent must keep. Used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| num_layers | The number of hidden layers in the neural network. | PPO, SAC, BC |
| pretraining | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-pretraining-using-demonstrations). | PPO, SAC |
| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options. | PPO, SAC, BC |
| num_layers | The number of hidden layers in the neural network. | PPO, SAC |
| behavioral_cloning | Use demonstrations to bootstrap the policy neural network. See [Pretraining Using Demonstrations](Training-PPO.md#optional-behavioral-cloning-using-demonstrations). | PPO, SAC |
| reward_signals | The reward signals used to train the policy. Enable Curiosity and GAIL here. See [Reward Signals](Reward-Signals.md) for configuration options. | PPO, SAC |
| sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC, BC |
| summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, SAC, BC |
| sequence_length | Defines how long the sequences of experiences must be while training. Only used for training with a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| summary_freq | How often, in steps, to save training statistics. This determines the number of data points shown by TensorBoard. | PPO, SAC |
| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, SAC, (online)BC |
| trainer | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc". | PPO, SAC, BC |
| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, SAC |
| trainer | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc". | PPO, SAC |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC, BC |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral Cloning (Imitation)
\*PPO = Proximal Policy Optimization, SAC = Soft Actor-Critic, BC = Behavioral Cloning (Imitation), GAIL = Generative Adversarial Imitation Learning
For specific advice on setting hyperparameters based on the type of training you
are conducting, see:

18
docs/Training-PPO.md


Typical Range: `64` - `512`
## (Optional) Pretraining Using Demonstrations
## (Optional) Behavioral Cloning Using Demonstrations
from a player. This can help guide the agent towards the reward. Pretraining adds
from a player. This can help guide the agent towards the reward. Behavioral Cloning (BC) adds
It is essentially equivalent to running [behavioral cloning](Training-Behavioral-Cloning.md)
in-line with PPO.
To use pretraining, add a `pretraining` section to the trainer_config. For instance:
To use BC, add a `behavioral_cloning` section to the trainer_config. For instance:
pretraining:
behavioral_cloning:
Below are the available hyperparameters for pretraining.
Below are the available hyperparameters for BC.
rate of PPO, and roughly corresponds to how strongly we allow the behavioral cloning
rate of PPO, and roughly corresponds to how strongly we allow BC
to influence the policy.
Typical Range: `0.1` - `0.5`

### Steps
During pretraining, it is often desirable to stop using demonstrations after the agent has
During BC, it is often desirable to stop using demonstrations after the agent has
pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set
BC is active. The learning rate of BC will anneal over the steps. Set
the steps to 0 for constant imitation over the entire training run.
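As a rough illustration of the annealing described above (a sketch of the behavior, not the actual trainer code): the BC learning rate decays toward zero over `steps`, and `steps: 0` keeps it constant for the whole run.

```python
def bc_learning_rate(initial_lr: float, current_step: int, steps: int) -> float:
    """Illustrative linear anneal of the BC learning rate; steps == 0 means constant."""
    if steps == 0:
        return initial_lr
    remaining = max(0.0, 1.0 - current_step / steps)
    return initial_lr * remaining


print(bc_learning_rate(3.0e-4, 5000, 10000))  # halfway through: half the rate
print(bc_learning_rate(3.0e-4, 5000, 0))      # steps == 0: constant imitation
```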
### (Optional) Batch Size

18
docs/Training-SAC.md


Default: `False`
## (Optional) Pretraining Using Demonstrations
## (Optional) Behavioral Cloning Using Demonstrations
from a player. This can help guide the agent towards the reward. Pretraining adds
from a player. This can help guide the agent towards the reward. Behavioral Cloning (BC) adds
It is essentially equivalent to running [behavioral cloning](./Training-Behavioral-Cloning.md)
in-line with SAC.
To use pretraining, add a `pretraining` section to the trainer_config. For instance:
To use BC, add a `behavioral_cloning` section to the trainer_config. For instance:
pretraining:
behavioral_cloning:
Below are the available hyperparameters for pretraining.
Below are the available hyperparameters for BC.
rate of SAC, and roughly corresponds to how strongly we allow the behavioral cloning
rate of SAC, and roughly corresponds to how strongly we allow BC
to influence the policy.
Typical Range: `0.1` - `0.5`

### Steps
During pretraining, it is often desirable to stop using demonstrations after the agent has
During BC, it is often desirable to stop using demonstrations after the agent has
pretraining is active. The learning rate of the pretrainer will anneal over the steps. Set
BC is active. The learning rate of BC will anneal over the steps. Set
the steps to 0 for constant imitation over the entire training run.
### (Optional) Batch Size

142
docs/images/mlagents-ImitationAndRL.png

Before / After
Width: 600  |  Height: 371  |  Size: 23 KiB

6
ml-agents-envs/mlagents/envs/base_env.py


"""
from abc import ABC, abstractmethod
from typing import List, NamedTuple, Tuple, Optional, Union, Dict, NewType
from typing import List, NamedTuple, Tuple, Optional, Union, Dict
AgentId = NewType("AgentId", int)
AgentGroup = NewType("AgentGroup", str)
AgentId = int
AgentGroup = str
class StepResult(NamedTuple):
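For readers unfamiliar with the distinction, a minimal sketch (with hypothetical names) of what dropping `NewType` changes for type checking: a `NewType` alias is a distinct type to mypy and requires explicit wrapping, while a plain alias accepts any `int`.

```python
from typing import NewType

StrictAgentId = NewType("StrictAgentId", int)  # the old style
PlainAgentId = int                             # the new style: a plain alias


def use_strict(agent_id: StrictAgentId) -> int:
    return agent_id + 1


def use_plain(agent_id: PlainAgentId) -> int:
    return agent_id + 1


use_plain(3)                  # OK: any int is a PlainAgentId
use_strict(StrictAgentId(3))  # OK: explicitly wrapped
use_strict(3)                 # mypy error (runs fine at runtime): int is not StrictAgentId
```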

13
ml-agents-envs/mlagents/envs/environment.py


AgentId,
)
from mlagents.envs.timers import timed, hierarchical_timer
from .exception import (
from mlagents.envs.exception import (
UnityEnvironmentException,
UnityCommunicationException,
UnityActionException,

from mlagents.envs.communicator_objects.command_pb2 import STEP, RESET
from mlagents.envs.rpc_utils import (
agent_group_spec_from_proto,
batched_step_result_from_proto,

action = action.astype(expected_type)
if agent_group not in self._env_actions:
self._env_actions[agent_group] = self._empty_action(
spec, self._env_state[agent_group].n_agents()
self._env_actions[agent_group] = spec.create_empty_action(
self._env_state[agent_group].n_agents()
)
try:
index = np.where(self._env_state[agent_group].agent_id == agent_id)[0][0]

@staticmethod
def _parse_side_channel_message(
side_channels: Dict[int, SideChannel], data: bytearray
side_channels: Dict[int, SideChannel], data: bytes
) -> None:
offset = 0
while offset < len(data):

for i in range(n_agents):
action = AgentActionProto(vector_actions=vector_action[b][i])
rl_in.agent_actions[b].value.extend([action])
rl_in.command = 0
rl_in.command = STEP
rl_in.command = 1
rl_in.command = RESET
rl_in.side_channel = bytes(self._generate_side_channel_data(self.side_channels))
return self.wrap_unity_input(rl_in)
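The literals `0` and `1` above are replaced with the `STEP` and `RESET` constants imported from the generated protobuf module. A generic sketch (a hypothetical enum, not the actual generated code) of why named constants are preferable to magic numbers:

```python
from enum import IntEnum


class Command(IntEnum):
    STEP = 0
    RESET = 1


def command_for(reset_requested: bool) -> int:
    # The intent is visible at the call site, and if the numbering in the
    # .proto definition ever changed, this code would not silently drift.
    return Command.RESET if reset_requested else Command.STEP


assert command_for(False) == Command.STEP
```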

3
ml-agents-envs/mlagents/envs/mock_communicator.py


NONE as COMPRESSION_TYPE_NONE,
PNG as COMPRESSION_TYPE_PNG,
)
from mlagents.envs.communicator_objects.space_type_pb2 import discrete, continuous
class MockCommunicator(Communicator):

bp = BrainParametersProto(
vector_action_size=[2],
vector_action_descriptions=["", ""],
vector_action_space_type=int(not self.is_discrete),
vector_action_space_type=discrete if self.is_discrete else continuous,
brain_name=self.brain_name,
is_training=True,
)

25
ml-agents-envs/mlagents/envs/rpc_utils.py


import logging
import numpy as np
import io
from typing import List, Tuple
from typing import cast, List, Tuple, Union, Collection
from PIL import Image
logger = logging.getLogger("mlagents.envs")

if brain_param_proto.vector_action_space_type == 0
else ActionType.CONTINUOUS
)
action_shape = None
action_shape = brain_param_proto.vector_action_size[0]
action_shape: Union[
int, Tuple[int, ...]
] = brain_param_proto.vector_action_size[0]
else:
action_shape = tuple(brain_param_proto.vector_action_size)
return AgentGroupSpec(observation_shape, action_type, action_shape)

@timed
def _process_visual_observation(
obs_index: int, shape: Tuple[int, int, int], agent_info_list: List[AgentInfoProto]
obs_index: int,
shape: Tuple[int, int, int],
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> np.ndarray:
if len(agent_info_list) == 0:
return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32)

@timed
def _process_vector_observation(
obs_index: int, shape: Tuple[int, ...], agent_info_list: List[AgentInfoProto]
obs_index: int,
shape: Tuple[int, ...],
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> np.ndarray:
if len(agent_info_list) == 0:
return np.zeros((0, shape[0]), dtype=np.float32)

@timed
def batched_step_result_from_proto(
agent_info_list: List[AgentInfoProto], group_spec: AgentGroupSpec
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
group_spec: AgentGroupSpec,
obs_shape = cast(Tuple[int, int, int], obs_shape)
obs_list += [
_process_visual_observation(obs_index, obs_shape, agent_info_list)
]
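A generic sketch of why the annotations move from `List[...]` to `Collection[...]`: these helpers only need `len()` and iteration, and since `List` is invariant, `Collection` lets callers pass containers other than `list` (tuples, sets, or repeated protobuf containers) without mypy complaints. The `# pylint: disable=unsubscriptable-object` comments work around a pylint false positive on subscripted `Collection`.

```python
from typing import Collection, List


def total_list(values: List[int]) -> int:
    return sum(values)


def total_collection(values: Collection[int]) -> int:
    return sum(values)


total_collection((1, 2, 3))  # OK: a tuple satisfies Collection[int]
total_collection({1, 2, 3})  # OK: so does a set
total_list((1, 2, 3))        # mypy error (runs at runtime): Tuple is not List[int]
```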

2
ml-agents-envs/mlagents/envs/side_channel/engine_configuration_channel.py


def channel_type(self) -> int:
return SideChannelType.EngineSettings
def on_message_received(self, data: bytearray) -> None:
def on_message_received(self, data: bytes) -> None:
"""
Is called by the environment to the side channel. Can be called
multiple times per step if multiple messages are meant for that

19
ml-agents-envs/mlagents/envs/side_channel/float_properties_channel.py


from mlagents.envs.side_channel.side_channel import SideChannel, SideChannelType
import struct
from typing import Tuple, Optional, List
from typing import Dict, Tuple, Optional, List
class FloatPropertiesChannel(SideChannel):

set_property, get_property and list_properties.
"""
def __init__(self):
self._float_properties = {}
def __init__(self) -> None:
self._float_properties: Dict[str, float] = {}
super().__init__()
@property

def on_message_received(self, data: bytearray) -> None:
def on_message_received(self, data: bytes) -> None:
"""
Is called by the environment to the side channel. Can be called
multiple times per step if multiple messages are meant for that

Returns a list of all the string identifiers of the properties
currently present in the Unity Environment.
"""
return self._float_properties.keys()
return list(self._float_properties.keys())
def get_property_dict_copy(self) -> Dict[str, float]:
"""
Returns a copy of the float properties.
:return:
"""
return dict(self._float_properties)
@staticmethod
def serialize_float_prop(key: str, value: float) -> bytearray:

return result
@staticmethod
def deserialize_float_prop(data: bytearray) -> Tuple[str, float]:
def deserialize_float_prop(data: bytes) -> Tuple[str, float]:
offset = 0
encoded_key_len = struct.unpack_from("<i", data, offset)[0]
offset = offset + 4
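A hedged round-trip sketch of the wire format implied by `deserialize_float_prop` above: a little-endian int32 key length, the UTF-8 key bytes, then (assumed here) a little-endian float32 value. This is an illustration of the layout, not the library's own serializer.

```python
import struct
from typing import Tuple


def encode_float_prop(key: str, value: float) -> bytes:
    encoded_key = key.encode("utf-8")
    return struct.pack("<i", len(encoded_key)) + encoded_key + struct.pack("<f", value)


def decode_float_prop(data: bytes) -> Tuple[str, float]:
    offset = 0
    key_len = struct.unpack_from("<i", data, offset)[0]
    offset += 4
    key = data[offset : offset + key_len].decode("utf-8")
    offset += key_len
    value = struct.unpack_from("<f", data, offset)[0]
    return key, value


assert decode_float_prop(encode_float_prop("gravity", 1.5)) == ("gravity", 1.5)
```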

2
ml-agents-envs/mlagents/envs/side_channel/raw_bytes_channel.py


def channel_type(self) -> int:
return SideChannelType.RawBytesChannelStart + self._channel_id
def on_message_received(self, data: bytearray) -> None:
def on_message_received(self, data: bytes) -> None:
"""
Is called by the environment to the side channel. Can be called
multiple times per step if multiple messages are meant for that

2
ml-agents-envs/mlagents/envs/side_channel/side_channel.py


self.message_queue.append(data)
@abstractmethod
def on_message_received(self, data: bytearray) -> None:
def on_message_received(self, data: bytes) -> None:
"""
Is called by the environment to the side channel. Can be called
multiple times per step if multiple messages are meant for that

10
ml-agents-envs/mlagents/envs/tests/test_rpc_utils.py


from typing import List, Tuple
from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents.envs.communicator_objects.observation_pb2 import ObservationProto
from mlagents.envs.communicator_objects.observation_pb2 import (
ObservationProto,
NONE,
PNG,
)
from mlagents.envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
import numpy as np
from mlagents.envs.base_env import AgentGroupSpec, ActionType

for obs_index in range(len(shape)):
obs_proto = ObservationProto()
obs_proto.shape.extend(list(shape[obs_index]))
obs_proto.compression_type = 0
obs_proto.compression_type = NONE
obs_proto.float_data.data.extend([0.1] * np.prod(shape[obs_index]))
obs_proto_list.append(obs_proto)
ap.observations.extend(obs_proto_list)

def generate_compressed_proto_obs(in_array: np.ndarray) -> ObservationProto:
obs_proto = ObservationProto()
obs_proto.compressed_data = generate_compressed_data(in_array)
obs_proto.compression_type = 1
obs_proto.compression_type = PNG
obs_proto.shape.extend(in_array.shape)
return obs_proto

3
ml-agents-envs/mlagents/envs/tests/test_side_channel.py


val = sender.get_property("prop1")
assert val == 1.0
assert receiver.get_property_dict_copy() == {"prop1": 1.0, "prop2": 2.0}
assert receiver.get_property_dict_copy() == sender.get_property_dict_copy()
def test_raw_bytes():
sender = RawBytesChannel()

4
ml-agents/mlagents/trainers/action_info.py


from typing import NamedTuple, Any, Dict, Optional
from typing import NamedTuple, Any, Dict
ActionInfoOutputs = Optional[Dict[str, Any]]
ActionInfoOutputs = Dict[str, Any]
class ActionInfo(NamedTuple):

4
ml-agents/mlagents/trainers/agent_processor.py


from typing import List
from typing import List, Union
from mlagents.trainers.buffer import AgentBuffer, BufferException

def append_to_update_buffer(
self,
update_buffer: AgentBuffer,
agent_id: str,
agent_id: Union[int, str],
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,

16
ml-agents/mlagents/trainers/brain.py


from mlagents.envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents.envs.communicator_objects.observation_pb2 import ObservationProto
from mlagents.envs.timers import hierarchical_timer, timed
from typing import Dict, List, NamedTuple
from typing import Dict, List, NamedTuple, Collection
from PIL import Image
logger = logging.getLogger("mlagents.envs")

@timed
def from_agent_proto(
worker_id: int,
agent_info_list: List[AgentInfoProto],
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
brain_params: BrainParameters,
) -> "BrainInfo":
"""

@staticmethod
def _process_visual_observations(
brain_params: BrainParameters, agent_info_list: List[AgentInfoProto]
brain_params: BrainParameters,
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> List[np.ndarray]:
visual_observation_protos: List[List[ObservationProto]] = []

@staticmethod
def _process_vector_observations(
brain_params: BrainParameters, agent_info_list: List[AgentInfoProto]
brain_params: BrainParameters,
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> np.ndarray:
if len(agent_info_list) == 0:
vector_obs = np.zeros(

6
ml-agents/mlagents/trainers/components/bc/module.py


samples_per_update: int = 0,
):
"""
A BC trainer that can be used inline with RL, especially for pretraining.
A BC trainer that can be used inline with RL.
:param policy: The policy of the learning model
:param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate
for the pretrainer.

:param demo_path: The path to the demonstration file.
:param batch_size: The batch size to use during BC training.
:param num_epoch: Number of epochs to train for during each update.
:param samples_per_update: Maximum number of samples to train on during each pretraining update.
:param samples_per_update: Maximum number of samples to train on during each BC update.
"""
self.policy = policy
self.current_lr = policy_learning_rate * strength

@staticmethod
def check_config(config_dict: Dict[str, Any]) -> None:
"""
Check the pretraining config for the required keys.
Check the behavioral_cloning config for the required keys.
:param config_dict: Pretraining section of trainer_config
"""
param_keys = ["strength", "demo_path", "steps"]

4
ml-agents/mlagents/trainers/demo_loader.py


break
pos += next_pos
obs_decoded += 1
if not brain_params:
raise RuntimeError(
f"No BrainParameters found in demonstration file at {file_path}."
)
return brain_params, info_action_pairs, total_expected

9
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
from typing import List, Dict, NamedTuple, Optional
from typing import List, Dict, NamedTuple
from mlagents.trainers.brain import AllBrainInfo, BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo

previous_all_brain_info: Optional[AllBrainInfo]
previous_all_brain_info: AllBrainInfo
brain_name_to_action_info: Optional[Dict[str, ActionInfo]]
brain_name_to_action_info: Dict[str, ActionInfo]
self.brain_name_to_action_info is not None
and brain_name in self.brain_name_to_action_info
brain_name in self.brain_name_to_action_info
and self.brain_name_to_action_info[brain_name].outputs is not None
)

4
ml-agents/mlagents/trainers/models.py


self.running_variance: Optional[tf.Variable] = None
self.update_normalization: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: Optional[tf.Tensor] = None
self.output: Optional[tf.Tensor] = None
self.selected_actions: Optional[tf.Tensor] = None
self.action_holder: Optional[tf.Tensor] = None
@staticmethod
def create_global_steps():

8
ml-agents/mlagents/trainers/ppo/policy.py


with self.graph.as_default():
self.bc_module: Optional[BCModule] = None
# Create pretrainer if needed
if "pretraining" in trainer_params:
BCModule.check_config(trainer_params["pretraining"])
if "behavioral_cloning" in trainer_params:
BCModule.check_config(trainer_params["behavioral_cloning"])
default_num_epoch=trainer_params["num_epoch"],
**trainer_params["pretraining"],
default_num_epoch=3,
**trainer_params["behavioral_cloning"],
)
if load:

2
ml-agents/mlagents/trainers/ppo/trainer.py


else:
bootstrapping_info = next_info
idx = l
value_next = self.policy.get_value_estimates(
value_next = self.ppo_policy.get_value_estimates(
bootstrapping_info,
idx,
next_info.local_done[l] and not next_info.max_reached[l],

6
ml-agents/mlagents/trainers/rl_trainer.py


policy = self.create_policy(brain_parameters)
self.policy = policy
self.policies[brain_parameters.brain_name] = policy
def advance(self):
"""
Eventually logic from TrainerController.advance() will live here.
"""
self.clear_update_buffer()

6
ml-agents/mlagents/trainers/sac/models.py


self.q2_memory_in: Optional[tf.Tensor] = None
self.q1_memory_out: Optional[tf.Tensor] = None
self.q2_memory_out: Optional[tf.Tensor] = None
self.action_holder: Optional[tf.Tensor] = None
self.prev_action: Optional[tf.Tensor] = None
self.action_masks: Optional[tf.Tensor] = None
self.external_action_in: Optional[tf.Tensor] = None

self.all_log_probs: Optional[tf.Tensor] = None
self.selected_actions: Optional[tf.Tensor] = None
self.output: Optional[tf.Tensor] = None
self.output_oh: Optional[tf.Tensor] = None
self.output_pre: Optional[tf.Tensor] = None

self.dones_holder = tf.placeholder(
shape=[None], dtype=tf.float32, name="dones_holder"
)
# This is just a dummy to get pretraining to work. PPO has this but SAC doesn't.
# This is just a dummy to get BC to work. PPO has this but SAC doesn't.
# TODO: Proper input and output specs for models
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"

8
ml-agents/mlagents/trainers/sac/policy.py


with self.graph.as_default():
# Create pretrainer if needed
self.bc_module: Optional[BCModule] = None
if "pretraining" in trainer_params:
BCModule.check_config(trainer_params["pretraining"])
if "behavioral_cloning" in trainer_params:
BCModule.check_config(trainer_params["behavioral_cloning"])
self.bc_module = BCModule(
self,
policy_learning_rate=trainer_params["learning_rate"],

**trainer_params["pretraining"],
**trainer_params["behavioral_cloning"],
if "samples_per_update" in trainer_params["pretraining"]:
if "samples_per_update" in trainer_params["behavioral_cloning"]:
logger.warning(
"Pretraining: Samples Per Update is not a valid setting for SAC."
)

7
ml-agents/mlagents/trainers/sac/trainer.py


for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
if self.policy.bc_module:
update_stats = self.policy.bc_module.update()
bc_module = self.sac_policy.bc_module
if bc_module:
update_stats = bc_module.update()
for stat, val in update_stats.items():
self.stats[stat].append(val)

self.trainer_parameters["batch_size"],
sequence_length=self.policy.sequence_length,
)
update_stats = self.policy.update_reward_signals(
update_stats = self.sac_policy.update_reward_signals(
reward_signal_minibatches, n_sequences
)
for stat_name, value in update_stats.items():

9
ml-agents/mlagents/trainers/simple_env_manager.py


super().__init__()
self.shared_float_properties = float_prop_channel
self.env = env
self.previous_step: EnvironmentStep = EnvironmentStep(None, {}, None)
self.previous_step: EnvironmentStep = EnvironmentStep({}, {}, {})
self.previous_all_action_info: Dict[str, ActionInfo] = {}
def step(self) -> List[EnvironmentStep]:

self.shared_float_properties.set_property(k, v)
self.env.reset()
all_brain_info = self._generate_all_brain_info()
self.previous_step = EnvironmentStep(None, all_brain_info, None)
self.previous_step = EnvironmentStep({}, all_brain_info, {})
return [self.previous_step]
@property

@property
def get_properties(self) -> Dict[str, float]:
reset_params = {}
for k in self.shared_float_properties.list_properties():
reset_params[k] = self.shared_float_properties.get_property(k)
return reset_params
return self.shared_float_properties.get_property_dict_copy()
def close(self):
self.env.close()

9
ml-agents/mlagents/trainers/subprocess_env_manager.py


self.process = process
self.worker_id = worker_id
self.conn = conn
self.previous_step: EnvironmentStep = EnvironmentStep(None, {}, None)
self.previous_step: EnvironmentStep = EnvironmentStep({}, {}, {})
self.previous_all_action_info: Dict[str, ActionInfo] = {}
self.waiting = False

elif cmd.name == "external_brains":
_send_response("external_brains", external_brains())
elif cmd.name == "get_properties":
reset_params = {}
for k in shared_float_properties.list_properties():
reset_params[k] = shared_float_properties.get_property(k)
reset_params = shared_float_properties.get_property_dict_copy()
_send_response("get_properties", reset_params)
elif cmd.name == "reset":
for k, v in cmd.payload.items():

ew.send("reset", config)
# Next (synchronously) collect the reset observations from each worker in sequence
for ew in self.env_workers:
ew.previous_step = EnvironmentStep(None, ew.recv().payload, None)
ew.previous_step = EnvironmentStep({}, ew.recv().payload, {})
return list(map(lambda ew: ew.previous_step, self.env_workers))
@property

29
ml-agents/mlagents/trainers/tests/test_barracuda_converter.py


import os
import yaml
import pytest
from mlagents.trainers.tests.test_bc import create_bc_trainer
def test_barracuda_converter():

# cleanup
os.remove(tmpfile)
@pytest.fixture
def bc_dummy_config():
return yaml.safe_load(
"""
hidden_units: 32
learning_rate: 3.0e-4
num_layers: 1
use_recurrent: false
sequence_length: 32
memory_size: 64
batches_per_epoch: 1
batch_size: 64
summary_freq: 2000
max_steps: 4000
"""
)
@pytest.mark.parametrize("use_lstm", [False, True], ids=["nolstm", "lstm"])
@pytest.mark.parametrize("use_discrete", [True, False], ids=["disc", "cont"])
def test_bc_export(bc_dummy_config, use_lstm, use_discrete):
bc_dummy_config["use_recurrent"] = use_lstm
trainer, env = create_bc_trainer(bc_dummy_config, use_discrete)
trainer.export_model()

14
ml-agents/mlagents/trainers/tests/test_bcmodule.py


summary_freq: 1000
use_recurrent: false
memory_size: 8
pretraining:
behavioral_cloning:
demo_path: ./demos/ExpertPyramid.demo
strength: 1.0
steps: 10000000

tau: 0.005
use_recurrent: false
vis_encode_type: simple
pretraining:
behavioral_cloning:
demo_path: ./demos/ExpertPyramid.demo
strength: 1.0
steps: 10000000

trainer_config["model_path"] = model_path
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["pretraining"]["demo_path"] = (
trainer_config["behavioral_cloning"]["demo_path"] = (
os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
)

env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)
assert policy.bc_module.num_epoch == trainer_config["num_epoch"]
assert policy.bc_module.num_epoch == 3
trainer_config["pretraining"]["num_epoch"] = 100
trainer_config["pretraining"]["batch_size"] = 10000
trainer_config["behavioral_cloning"]["num_epoch"] = 100
trainer_config["behavioral_cloning"]["batch_size"] = 10000
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)

@mock.patch("mlagents.envs.environment.UnityEnvironment")
def test_bcmodule_constant_lr_update(mock_env, trainer_config):
mock_brain = mb.create_mock_3dball_brain()
trainer_config["pretraining"]["steps"] = 0
trainer_config["behavioral_cloning"]["steps"] = 0
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)

2
ml-agents/mlagents/trainers/tests/test_policy.py


policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
no_agent_brain_info = BrainInfo([], [], [], agents=[])
result = policy.get_action(no_agent_brain_info)
assert result == ActionInfo([], [], None)
assert result == ActionInfo([], [], {})
def test_take_action_returns_nones_on_missing_values():

2
ml-agents/mlagents/trainers/tests/test_reward_signals.py


tau: 0.005
use_recurrent: false
vis_encode_type: simple
pretraining:
behavioral_cloning:
demo_path: ./demos/ExpertPyramid.demo
strength: 1.0
steps: 10000000

2
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


new_step_info.previous_all_brain_info[brain_name],
new_step_info.current_all_brain_info[brain_name],
)
trainer_mock.clear_update_buffer.assert_called_once()
trainer_mock.advance.assert_called_once()

62
ml-agents/mlagents/trainers/tests/test_trainer_util.py


import pytest
import yaml
import os
import io
from unittest.mock import patch

from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer
from mlagents.envs.exception import UnityEnvironmentException

@pytest.fixture
def dummy_offline_bc_config():
return yaml.safe_load(
"""
default:
trainer: offline_bc
demo_path: """
+ os.path.dirname(os.path.abspath(__file__))
+ """/test.demo
batches_per_epoch: 16
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
gamma: 0.99
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
use_curiosity: false
curiosity_strength: 0.0
curiosity_enc_size: 1
"""
)
@pytest.fixture
def dummy_offline_bc_config_with_override():
base = dummy_offline_bc_config()
def dummy_config_with_override():
base = dummy_config()
base["testbrain"] = {}
base["testbrain"]["normalize"] = False
return base

train_model = True
load_model = False
seed = 11
expected_reward_buff_cap = 1
base_config = dummy_offline_bc_config_with_override()
base_config = dummy_config_with_override()
expected_config = base_config["default"]
expected_config["summary_path"] = summaries_dir + f"/{run_id}_testbrain"
expected_config["model_path"] = model_path + "/testbrain"

BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": brain_params_mock}
def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id):
def mock_constructor(
self,
brain,
reward_buff_cap,
trainer_parameters,
training,
load,
seed,
run_id,
multi_gpu,
):
self.trainer_metrics = TrainerMetrics("", "")
assert reward_buff_cap == expected_reward_buff_cap
assert multi_gpu == multi_gpu
with patch.object(OfflineBCTrainer, "__init__", mock_constructor):
with patch.object(PPOTrainer, "__init__", mock_constructor):
trainer_factory = trainer_util.TrainerFactory(
trainer_config=base_config,
summaries_dir=summaries_dir,

brain_parameters.brain_name
)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], OfflineBCTrainer)
assert isinstance(trainers["testbrain"], PPOTrainer)
@patch("mlagents.trainers.brain.BrainParameters")

3
ml-agents/mlagents/trainers/tf_policy.py


self.brain = brain
self.use_recurrent = trainer_parameters["use_recurrent"]
self.memory_dict: Dict[str, np.ndarray] = {}
self.reward_signals: Dict[str, "RewardSignal"] = {}
self.num_branches = len(self.brain.vector_action_space_size)
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = trainer_parameters.get("normalize", False)

to be passed to add experiences
"""
if len(brain_info.agents) == 0:
return ActionInfo([], [], None)
return ActionInfo([], [], {})
agents_done = [
agent

3
ml-agents/mlagents/trainers/trainer.py


Gets policy from trainers list of policies
"""
return self.policies[brain_name]
def advance(self) -> None:
pass

5
ml-agents/mlagents/trainers/trainer_controller.py


)
else:
# Avoid memory leak during inference
trainer.clear_update_buffer()
# Eventually this whole block will take place in advance()
# But currently this only calls clear_update_buffer() in RLTrainer
# and nothing in the base class
trainer.advance()
return len(new_step_infos)

11
ml-agents/mlagents/trainers/trainer_util.py


from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.envs.exception import UnityEnvironmentException
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer
class TrainerFactory:

_brain_key = trainer_config[_brain_key]
trainer_parameters.update(trainer_config[_brain_key])
trainer = None
trainer: Trainer = None # type: ignore # will be set to one of these, or raise
trainer = OfflineBCTrainer(
brain_name, trainer_parameters, train_model, load_model, seed, run_id
raise UnityTrainerException(
"The offline_bc trainer has been removed. To train with demonstrations, "
"please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
"Behavioral Cloning feature enabled."
)
elif trainer_parameters["trainer"] == "ppo":
trainer = PPOTrainer(
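To go with the new error message, here is a hedged sketch of what migrating an `offline_bc` configuration might look like: the trainer type becomes `ppo` (or `sac`) and the demonstrations move into a `behavioral_cloning` section and/or a GAIL reward signal. The behavior name, demo path, and values are illustrative and not taken from this commit.

```python
import yaml

# Old style (now rejected by TrainerFactory with a UnityTrainerException):
#   MyBehavior:
#     trainer: offline_bc
#     demo_path: ./demos/Expert.demo
#
# A possible replacement:
migrated = yaml.safe_load(
    """
    MyBehavior:
      trainer: ppo
      behavioral_cloning:
        demo_path: ./demos/Expert.demo
        strength: 0.5
        steps: 10000
      reward_signals:
        gail:
          strength: 0.01
          demo_path: ./demos/Expert.demo
    """
)
```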

30
docs/Training-Behavioral-Cloning.md


# Training with Behavioral Cloning
There are a variety of possible imitation learning algorithms that can be used;
the simplest of them is Behavioral Cloning. It works by collecting demonstrations
from a teacher and then simply using them to directly learn a policy, in the same
way that supervised learning works for image classification
or other traditional machine learning tasks.
## Offline Training
With offline behavioral cloning, we can use demonstrations (`.demo` files)
generated using the `Demonstration Recorder` as the dataset used to train a behavior.
1. Choose an agent you would like to train to imitate a set of demonstrations.
2. Record a set of demonstrations using the `Demonstration Recorder` (see [here](Training-Imitation-Learning.md)).
For illustrative purposes we will refer to this file as `AgentRecording.demo`.
3. Build the scene (make sure the Agent is not using its heuristic).
4. Open the `config/offline_bc_config.yaml` file.
5. Modify the `demo_path` parameter in the file to reference the path to the
demonstration file recorded in step 2. In our case this is:
`./UnitySDK/Assets/Demonstrations/AgentRecording.demo`
6. Launch `mlagents-learn`, providing `./config/offline_bc_config.yaml`
as the config parameter, and include the `--run-id` and `--train` flags as usual.
Provide your environment as the `--env` parameter if it has been compiled
as a standalone, or omit it to train in the Editor.
7. (Optional) Observe training performance using TensorBoard.
This will use the demonstration file to train a neural network driven agent
to directly imitate the actions provided in the demonstration. The environment
will launch and be used for evaluating the agent's performance during training.

236
ml-agents/mlagents/trainers/tests/test_bc.py


import unittest.mock as mock
import pytest
import os
import numpy as np
from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.bc.models import BehavioralCloningModel
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.bc.offline_trainer import BCTrainer
from mlagents.envs.mock_communicator import MockCommunicator
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.envs.environment import UnityEnvironment
from mlagents.trainers.brain_conversion_utils import (
step_result_to_brain_info,
group_spec_to_brain_parameters,
)
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
hidden_units: 32
learning_rate: 3.0e-4
num_layers: 1
use_recurrent: false
sequence_length: 32
memory_size: 32
batches_per_epoch: 100 # Force code to use all possible batches
batch_size: 32
summary_freq: 2000
max_steps: 4000
"""
)
def create_bc_trainer(dummy_config, is_discrete=False, use_recurrent=False):
mock_env = mock.Mock()
if is_discrete:
mock_brain = mb.create_mock_pushblock_brain()
mock_braininfo = mb.create_mock_braininfo(
num_agents=12, num_vector_observations=70
)
else:
mock_brain = mb.create_mock_3dball_brain()
mock_braininfo = mb.create_mock_braininfo(
num_agents=12, num_vector_observations=8
)
mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
env = mock_env()
trainer_parameters = dummy_config
trainer_parameters["summary_path"] = "tmp"
trainer_parameters["model_path"] = "tmp"
trainer_parameters["demo_path"] = (
os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
)
trainer_parameters["use_recurrent"] = use_recurrent
trainer = BCTrainer(
mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
)
trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)
return trainer, env
@pytest.mark.parametrize("use_recurrent", [True, False])
def test_bc_trainer_step(dummy_config, use_recurrent):
trainer, env = create_bc_trainer(dummy_config, use_recurrent=use_recurrent)
# Test get_step
assert trainer.get_step == 0
# Test update policy
trainer.update_policy()
assert len(trainer.stats["Losses/Cloning Loss"]) > 0
# Test increment step
trainer.increment_step(1)
assert trainer.step == 1
def test_bc_trainer_add_proc_experiences(dummy_config):
trainer, env = create_bc_trainer(dummy_config)
# Test add_experiences
returned_braininfo = env.step()
brain_name = "Ball3DBrain"
trainer.add_experiences(
returned_braininfo[brain_name], returned_braininfo[brain_name], {}
) # Take action outputs is not used
for agent_id in returned_braininfo[brain_name].agents:
assert trainer.evaluation_buffer[agent_id].last_brain_info is not None