
[refactor] Remove BrainParameters from Python code (#4138)

/MLA-1734-demo-provider
GitHub, 4 years ago
Current commit
3bcb029b
38 changed files with 466 additions and 819 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (2)
  2. ml-agents/mlagents/trainers/behavior_id_utils.py (7)
  3. ml-agents/mlagents/trainers/components/bc/model.py (2)
  4. ml-agents/mlagents/trainers/components/bc/module.py (8)
  5. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (35)
  6. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (32)
  7. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (2)
  8. ml-agents/mlagents/trainers/demo_loader.py (74)
  9. ml-agents/mlagents/trainers/env_manager.py (5)
  10. ml-agents/mlagents/trainers/ghost/trainer.py (8)
  11. ml-agents/mlagents/trainers/models.py (54)
  12. ml-agents/mlagents/trainers/policy/nn_policy.py (7)
  13. ml-agents/mlagents/trainers/policy/tf_policy.py (42)
  14. ml-agents/mlagents/trainers/ppo/trainer.py (8)
  15. ml-agents/mlagents/trainers/sac/network.py (5)
  16. ml-agents/mlagents/trainers/sac/trainer.py (6)
  17. ml-agents/mlagents/trainers/simple_env_manager.py (13)
  18. ml-agents/mlagents/trainers/subprocess_env_manager.py (22)
  19. ml-agents/mlagents/trainers/tests/mock_brain.py (187)
  20. ml-agents/mlagents/trainers/tests/test_agent_processor.py (38)
  21. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (4)
  22. ml-agents/mlagents/trainers/tests/test_bcmodule.py (37)
  23. ml-agents/mlagents/trainers/tests/test_demo_loader.py (70)
  24. ml-agents/mlagents/trainers/tests/test_ghost.py (100)
  25. ml-agents/mlagents/trainers/tests/test_nn_policy.py (49)
  26. ml-agents/mlagents/trainers/tests/test_ppo.py (138)
  27. ml-agents/mlagents/trainers/tests/test_reward_signals.py (17)
  28. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (19)
  29. ml-agents/mlagents/trainers/tests/test_sac.py (86)
  30. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (6)
  31. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (4)
  32. ml-agents/mlagents/trainers/tests/test_trainer_util.py (29)
  33. ml-agents/mlagents/trainers/tests/test_trajectory.py (4)
  34. ml-agents/mlagents/trainers/trainer/trainer.py (6)
  35. ml-agents/mlagents/trainers/trainer_controller.py (4)
  36. ml-agents/mlagents/trainers/tests/test_models.py (36)
  37. ml-agents/mlagents/trainers/brain.py (88)
  38. ml-agents/mlagents/trainers/brain_conversion_utils.py (31)

ml-agents/mlagents/trainers/agent_processor.py (2)


from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents.trainers.behavior_id_utils import get_global_agent_id
T = TypeVar("T")

ml-agents/mlagents/trainers/behavior_id_utils.py (7)


:return: name_behavior_id
"""
return name + "?team=" + str(team_id)
def get_global_agent_id(worker_id: int, agent_id: int) -> str:
"""
Create an agent id that is unique across environment workers using the worker_id.
"""
return f"${worker_id}-{agent_id}"

ml-agents/mlagents/trainers/components/bc/model.py (2)


self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
if self.policy.brain.vector_action_space_type == "continuous":
if self.policy.behavior_spec.is_action_continuous():
action_length = self.policy.act_size[0]
self.action_in_expert = tf.placeholder(
shape=[None, action_length], dtype=tf.float32

ml-agents/mlagents/trainers/components/bc/module.py (8)


self.current_lr = policy_learning_rate * settings.strength
self.model = BCModel(policy, self.current_lr, settings.steps)
_, self.demonstration_buffer = demo_to_buffer(
settings.demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.behavior_spec
)
self.batch_size = (

self.policy.sequence_length_ph: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
if not self.policy.use_continuous_act:
if self.policy.behavior_spec.is_action_discrete():
sum(self.policy.brain.vector_action_space_size),
sum(self.policy.behavior_spec.discrete_action_branches),
if self.policy.brain.vector_observation_space_size > 0:
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
for i, _ in enumerate(self.policy.visual_in):
feed_dict[self.policy.visual_in[i]] = mini_batch_demo["visual_obs%d" % i]

ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (35)


encoded_state_list = []
encoded_next_state_list = []
if self.policy.vis_obs_size > 0:
self.next_visual_in = []
# Create input ops for next (t+1) visual observations.
self.next_vector_in, self.next_visual_in = ModelUtils.create_input_placeholders(
self.policy.behavior_spec.observation_shapes, name_prefix="curiosity_next_"
)
if self.next_visual_in:
for i in range(self.policy.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = ModelUtils.create_visual_input(
self.policy.brain.camera_resolutions[i],
name="curiosity_next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
for i, (vis_in, next_vis_in) in enumerate(
zip(self.policy.visual_in, self.next_visual_in)
):
self.policy.visual_in[i],
vis_in,
self.encoding_size,
ModelUtils.swish,
1,

encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
next_vis_in,
self.encoding_size,
ModelUtils.swish,
1,

encoded_next_state_list.append(hidden_next_visual)
if self.policy.vec_obs_size > 0:
# Create the encoder ops for current and next vector input.
# Note that these encoders are siamese.
# Create input op for next (t+1) vector observation.
self.next_vector_in = tf.placeholder(
shape=[None, self.policy.vec_obs_size],
dtype=tf.float32,
name="curiosity_next_vector_observation",
)
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.policy.vector_in,
self.encoding_size,

)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)
encoded_state = tf.concat(encoded_state_list, axis=1)
encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
return encoded_state, encoded_next_state

"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.policy.brain.vector_action_space_type == "continuous":
if self.policy.behavior_spec.is_action_continuous():
pred_action = tf.layers.dense(
hidden, self.policy.act_size[0], activation=None
)

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (32)


from typing import List, Optional, Tuple
from typing import Optional, Tuple
from mlagents.tf_utils import tf

self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
if self.policy.brain.vector_action_space_type == "continuous":
if self.policy.behavior_spec.is_action_continuous():
action_length = self.policy.act_size[0]
self.action_in_expert = tf.placeholder(
shape=[None, action_length], dtype=tf.float32

encoded_policy_list = []
encoded_expert_list = []
(
self.obs_in_expert,
self.expert_visual_in,
) = ModelUtils.create_input_placeholders(
self.policy.behavior_spec.observation_shapes, "gail_"
)
self.obs_in_expert = tf.placeholder(
shape=[None, self.policy.vec_obs_size], dtype=tf.float32
)
if self.policy.normalize:
encoded_expert_list.append(
ModelUtils.normalize_vector_obs(

encoded_expert_list.append(self.obs_in_expert)
encoded_policy_list.append(self.policy.vector_in)
if self.policy.vis_obs_size > 0:
self.expert_visual_in: List[tf.Tensor] = []
if self.expert_visual_in:
for i in range(self.policy.vis_obs_size):
# Create input ops for next (t+1) visual observations.
visual_input = ModelUtils.create_visual_input(
self.policy.brain.camera_resolutions[i],
name="gail_visual_observation_" + str(i),
)
self.expert_visual_in.append(visual_input)
for i, (vis_in, exp_vis_in) in enumerate(
zip(self.policy.visual_in, self.expert_visual_in)
):
self.policy.visual_in[i],
vis_in,
self.encoding_size,
ModelUtils.swish,
1,

encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
self.expert_visual_in[i],
exp_vis_in,
self.encoding_size,
ModelUtils.swish,
1,

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (2)


settings.use_vail,
)
_, self.demonstration_buffer = demo_to_buffer(
settings.demo_path, policy.sequence_length, policy.brain
settings.demo_path, policy.sequence_length, policy.behavior_spec
)
self.has_updated = False
self.update_dict: Dict[str, tf.Tensor] = {

ml-agents/mlagents/trainers/demo_loader.py (74)


from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)

@timed
def demo_to_buffer(
file_path: str, sequence_length: int, expected_brain_params: BrainParameters = None
) -> Tuple[BrainParameters, AgentBuffer]:
file_path: str, sequence_length: int, expected_behavior_spec: BehaviorSpec = None
) -> Tuple[BehaviorSpec, AgentBuffer]:
"""
Loads demonstration file and uses it to fill training buffer.
:param file_path: Location of demonstration file (.demo).

behavior_spec, info_action_pair, _ = load_demonstration(file_path)
demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length)
brain_params = behavior_spec_to_brain_parameters("DemoBrain", behavior_spec)
if expected_brain_params:
if expected_behavior_spec:
if (
brain_params.vector_action_space_size
!= expected_brain_params.vector_action_space_size
):
if behavior_spec.action_shape != expected_behavior_spec.action_shape:
brain_params.vector_action_space_size,
expected_brain_params.vector_action_space_size,
behavior_spec.action_shape, expected_behavior_spec.action_shape
if (
brain_params.vector_action_space_type
!= expected_brain_params.vector_action_space_type
):
if behavior_spec.action_type != expected_behavior_spec.action_type:
brain_params.vector_action_space_type,
expected_brain_params.vector_action_space_type,
behavior_spec.action_type, expected_behavior_spec.action_type
# check number of vector observations in demonstration match
if (
brain_params.vector_observation_space_size
!= expected_brain_params.vector_observation_space_size
# check observations match
if len(behavior_spec.observation_shapes) != len(
expected_behavior_spec.observation_shapes
"The vector observation dimensions of {} in demonstration do not match the policy's {}.".format(
brain_params.vector_observation_space_size,
expected_brain_params.vector_observation_space_size,
)
"The demonstrations do not have the same number of observations as the policy."
# check number of visual observations/resolutions in demonstration match
if (
brain_params.number_visual_observations
!= expected_brain_params.number_visual_observations
):
raise RuntimeError(
"Number of visual observations {} in demonstrations do not match the policy's {}.".format(
brain_params.number_visual_observations,
expected_brain_params.number_visual_observations,
else:
for i, (demo_obs, policy_obs) in enumerate(
zip(
behavior_spec.observation_shapes,
expected_behavior_spec.observation_shapes,
)
for i, (resolution, expected_resolution) in enumerate(
zip(
brain_params.camera_resolutions,
expected_brain_params.camera_resolutions,
)
):
if resolution != expected_resolution:
raise RuntimeError(
"The resolution of visual observation {} in demonstrations do not match the policy's.".format(
i
):
if demo_obs != policy_obs:
raise RuntimeError(
f"The shape {demo_obs} for observation {i} in demonstration \
do not match the policy's {policy_obs}."
)
return brain_params, demo_buffer
return behavior_spec, demo_buffer
def get_demo_files(path: str) -> List[str]:

@timed
def load_demonstration(
file_path: str
) -> Tuple[BrainParameters, List[AgentInfoActionPairProto], int]:
file_path: str,
) -> Tuple[BehaviorSpec, List[AgentInfoActionPairProto], int]:
"""
Loads and parses a demonstration file.
:param file_path: Location of demonstration file (.demo).
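As an illustration of this file's refactor, a small sketch of the new entry point: demo_to_buffer now returns a BehaviorSpec alongside the buffer, and the optional expected spec triggers the RuntimeError checks shown above on a shape or type mismatch. The .demo path below is a hypothetical placeholder.

    # Sketch only, using the post-refactor signature shown in this hunk.
    from mlagents.trainers.demo_loader import demo_to_buffer

    behavior_spec, demo_buffer = demo_to_buffer("demos/3DBall.demo", sequence_length=1)
    print(behavior_spec.observation_shapes)   # e.g. [(8,)]
    print(len(demo_buffer["actions"]))        # number of recorded expert actions

    # Passing the policy's own spec enforces the compatibility checks above:
    # demo_to_buffer("demos/3DBall.demo", 1, expected_behavior_spec=behavior_spec)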

ml-agents/mlagents/trainers/env_manager.py (5)


)
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
from mlagents.trainers.action_info import ActionInfo

@property
@abstractmethod
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]:
pass
@abstractmethod

self._process_step_infos(self.first_step_infos)
self.first_step_infos = None
# Get new policies if found. Always get the latest policy.
for brain_name in self.external_brains:
for brain_name in self.training_behaviors:
_policy = None
try:
# We make sure to empty the policy queue before continuing to produce steps.

ml-agents/mlagents/trainers/ghost/trainer.py (8)


import numpy as np
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.brain import BrainParameters
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy

self.trainer.export_model(brain_name)
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
"""
Creates policy with the wrapped trainer's create_policy function

wrapped trainer to be trained.
"""
policy = self.trainer.create_policy(parsed_behavior_id, brain_parameters)
policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec)
policy.create_tf_graph()
policy.initialize_or_load()
policy.init_load_weights()

# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
internal_trainer_policy = self.trainer.create_policy(
parsed_behavior_id, brain_parameters
parsed_behavior_id, behavior_spec
)
self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy)
internal_trainer_policy.init_load_weights()

ml-agents/mlagents/trainers/models.py (54)


from mlagents.tf_utils import tf
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.brain import CameraResolution
ActivationFunction = Callable[[tf.Tensor], tf.Tensor]
EncoderFunction = Callable[

EPSILON = 1e-7
class Tensor3DShape(NamedTuple):
height: int
width: int
num_channels: int
class EncoderType(Enum):

return tf.multiply(input_activation, tf.nn.sigmoid(input_activation))
@staticmethod
def create_visual_input(
camera_parameters: CameraResolution, name: str
) -> tf.Tensor:
def create_visual_input(camera_parameters: Tensor3DShape, name: str) -> tf.Tensor:
"""
Creates image input op.
:param camera_parameters: Parameters for visual observation.

return visual_in
@staticmethod
def create_visual_input_placeholders(
camera_resolutions: List[CameraResolution]
) -> List[tf.Tensor]:
def create_input_placeholders(
observation_shapes: List[Tuple], name_prefix: str = ""
) -> Tuple[tf.Tensor, List[tf.Tensor]]:
:param camera_resolutions: A List of CameraResolutions that specify the resolutions
of the input visual observations.
:param observation_shapes: A List of tuples that specify the resolutions
of the input observations. Tuples for now are restricted to 1D (vector) or 3D (Tensor)
:param name_prefix: A name prefix to add to the placeholder names. This is used so that there
is no conflict when creating multiple placeholder sets.
for i, camera_resolution in enumerate(camera_resolutions):
visual_input = ModelUtils.create_visual_input(
camera_resolution, name="visual_observation_" + str(i)
)
visual_in.append(visual_input)
return visual_in
vector_in_size = 0
for i, dimension in enumerate(observation_shapes):
if len(dimension) == 3:
_res = Tensor3DShape(
height=dimension[0], width=dimension[1], num_channels=dimension[2]
)
visual_input = ModelUtils.create_visual_input(
_res, name=name_prefix + "visual_observation_" + str(i)
)
visual_in.append(visual_input)
elif len(dimension) == 1:
vector_in_size += dimension[0]
else:
raise UnityTrainerException(
f"Unsupported shape of {dimension} for observation {i}"
)
vector_in = tf.placeholder(
shape=[None, vector_in_size],
dtype=tf.float32,
name=name_prefix + "vector_observation",
)
return vector_in, visual_in
@staticmethod
def create_vector_input(

)
visual_encoders.append(encoded_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)
if vector_in.get_shape()[-1] > 0: # Don't encode 0-shape inputs
if vector_in.get_shape()[-1] > 0:
# Don't encode non-existant or 0-shape inputs
hidden_state = ModelUtils.create_vector_observation_encoder(
vector_observation_input,
h_size,
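A minimal sketch of the new ModelUtils.create_input_placeholders entry point, assuming the shape convention described above (1-D tuples are folded into one vector placeholder, 3-D tuples become visual placeholders) and the TF1-style graph API used throughout this file.

    # Sketch only: driving the refactored placeholder creation from observation shapes.
    from mlagents.tf_utils import tf
    from mlagents.trainers.models import ModelUtils

    observation_shapes = [(84, 84, 3), (8,)]   # one visual and one vector observation
    with tf.Graph().as_default():
        vector_in, visual_in = ModelUtils.create_input_placeholders(
            observation_shapes, name_prefix="example_"
        )
        # vector_in: float32 placeholder of shape [None, 8]
        # visual_in: [float32 placeholder of shape [None, 84, 84, 3]]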

ml-agents/mlagents/trainers/policy/nn_policy.py (7)


from typing import Any, Dict, Optional, List
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.brain import BrainParameters
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy

def __init__(
self,
seed: int,
brain: BrainParameters,
behavior_spec: BehaviorSpec,
trainer_params: TrainerSettings,
is_training: bool,
model_path: str,

:param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
"""
super().__init__(seed, brain, trainer_params, model_path, load)
super().__init__(seed, behavior_spec, trainer_params, model_path, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = self.network_settings.num_layers

ml-agents/mlagents/trainers/policy/tf_policy.py (42)


from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers import __version__

def __init__(
self,
seed: int,
brain: BrainParameters,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,

self.update_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.brain = brain
self.behavior_spec = behavior_spec
self.act_size = brain.vector_action_space_size
self.vec_obs_size = brain.vector_observation_space_size
self.vis_obs_size = brain.number_visual_observations
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.num_branches = len(self.brain.vector_action_space_size)
self.num_branches = self.behavior_spec.action_size
self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.use_continuous_act = behavior_spec.is_action_continuous()
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints

feed_dict[self.vector_in] = vec_vis_obs.vector_observations
if not self.use_continuous_act:
mask = np.ones(
(len(batched_step_result), np.sum(self.brain.vector_action_space_size)),
(
len(batched_step_result),
sum(self.behavior_spec.discrete_action_branches),
),
dtype=np.float32,
)
if batched_step_result.action_mask is not None:

self.increment_step_op,
self.steps_to_increment,
) = ModelUtils.create_global_steps()
self.visual_in = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
self.behavior_spec.observation_shapes
self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
self.update_normalization_op = normalization_tensors.update_op

self.mask = tf.cast(self.mask_input, tf.int32)
tf.Variable(
int(self.brain.vector_action_space_type == "continuous"),
int(self.behavior_spec.is_action_continuous()),
name="is_continuous_control",
trainable=False,
dtype=tf.int32,

tf.Variable(
self.m_size, name="memory_size", trainable=False, dtype=tf.int32
)
if self.brain.vector_action_space_type == "continuous":
if self.behavior_spec.is_action_continuous():
tf.Variable(
self.act_size[0],
name="action_output_shape",

ml-agents/mlagents/trainers/ppo/trainer.py (8)


import numpy as np
from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory

return True
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
:param brain_parameters: specifications for policy construction
:param behavior_spec: specifications for policy construction
brain_parameters,
behavior_spec,
self.trainer_settings,
self.is_training,
self.artifact_path,
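A hedged end-to-end sketch of the refactored create_policy flow, following the trainer construction and policy registration calls in the test hunks further down; the explicit TrainerSettings/PPOSettings configuration is an assumption for illustration.

    # Sketch only: a PPO policy built from a BehaviorSpec rather than BrainParameters.
    from mlagents.trainers.ppo.trainer import PPOTrainer
    from mlagents.trainers.settings import TrainerSettings, PPOSettings
    from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs

    spec = setup_test_behavior_specs(
        use_discrete=False, use_visual=False, vector_action_space=2, vector_obs_space=8
    )
    settings = TrainerSettings(hyperparameters=PPOSettings())  # assumed minimal config
    trainer = PPOTrainer("test", 0, settings, True, False, 0, "0")
    policy = trainer.create_policy("test", spec)
    trainer.add_policy("test", policy)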

ml-agents/mlagents/trainers/sac/network.py (5)


vis_encode_type,
)
with tf.variable_scope(TARGET_SCOPE):
self.visual_in = ModelUtils.create_visual_input_placeholders(
policy.brain.camera_resolutions
self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
self.policy.behavior_spec.observation_shapes
self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
if self.policy.normalize:
normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
self.update_normalization_op = normalization_tensors.update_op

ml-agents/mlagents/trainers/sac/trainer.py (6)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, SACSettings

return policy_was_updated
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
brain_parameters,
behavior_spec,
self.trainer_settings,
self.is_training,
self.artifact_path,

ml-agents/mlagents/trainers/simple_env_manager.py (13)


from typing import Dict, List
from mlagents_envs.base_env import BaseEnv, BehaviorName
from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
class SimpleEnvManager(EnvManager):

v.apply(k, self.env_params)
@property
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
result = {}
for behavior_name, behavior_spec in self.env.behavior_specs.items():
result[behavior_name] = behavior_spec_to_brain_parameters(
behavior_name, behavior_spec
)
return result
def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]:
return self.env.behavior_specs
def close(self):
self.env.close()

ml-agents/mlagents/trainers/subprocess_env_manager.py (22)


from multiprocessing import Process, Pipe, Queue
from multiprocessing.connection import Connection
from queue import Empty as EmptyQueueException
from mlagents_envs.base_env import BaseEnv, BehaviorName
from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
from mlagents_envs import logging_util
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents_envs.timers import (

reset_timers,
get_timer_root,
)
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.settings import ParameterRandomizationSettings
from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.side_channel.environment_parameters_channel import (

StatsAggregationMethod,
)
from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents.trainers.brain_conversion_utils import behavior_spec_to_brain_parameters
logger = logging_util.get_logger(__name__)

STEP = 1
EXTERNAL_BRAINS = 2
BEHAVIOR_SPECS = 2
ENVIRONMENT_PARAMETERS = 3
RESET = 4
CLOSE = 5

all_step_result[brain_name] = env.get_steps(brain_name)
return all_step_result
def external_brains():
result = {}
for behavior_name, behavior_specs in env.behavior_specs.items():
result[behavior_name] = behavior_spec_to_brain_parameters(
behavior_name, behavior_specs
)
return result
try:
env = env_factory(
worker_id, [env_parameters, engine_configuration_channel, stats_channel]

)
)
reset_timers()
elif req.cmd == EnvironmentCommand.EXTERNAL_BRAINS:
_send_response(EnvironmentCommand.EXTERNAL_BRAINS, external_brains())
elif req.cmd == EnvironmentCommand.BEHAVIOR_SPECS:
_send_response(EnvironmentCommand.BEHAVIOR_SPECS, env.behavior_specs)
elif req.cmd == EnvironmentCommand.ENVIRONMENT_PARAMETERS:
for k, v in req.payload.items():
if isinstance(v, float):

ew.send(EnvironmentCommand.ENVIRONMENT_PARAMETERS, config)
@property
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
self.env_workers[0].send(EnvironmentCommand.EXTERNAL_BRAINS)
def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]:
self.env_workers[0].send(EnvironmentCommand.BEHAVIOR_SPECS)
return self.env_workers[0].recv().payload
def close(self) -> None:
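On the trainer side, the renamed property is the only change consumers see: training_behaviors now yields BehaviorSpec objects keyed by behavior name. A small sketch with a hypothetical, already constructed EnvManager instance.

    # Sketch only: reading behavior specs through the renamed property.
    # "env_manager" stands for any EnvManager (e.g. SimpleEnvManager or
    # SubprocessEnvManager) constructed elsewhere.
    from typing import Dict
    from mlagents_envs.base_env import BehaviorName, BehaviorSpec

    def describe_behaviors(env_manager) -> Dict[BehaviorName, BehaviorSpec]:
        behaviors = env_manager.training_behaviors
        for name, spec in behaviors.items():
            print(name, spec.observation_shapes, spec.action_type)
        return behaviors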

ml-agents/mlagents/trainers/tests/mock_brain.py (187)


from unittest import mock
from typing import List, Tuple
from typing import List, Tuple, Union
from collections.abc import Iterable
from mlagents.trainers.brain import CameraResolution, BrainParameters
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (

)
def create_mock_brainparams(
number_visual_observations=0,
vector_action_space_type="continuous",
vector_observation_space_size=3,
vector_action_space_size=None,
):
"""
Creates a mock BrainParameters object with parameters.
"""
# Avoid using mutable object as default param
if vector_action_space_size is None:
vector_action_space_size = [2]
mock_brain = mock.Mock()
mock_brain.return_value.number_visual_observations = number_visual_observations
mock_brain.return_value.vector_action_space_type = vector_action_space_type
mock_brain.return_value.vector_observation_space_size = (
vector_observation_space_size
)
camrez = CameraResolution(height=84, width=84, num_channels=3)
mock_brain.return_value.camera_resolutions = [camrez] * number_visual_observations
mock_brain.return_value.vector_action_space_size = vector_action_space_size
mock_brain.return_value.brain_name = "MockBrain"
return mock_brain()
num_agents: int = 1,
num_vector_observations: int = 0,
num_vis_observations: int = 0,
action_shape: List[int] = None,
num_agents: int,
observation_shapes: List[Tuple],
action_shape: Union[int, Tuple[int]] = None,
discrete: bool = False,
done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:

:int num_agents: Number of "agents" to imitate.
:int num_vector_observations: Number of "observations" in your observation space
:int num_vis_observations: Number of "observations" in your observation space
:List observation_shapes: A List of the observation spaces in your steps
action_shape = [2]
action_shape = 2
for _ in range(num_vis_observations):
obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
if num_vector_observations > 1:
obs_list.append(
np.array(num_agents * [num_vector_observations * [1]], dtype=np.float32)
)
for _shape in observation_shapes:
obs_list.append(np.ones((num_agents,) + _shape, dtype=np.float32))
if discrete:
if discrete and isinstance(action_shape, Iterable):
for action_size in action_shape
]
for action_size in action_shape # type: ignore
] # type: ignore
[(84, 84, 3)] * num_vis_observations + [(num_vector_observations, 0, 0)],
observation_shapes,
action_shape if discrete else action_shape[0],
action_shape,
)
if done:
return (

)
def create_steps_from_brainparams(
brain_params: BrainParameters, num_agents: int = 1
def create_steps_from_behavior_spec(
behavior_spec: BehaviorSpec, num_agents: int = 1
num_vector_observations=brain_params.vector_observation_space_size,
num_vis_observations=brain_params.number_visual_observations,
action_shape=brain_params.vector_action_space_size,
discrete=brain_params.vector_action_space_type == "discrete",
observation_shapes=behavior_spec.observation_shapes,
action_shape=behavior_spec.action_shape,
discrete=behavior_spec.is_action_discrete(),
observation_shapes: List[Tuple],
vec_obs_size: int = 1,
num_vis_obs: int = 1,
action_space: List[int] = None,
action_space: Union[int, Tuple[int]] = 2,
memory_size: int = 10,
is_discrete: bool = True,
) -> Trajectory:

"""
if action_space is None:
action_space = [2]
for _j in range(num_vis_obs):
obs.append(np.ones((84, 84, 3), dtype=np.float32))
obs.append(np.ones(vec_obs_size, dtype=np.float32))
for _shape in observation_shapes:
obs.append(np.ones(_shape, dtype=np.float32))
action_size = len(action_space)
action_size = len(action_space) # type: ignore
action_size = action_space[0]
action_size = int(action_space) # type: ignore
[[False for _ in range(branch)] for branch in action_space]
[[False for _ in range(branch)] for branch in action_space] # type: ignore
if is_discrete
else None
)

def simulate_rollout(
length: int,
brain_params: BrainParameters,
behavior_spec: BehaviorSpec,
vec_obs_size = brain_params.vector_observation_space_size
num_vis_obs = brain_params.number_visual_observations
action_space = brain_params.vector_action_space_size
is_discrete = brain_params.vector_action_space_type == "discrete"
action_space = behavior_spec.action_shape
is_discrete = behavior_spec.is_action_discrete()
vec_obs_size=vec_obs_size,
num_vis_obs=num_vis_obs,
behavior_spec.observation_shapes,
action_space=action_space,
memory_size=memory_size,
is_discrete=is_discrete,

return buffer
def setup_mock_brain(
use_discrete,
use_visual,
discrete_action_space=None,
vector_action_space=None,
vector_obs_space=8,
def setup_test_behavior_specs(
use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
# defaults
discrete_action_space = (
[3, 3, 3, 2] if discrete_action_space is None else discrete_action_space
behavior_spec = BehaviorSpec(
[(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)],
ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS,
tuple(vector_action_space) if use_discrete else vector_action_space,
vector_action_space = [2] if vector_action_space is None else vector_action_space
return behavior_spec
if not use_visual:
mock_brain = create_mock_brainparams(
vector_action_space_type="discrete" if use_discrete else "continuous",
vector_action_space_size=discrete_action_space
if use_discrete
else vector_action_space,
vector_observation_space_size=vector_obs_space,
)
else:
mock_brain = create_mock_brainparams(
vector_action_space_type="discrete" if use_discrete else "continuous",
vector_action_space_size=discrete_action_space
if use_discrete
else vector_action_space,
vector_observation_space_size=0,
number_visual_observations=1,
)
return mock_brain
def create_mock_3dball_brain():
mock_brain = create_mock_brainparams(
vector_action_space_type="continuous",
vector_action_space_size=[2],
vector_observation_space_size=8,
def create_mock_3dball_behavior_specs():
return setup_test_behavior_specs(
False, False, vector_action_space=2, vector_obs_space=8
mock_brain.brain_name = "Ball3DBrain"
return mock_brain
def create_mock_pushblock_brain():
mock_brain = create_mock_brainparams(
vector_action_space_type="discrete",
vector_action_space_size=[7],
vector_observation_space_size=70,
)
mock_brain.brain_name = "PushblockLearning"
return mock_brain
def create_mock_banana_brain():
mock_brain = create_mock_brainparams(
number_visual_observations=1,
vector_action_space_type="discrete",
vector_action_space_size=[3, 3, 3, 2],
vector_observation_space_size=0,
def create_mock_pushblock_behavior_specs():
return setup_test_behavior_specs(
True, False, vector_action_space=7, vector_obs_space=70
return mock_brain
def make_brain_parameters(
discrete_action: bool = False,
visual_inputs: int = 0,
brain_name: str = "RealFakeBrain",
vec_obs_size: int = 6,
) -> BrainParameters:
resolutions = [
CameraResolution(width=30, height=40, num_channels=3)
for _ in range(visual_inputs)
]
return BrainParameters(
vector_observation_space_size=vec_obs_size,
camera_resolutions=resolutions,
vector_action_space_size=[2],
vector_action_descriptions=["", ""],
vector_action_space_type=int(not discrete_action),
brain_name=brain_name,
def create_mock_banana_behavior_specs():
return setup_test_behavior_specs(
True, True, vector_action_space=[3, 3, 3, 2], vector_obs_space=0
)
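The old BrainParameters mocks are replaced by real BehaviorSpec instances; a short sketch of the new test fixtures in use, mirroring the constructor call and fixture arguments shown above.

    # Sketch only: building BehaviorSpec test fixtures through the new helpers.
    from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs

    # Continuous control with an 8-dimensional vector observation (the 3DBall fixture):
    ball_spec = setup_test_behavior_specs(
        use_discrete=False, use_visual=False, vector_action_space=2, vector_obs_space=8
    )
    # Four discrete branches plus a visual observation (the "banana" fixture):
    banana_spec = setup_test_behavior_specs(
        use_discrete=True, use_visual=True, vector_action_space=[3, 3, 3, 2], vector_obs_space=0
    )
    assert ball_spec.is_action_continuous()
    assert banana_spec.discrete_action_branches == (3, 3, 3, 2)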

ml-agents/mlagents/trainers/tests/test_agent_processor.py (38)


from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.stats import StatsReporter, StatsSummary
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents.trainers.behavior_id_utils import get_global_agent_id
def create_mock_brain():
mock_brain = mb.create_mock_brainparams(
vector_action_space_type="continuous",
vector_action_space_size=[2],
vector_observation_space_size=8,
number_visual_observations=1,
)
return mock_brain
def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}

}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,
num_vector_observations=8,
action_shape=[2],
num_vis_observations=num_vis_obs,
observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
action_shape=2,
)
fake_action_info = ActionInfo(
action=[0.1, 0.1],

# Test empty steps
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=0,
num_vector_observations=8,
action_shape=[2],
num_vis_observations=num_vis_obs,
observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
action_shape=2,
)
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo([], [], {}, [])

"log_probs": [0.1],
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
num_vector_observations=8,
action_shape=[2],
num_vis_observations=0,
num_agents=1, observation_shapes=[(8,)], action_shape=2
num_agents=1,
num_vector_observations=8,
action_shape=[2],
num_vis_observations=0,
done=True,
num_agents=1, observation_shapes=[(8,)], action_shape=2, done=True
)
fake_action_info = ActionInfo(
action=[0.1],

"log_probs": [0.1],
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
num_vector_observations=8,
action_shape=[2],
num_vis_observations=0,
num_agents=1, observation_shapes=[(8,)], action_shape=2
)
fake_action_info = ActionInfo(
action=[0.1],

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (4)


use_visual=visual,
)
policy.save_model(1000)
settings = SerializationSettings(
policy.model_path, os.path.join(tmpdir, policy.brain.brain_name)
)
settings = SerializationSettings(policy.model_path, os.path.join(tmpdir, "test"))
export_policy_model(settings, policy.graph, policy.sess)
# These checks taken from test_barracuda_converter

ml-agents/mlagents/trainers/tests/test_bcmodule.py (37)


)
def create_bc_module(mock_brain, bc_settings, use_rnn, tanhresample):
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
# model_path = env.external_brain_names[0]
trainer_config = TrainerSettings()
trainer_config.network_settings.memory = (

0, mock_brain, trainer_config, False, "test", False, tanhresample, tanhresample
0,
mock_behavior_specs,
trainer_config,
False,
"test",
False,
tanhresample,
tanhresample,
)
with policy.graph.as_default():
bc_module = BCModule(

# Test default values
def test_bcmodule_defaults():
# See if default values match
mock_brain = mb.create_mock_3dball_brain()
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 3
assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
# Assign strange values and see if it overrides properly

batch_size=10000,
)
bc_module = create_bc_module(mock_brain, bc_settings, False, False)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 100
assert bc_module.batch_size == 10000

def test_bcmodule_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

# Test with RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
mock_brain = mb.create_mock_3dball_brain()
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
mock_specs = mb.create_mock_banana_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, False, is_sac)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
mock_brain = mb.create_mock_banana_brain()
mock_specs = mb.create_mock_banana_behavior_specs()
bc_module = create_bc_module(mock_brain, bc_settings, True, is_sac)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

ml-agents/mlagents/trainers/tests/test_demo_loader.py (70)


from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.tests.mock_brain import (
create_mock_3dball_behavior_specs,
setup_test_behavior_specs,
)
from mlagents.trainers.demo_loader import (
load_demonstration,
demo_to_buffer,

BRAIN_PARAMS = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=8,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=1,
)
BEHAVIOR_SPEC = create_mock_3dball_behavior_specs()
def test_load_demo():

assert np.sum(behavior_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BRAIN_PARAMS)
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1

assert np.sum(behavior_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BRAIN_PARAMS)
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
# observation mismatch
# observation size mismatch
brain_params_obs = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=9,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=1,
mismatch_obs = setup_test_behavior_specs(
False, False, vector_action_space=2, vector_obs_space=9
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_obs)
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_obs)
brain_params_act = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=8,
camera_resolutions=[],
vector_action_space_size=[3],
vector_action_descriptions=[],
vector_action_space_type=1,
mismatch_act = setup_test_behavior_specs(
False, False, vector_action_space=3, vector_obs_space=9
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_act)
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_act)
brain_params_type = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=8,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
mismatch_act_type = setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=9
path_prefix + "/test.demo", 1, brain_params_type
path_prefix + "/test.demo", 1, mismatch_act_type
# vis obs mismatch
# number obs mismatch
brain_params_vis = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=8,
camera_resolutions=[[30, 40]],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=1,
mismatch_obs_number = setup_test_behavior_specs(
False, True, vector_action_space=2, vector_obs_space=9
)
_, demo_buffer = demo_to_buffer(
path_prefix + "/test.demo", 1, mismatch_obs_number
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, brain_params_vis)
def test_edge_cases():

ml-agents/mlagents/trainers/tests/test_ghost.py (100)


from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

return TrainerSettings(self_play=SelfPlaySettings())
VECTOR_ACTION_SPACE = [1]
VECTOR_ACTION_SPACE = 1
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 513

@pytest.mark.parametrize("use_discrete", [True, False])
def test_load_and_set(dummy_config, use_discrete):
mock_brain = mb.setup_mock_brain(
mock_specs = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
policy = trainer.create_policy("test", mock_specs)
to_load_policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
to_load_policy = trainer.create_policy("test", mock_specs)
to_load_policy.create_tf_graph()
to_load_policy.init_load_weights()

def test_process_trajectory(dummy_config):
brain_params_team0 = BrainParameters(
brain_name="test_brain?team=0",
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[2], vector_obs_space=1
brain_name = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
).brain_name
behavior_id_team0 = "test_brain?team=0"
behavior_id_team1 = "test_brain?team=1"
brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
brain_params_team1 = BrainParameters(
brain_name="test_brain?team=1",
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
)
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

# first policy encountered becomes policy trained by wrapped PPO
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
)
policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
trajectory_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
trainer.subscribe_trajectory_queue(trajectory_queue1)
time_horizon = 15

vec_obs_size=1,
num_vis_obs=0,
observation_shapes=[(1,)],
action_space=[2],
)
trajectory_queue0.put(trajectory)

def test_publish_queue(dummy_config):
brain_params_team0 = BrainParameters(
brain_name="test_brain?team=0",
vector_observation_space_size=8,
camera_resolutions=[],
vector_action_space_size=[1],
vector_action_descriptions=[],
vector_action_space_type=0,
mock_specs = mb.setup_test_behavior_specs(
True, False, vector_action_space=[1], vector_obs_space=8
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team0.brain_name
)
behavior_id_team0 = "test_brain?team=0"
behavior_id_team1 = "test_brain?team=1"
parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
brain_params_team1 = BrainParameters(
brain_name="test_brain?team=1",
vector_observation_space_size=8,
camera_resolutions=[],
vector_action_space_size=[1],
vector_action_descriptions=[],
vector_action_space_type=0,
)
ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
controller = GhostController(100)
trainer = GhostTrainer(

# First policy encountered becomes policy trained by wrapped PPO
# This queue should remain empty after swap snapshot
policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
policy_queue0 = AgentManagerQueue(behavior_id_team0)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
brain_params_team1.brain_name
)
policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
policy_queue1 = AgentManagerQueue(behavior_id_team1)
trainer.publish_policy_queue(policy_queue1)
# check ghost trainer swap pushes to ghost queue and not trainer

# clear
policy_queue1.get_nowait()
mock_brain = mb.setup_mock_brain(
mock_specs = mb.setup_test_behavior_specs(
discrete_action_space=DISCRETE_ACTION_SPACE,
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
# Mock out reward signal eval
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
buffer["extrinsic_returns"] = buffer["environment_rewards"]

ml-agents/mlagents/trainers/tests/test_nn_policy.py (49)


from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, ModelUtils
from mlagents.trainers.models import EncoderType, ModelUtils, Tensor3DShape
from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

VECTOR_ACTION_SPACE = [2]
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32

load: bool = False,
seed: int = 0,
) -> NNPolicy:
mock_brain = mb.setup_mock_brain(
mock_spec = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_settings = dummy_config

)
policy = NNPolicy(seed, mock_brain, trainer_settings, False, model_path, load)
policy = NNPolicy(seed, mock_spec, trainer_settings, False, model_path, load)
return policy

"""
Make sure two policies have the same output for the same input.
"""
decision_step, _ = mb.create_steps_from_brainparams(policy1.brain, num_agents=1)
decision_step, _ = mb.create_steps_from_behavior_spec(
policy1.behavior_spec, num_agents=1
)
run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))

policy = create_policy_mock(
TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
decision_step, terminal_step = mb.create_steps_from_brainparams(
policy.brain, num_agents=NUM_AGENTS
decision_step, terminal_step = mb.create_steps_from_behavior_spec(
policy.behavior_spec, num_agents=NUM_AGENTS
)
run_out = policy.evaluate(decision_step, list(decision_step.agent_id))

assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
behavior_spec = mb.setup_test_behavior_specs(
use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
)
time_horizon = 6

vec_obs_size=1,
num_vis_obs=0,
observation_shapes=[(1,)],
action_space=[2],
)
# Change half of the obs to 0

0,
brain_params,
behavior_spec,
TrainerSettings(network_settings=NetworkSettings(normalize=True)),
False,
"testdir",

trajectory = make_fake_trajectory(
length=time_horizon,
max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
observation_shapes=[(1,)],
action_space=[2],
)
trajectory_buffer = trajectory.to_agentbuffer()

for encoder_type in EncoderType:
with tf.Graph().as_default():
good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
good_res = CameraResolution(
width=good_size, height=good_size, num_channels=3
)
good_res = Tensor3DShape(width=good_size, height=good_size, num_channels=3)
vis_input = ModelUtils.create_visual_input(good_res, "test_min_visual_size")
ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
enc_func = ModelUtils.get_encoder_for_type(encoder_type)

with pytest.raises(Exception):
with tf.Graph().as_default():
bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
bad_res = CameraResolution(
width=bad_size, height=bad_size, num_channels=3
)
bad_res = Tensor3DShape(width=bad_size, height=bad_size, num_channels=3)
vis_input = ModelUtils.create_visual_input(
bad_res, "test_min_visual_size"
)

ml-agents/mlagents/trainers/tests/test_ppo.py (138)


from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.settings import NetworkSettings, TrainerSettings, PPOSettings
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,

return copy.deepcopy(PPO_CONFIG)
VECTOR_ACTION_SPACE = [2]
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64

def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
mock_brain = mb.setup_mock_brain(
mock_specs = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_settings = attr.evolve(dummy_config)

else None
)
policy = NNPolicy(
0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
def _create_fake_trajectory(use_discrete, use_visual, time_horizon):
if use_discrete:
act_space = DISCRETE_ACTION_SPACE
else:
act_space = VECTOR_ACTION_SPACE
if use_visual:
num_vis_obs = 1
vec_obs_size = 0
else:
num_vis_obs = 0
vec_obs_size = VECTOR_OBS_SPACE
trajectory = make_fake_trajectory(
length=time_horizon,
max_step_complete=True,
vec_obs_size=vec_obs_size,
num_vis_obs=num_vis_obs,
action_space=act_space,
)
return trajectory
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

)
# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
time_horizon = 15
trajectory = _create_fake_trajectory(discrete, visual, time_horizon)
trajectory = make_fake_trajectory(
length=time_horizon,
observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
max_step_complete=True,
action_space=DISCRETE_ACTION_SPACE if discrete else VECTOR_ACTION_SPACE,
is_discrete=discrete,
)
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False
)

mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
)
trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
)
trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
policy_mock = mock.Mock(spec=NNPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (

def test_trainer_update_policy(
dummy_config, curiosity_dummy_config, use_discrete # noqa: F811
):
mock_brain = mb.setup_mock_brain(
mock_brain = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_params = dummy_config

# Test curiosity reward signal
trainer_params.reward_signals = curiosity_dummy_config
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy("test", mock_brain)
trainer.add_policy("test", policy)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
# Mock out reward signal eval

def test_process_trajectory(dummy_config):
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
behavior_spec = mb.setup_test_behavior_specs(
True,
False,
vector_action_space=DISCRETE_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy("test_brain", behavior_spec)
trainer.add_policy("test_brain", policy)
observation_shapes=behavior_spec.observation_shapes,
vec_obs_size=1,
num_vis_obs=0,
action_space=[2],
)
trajectory_queue.put(trajectory)

trajectory = make_fake_trajectory(
length=time_horizon + 1,
max_step_complete=False,
vec_obs_size=1,
num_vis_obs=0,
observation_shapes=behavior_spec.observation_shapes,
action_space=[2],
)
trajectory_queue.put(trajectory)

@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_add_get_policy(ppo_optimizer, dummy_config):
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
trainer.add_policy(brain_params.brain_name, policy)
assert trainer.get_policy(brain_params.brain_name) == policy
trainer.add_policy("test_policy", policy)
assert trainer.get_policy("test_policy") == policy
# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000

with pytest.raises(RuntimeError):
trainer.add_policy(brain_params, policy)
# TODO: Move this to test_settings.py
def test_bad_config():
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
# Test that we throw an error if we have sequence length greater than batch size
with pytest.raises(TrainerConfigError):
TrainerSettings(
network_settings=NetworkSettings(
memory=NetworkSettings.MemorySettings(sequence_length=64)
),
hyperparameters=PPOSettings(batch_size=32),
)
_ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
trainer.add_policy("test_policy", policy)
if __name__ == "__main__":

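The pattern that replaces the old BrainParameters fixtures in the PPO tests above (and in the SAC and reward-signal tests further down): build a BehaviorSpec with the mock_brain helpers, hand it to the policy, and simulate rollouts from the spec attached to the policy. A minimal sketch, assuming the helper signatures shown in the hunks above are unchanged and that TrainerSettings() can be constructed with its defaults:

from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.tests import mock_brain as mb

# Continuous-control spec standing in for the old BrainParameters mock:
# an 8-dimensional vector observation and a 2-dimensional continuous action.
mock_specs = mb.setup_test_behavior_specs(
    use_discrete=False,
    use_visual=False,
    vector_action_space=2,  # VECTOR_ACTION_SPACE in the tests above
    vector_obs_space=8,     # VECTOR_OBS_SPACE in the tests above
)
policy = NNPolicy(
    0, mock_specs, TrainerSettings(), False, "test", False, create_tf_graph=False
)
# Rollouts are simulated from the BehaviorSpec carried by the policy.
update_buffer = mb.simulate_rollout(20, policy.behavior_spec)  # BUFFER_INIT_SAMPLES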
17
ml-agents/mlagents/trainers/tests/test_reward_signals.py


return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
VECTOR_ACTION_SPACE = [2]
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20

def create_optimizer_mock(
trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
mock_brain = mb.setup_mock_brain(
mock_specs = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
)
trainer_settings = trainer_config
trainer_settings.reward_signals = reward_signal_config

else None
)
policy = NNPolicy(
0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)

def reward_signal_eval(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.brain)
buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
# Test evaluate
rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)

def reward_signal_update(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
optimizer.policy, buffer.make_mini_batch(0, 10), 2
)

19
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


from mlagents.trainers.settings import TrainerSettings
def create_mock_brain():
mock_brain = mb.create_mock_brainparams(
vector_action_space_type="continuous",
vector_action_space_size=[2],
vector_observation_space_size=8,
number_visual_observations=1,
)
return mock_brain
# Add concrete implementations of abstract methods
class FakeTrainer(RLTrainer):
def set_is_policy_updating(self, is_updating):

def create_rl_trainer():
mock_brainparams = create_mock_brain()
mock_brainparams,
"test_trainer",
TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
True,
0,

time_horizon = 10
trajectory = mb.make_fake_trajectory(
length=time_horizon,
observation_shapes=[(1,)],
vec_obs_size=1,
num_vis_obs=0,
action_space=[2],
)
trajectory_queue.put(trajectory)

checkpoint_interval = trainer.trainer_settings.checkpoint_interval
trajectory = mb.make_fake_trajectory(
length=time_horizon,
observation_shapes=[(1,)],
vec_obs_size=1,
num_vis_obs=0,
action_space=[2],
)
# Check that we can turn off the trainer and that the buffer is cleared

86
ml-agents/mlagents/trainers/tests/test_sac.py


from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.tests.test_simple_rl import SAC_CONFIG
from mlagents.trainers.settings import NetworkSettings

return copy.deepcopy(SAC_CONFIG)
VECTOR_ACTION_SPACE = [2]
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64

def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
mock_brain = mb.setup_mock_brain(
mock_brain = mb.setup_test_behavior_specs(
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
)
trainer_settings = dummy_config
trainer_settings.network_settings.memory = (

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
optimizer.update(

)
# Test update, while removing PPO-specific buffer elements.
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]

def test_sac_save_load_buffer(tmpdir, dummy_config):
mock_brain = mb.setup_mock_brain(
mock_specs = mb.setup_test_behavior_specs(
discrete_action_space=DISCRETE_ACTION_SPACE,
trainer = SACTrainer(
mock_brain.brain_name, 1, trainer_params, True, False, 0, "testdir"
trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
policy = trainer.create_policy("test", mock_specs)
trainer.add_policy("test", policy)
trainer.update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, policy.behavior_spec
policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
trainer.save_model(mock_brain.brain_name)
trainer.save_model(trainer.brain_name)
trainer2 = SACTrainer(
mock_brain.brain_name, 1, trainer_params, True, True, 0, "testdir"
)
trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")
policy = trainer2.create_policy(mock_brain.brain_name, mock_brain)
trainer2.add_policy(mock_brain.brain_name, policy)
policy = trainer2.create_policy("test", mock_specs)
trainer2.add_policy("test", policy)
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
trainer.add_policy(brain_params.brain_name, policy)
assert trainer.get_policy(brain_params.brain_name) == policy
trainer.add_policy("test", policy)
assert trainer.get_policy("test") == policy
# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000

with pytest.raises(RuntimeError):
trainer.add_policy(brain_params, policy)
trainer.add_policy("test", policy)
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
specs = setup_test_behavior_specs(
use_discrete=False, use_visual=False, vector_action_space=2
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params.brain_name, brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy("test", specs)
trainer.add_policy("test", policy)
trajectory_queue = AgentManagerQueue("testbrain")
policy_queue = AgentManagerQueue("testbrain")

trajectory = make_fake_trajectory(
length=15,
observation_shapes=specs.observation_shapes,
vec_obs_size=6,
num_vis_obs=0,
action_space=[2],
action_space=2,
is_discrete=False,
)
trajectory_queue.put(trajectory)

# Add a terminal trajectory
trajectory = make_fake_trajectory(
length=6,
observation_shapes=specs.observation_shapes,
vec_obs_size=6,
num_vis_obs=0,
action_space=[2],
action_space=2,
is_discrete=False,
)
trajectory_queue.put(trajectory)

# two updates, there should NOT be a policy on the queue.
trajectory = make_fake_trajectory(
length=5,
observation_shapes=specs.observation_shapes,
vec_obs_size=6,
num_vis_obs=0,
action_space=[2],
action_space=2,
is_discrete=False,
)
trajectory_queue.put(trajectory)

# Call add_policy and check that we update the correct number of times.
# This is to emulate a load from checkpoint.
policy = trainer.create_policy(brain_params.brain_name, brain_params)
policy = trainer.create_policy("test", specs)
trainer.add_policy(brain_params.brain_name, policy)
trainer.add_policy("test", policy)
trainer.optimizer.update = mock.Mock()
trainer.optimizer.update_reward_signals = mock.Mock()
trainer.optimizer.update_reward_signals.return_value = {}

6
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


@mock.patch("mlagents.trainers.subprocess_env_manager.SubprocessEnvManager._step")
@mock.patch(
"mlagents.trainers.subprocess_env_manager.SubprocessEnvManager.external_brains",
"mlagents.trainers.subprocess_env_manager.SubprocessEnvManager.training_behaviors",
def test_advance(self, mock_create_worker, external_brains_mock, step_mock):
def test_advance(self, mock_create_worker, training_behaviors_mock, step_mock):
brain_name = "testbrain"
action_info_dict = {brain_name: MagicMock()}
mock_create_worker.side_effect = create_worker_mock

external_brains_mock.return_value = [brain_name]
training_behaviors_mock.return_value = [brain_name]
agent_manager_mock = mock.Mock()
mock_policy = mock.Mock()
agent_manager_mock.policy_queue.get_nowait.side_effect = [

4
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


env_mock = MagicMock()
env_mock.close = MagicMock()
env_mock.reset = MagicMock()
env_mock.external_brains = MagicMock()
env_mock.training_behaviors = MagicMock()
tc.start_learning(env_mock)
tf_reset_graph.assert_called_once()

env_mock = MagicMock()
env_mock.close = MagicMock()
env_mock.reset = MagicMock(return_value=brain_info_mock)
env_mock.external_brains = MagicMock()
env_mock.training_behaviors = MagicMock()
tc.start_learning(env_mock)
tf_reset_graph.assert_called_once()

29
ml-agents/mlagents/trainers/tests/test_trainer_util.py


from mlagents.trainers.cli_utils import load_config, _load_config
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG

return RunOptions(behaviors={"testbrain": PPO_CONFIG})
@patch("mlagents.trainers.brain.BrainParameters")
def test_initialize_ppo_trainer(BrainParametersMock, dummy_config):
brain_params_mock = BrainParametersMock()
BrainParametersMock.return_value.brain_name = "testbrain"
external_brains = {"testbrain": BrainParametersMock()}
@patch("mlagents_envs.base_env.BehaviorSpec")
def test_initialize_ppo_trainer(BehaviorSpecMock, dummy_config):
brain_name = "testbrain"
training_behaviors = {"testbrain": BehaviorSpecMock()}
output_path = "results_dir"
train_model = True
load_model = False

seed,
artifact_path,
):
assert brain == brain_params_mock.brain_name
assert brain == brain_name
assert trainer_settings == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model

seed=seed,
)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
for brain_name in training_behaviors.keys():
trainers[brain_name] = trainer_factory.generate(brain_name)
@patch("mlagents.trainers.brain.BrainParameters")
def test_handles_no_config_provided(BrainParametersMock):
def test_handles_no_config_provided():
brain_parameters = BrainParameters(
brain_name=brain_name,
vector_observation_space_size=1,
camera_resolutions=[],
vector_action_space_size=[2],
vector_action_descriptions=[],
vector_action_space_type=0,
)
trainer_factory = trainer_util.TrainerFactory(
trainer_config=no_default_config,

seed=42,
)
trainer_factory.generate(brain_parameters.brain_name)
trainer_factory.generate(brain_name)
def test_load_config_missing_file():

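The factory tests above now identify a trainer purely by behavior name; a BehaviorSpec (or a mock of one) only matters later, when a policy is created for that behavior. A compact restatement of the loop in the hunk above, with trainer_factory assumed to be a trainer_util.TrainerFactory configured as in the test:

from unittest import mock

# Behaviors are keyed by name; the spec is mocked because generating a
# trainer does not need it.
training_behaviors = {"testbrain": mock.Mock()}  # stands in for a BehaviorSpec
trainers = {}
for brain_name in training_behaviors.keys():
    trainers[brain_name] = trainer_factory.generate(brain_name)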
4
ml-agents/mlagents/trainers/tests/test_trajectory.py


]
wanted_keys = set(wanted_keys)
trajectory = make_fake_trajectory(
length=length, vec_obs_size=VEC_OBS_SIZE, action_space=[ACTION_SIZE]
length=length,
observation_shapes=[(VEC_OBS_SIZE,), (84, 84, 3)],
action_space=[ACTION_SIZE],
)
agentbuffer = trajectory.to_agentbuffer()
seen_keys = set()

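make_fake_trajectory no longer derives the observations from vec_obs_size / num_vis_obs counts; callers pass the observation_shapes list, normally taken from a BehaviorSpec, plus whether the action space is discrete. A sketch using only the keyword arguments that appear in the hunks above:

from mlagents.trainers.tests import mock_brain as mb

# One (8,) vector observation and one 84x84x3 visual observation, with a
# 2-dimensional continuous action.
trajectory = mb.make_fake_trajectory(
    length=15,
    observation_shapes=[(8,), (84, 84, 3)],
    max_step_complete=True,
    action_space=2,
    is_discrete=False,
)
buffer = trajectory.to_agentbuffer()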
6
ml-agents/mlagents/trainers/trainer/trainer.py


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings

):
"""
Responsible for collecting experiences and training a neural network model.
:BrainParameters brain: Brain to be trained.
:param brain_name: Brain name of brain to be trained.
:param trainer_settings: The parameters for the trainer (dictionary).
:param training: Whether the trainer is set for training.
:param artifact_path: The directory within which to store artifacts from this trainer

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
"""
Creates policy

4
ml-agents/mlagents/trainers/trainer_controller.py


self.trainer_threads.append(trainerthread)
policy = trainer.create_policy(
parsed_behavior_id, env_manager.external_brains[name_behavior_id]
parsed_behavior_id, env_manager.training_behaviors[name_behavior_id]
)
trainer.add_policy(parsed_behavior_id, policy)

# Initial reset
self._reset_env(env_manager)
while self._not_done_training():
external_brain_behavior_ids = set(env_manager.external_brains.keys())
external_brain_behavior_ids = set(env_manager.training_behaviors.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
self._create_trainers_and_managers(env_manager, new_behavior_ids)
last_brain_behavior_ids = external_brain_behavior_ids

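With external_brains gone, the controller drives everything off env_manager.training_behaviors, a mapping from behavior name to BehaviorSpec. A hedged restatement of the flow in the hunk above (trainer and parsed_behavior_id are assumed to be set up as elsewhere in this file):

# For every new behavior the environment reports, build a policy from its
# BehaviorSpec and register it with the behavior's trainer.
external_brain_behavior_ids = set(env_manager.training_behaviors.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
for name_behavior_id in new_behavior_ids:
    behavior_spec = env_manager.training_behaviors[name_behavior_id]
    policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, policy)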
36
ml-agents/mlagents/trainers/tests/test_models.py


import pytest
from mlagents.trainers.models import ModelUtils
from mlagents.tf_utils import tf
from mlagents_envs.base_env import BehaviorSpec, ActionType
def create_behavior_spec(num_visual, num_vector, vector_size):
behavior_spec = BehaviorSpec(
[(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector),
ActionType.DISCRETE,
(1,),
)
return behavior_spec
@pytest.mark.parametrize("num_visual", [1, 2, 4])
@pytest.mark.parametrize("num_vector", [1, 2, 4])
def test_create_input_placeholders(num_vector, num_visual):
vec_size = 8
name_prefix = "test123"
bspec = create_behavior_spec(num_visual, num_vector, vec_size)
vec_in, vis_in = ModelUtils.create_input_placeholders(
bspec.observation_shapes, name_prefix=name_prefix
)
assert isinstance(vis_in, list)
assert len(vis_in) == num_visual
assert isinstance(vec_in, tf.Tensor)
assert vec_in.get_shape().as_list()[1] == num_vector * 8
# Check names contain prefix and vis shapes are correct
for _vis in vis_in:
assert _vis.get_shape().as_list() == [None, 84, 84, 3]
assert _vis.name.startswith(name_prefix)
assert vec_in.name.startswith(name_prefix)

88
ml-agents/mlagents/trainers/brain.py


from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from typing import List, NamedTuple
class CameraResolution(NamedTuple):
height: int
width: int
num_channels: int
@property
def gray_scale(self) -> bool:
return self.num_channels == 1
def __str__(self):
return f"CameraResolution({self.height}, {self.width}, {self.num_channels})"
class BrainParameters:
def __init__(
self,
brain_name: str,
vector_observation_space_size: int,
camera_resolutions: List[CameraResolution],
vector_action_space_size: List[int],
vector_action_descriptions: List[str],
vector_action_space_type: int,
):
"""
Contains all brain-specific parameters.
"""
self.brain_name = brain_name
self.vector_observation_space_size = vector_observation_space_size
self.number_visual_observations = len(camera_resolutions)
self.camera_resolutions = camera_resolutions
self.vector_action_space_size = vector_action_space_size
self.vector_action_descriptions = vector_action_descriptions
self.vector_action_space_type = ["discrete", "continuous"][
vector_action_space_type
]
def __str__(self):
return """Unity brain name: {}
Number of Visual Observations (per agent): {}
Camera Resolutions: {}
Vector Observation space size (per agent): {}
Vector Action space type: {}
Vector Action space size (per agent): {}
Vector Action descriptions: {}""".format(
self.brain_name,
str(self.number_visual_observations),
str([str(cr) for cr in self.camera_resolutions]),
str(self.vector_observation_space_size),
self.vector_action_space_type,
str(self.vector_action_space_size),
", ".join(self.vector_action_descriptions),
)
@staticmethod
def from_proto(
brain_param_proto: BrainParametersProto, agent_info: AgentInfoProto
) -> "BrainParameters":
"""
Converts brain parameter proto to BrainParameter object.
:param brain_param_proto: protobuf object.
:return: BrainParameter object.
"""
resolutions = [
CameraResolution(obs.shape[0], obs.shape[1], obs.shape[2])
for obs in agent_info.observations
if len(obs.shape) >= 3
]
total_vector_obs = sum(
obs.shape[0] for obs in agent_info.observations if len(obs.shape) == 1
)
brain_params = BrainParameters(
brain_name=brain_param_proto.brain_name,
vector_observation_space_size=total_vector_obs,
camera_resolutions=resolutions,
vector_action_space_size=list(brain_param_proto.vector_action_size),
vector_action_descriptions=list(
brain_param_proto.vector_action_descriptions
),
vector_action_space_type=brain_param_proto.vector_action_space_type,
)
return brain_params

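brain.py is deleted outright; everything it encoded is read straight off a BehaviorSpec instead. A hedged sketch of the equivalent lookups, using only the BehaviorSpec members that the (also removed) conversion helper below relied on:

from mlagents_envs.base_env import BehaviorSpec

def describe_spec(spec: BehaviorSpec) -> None:
    # Vector observation size: the sum of all rank-1 observation shapes.
    vec_size = sum(shape[0] for shape in spec.observation_shapes if len(shape) == 1)
    # Visual observations: the rank-3 shapes, i.e. (height, width, channels).
    vis_shapes = [shape for shape in spec.observation_shapes if len(shape) == 3]
    # Action space: branch sizes when discrete, a single size when continuous.
    if spec.is_action_discrete():
        action_sizes = list(spec.discrete_action_branches)
    else:
        action_sizes = [spec.action_size]
    print(vec_size, vis_shapes, action_sizes)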
31
ml-agents/mlagents/trainers/brain_conversion_utils.py


from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents_envs.base_env import BehaviorSpec
import numpy as np
from typing import List
def behavior_spec_to_brain_parameters(
name: str, behavior_spec: BehaviorSpec
) -> BrainParameters:
vec_size = np.sum(
[shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1]
)
vis_sizes = [shape for shape in behavior_spec.observation_shapes if len(shape) == 3]
cam_res = [CameraResolution(s[0], s[1], s[2]) for s in vis_sizes]
a_size: List[int] = []
if behavior_spec.is_action_discrete():
a_size += list(behavior_spec.discrete_action_branches)
vector_action_space_type = 0
else:
a_size += [behavior_spec.action_size]
vector_action_space_type = 1
return BrainParameters(
name, int(vec_size), cam_res, a_size, [], vector_action_space_type
)
def get_global_agent_id(worker_id: int, agent_id: int) -> str:
"""
Create an agent id that is unique across environment workers using the worker_id.
"""
return f"${worker_id}-{agent_id}"
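get_global_agent_id simply namespaces an agent id with the worker id so that ids stay unique across environment workers. A quick usage example of the format it produces:

# With worker_id=2 and agent_id=7 the resulting id is "$2-7".
global_id = get_global_agent_id(worker_id=2, agent_id=7)
assert global_id == "$2-7"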