
Replace BrainInfos with BatchedStepResult (#3207)

/asymm-envs
GitHub 4 years ago
Current commit
f058b18c
36 files changed, with 636 additions and 922 deletions
  1. gym-unity/gym_unity/envs/__init__.py (2 lines changed)
  2. gym-unity/gym_unity/tests/test_gym.py (30 lines changed)
  3. ml-agents-envs/mlagents_envs/base_env.py (16 lines changed)
  4. ml-agents/mlagents/trainers/action_info.py (3 lines changed)
  5. ml-agents/mlagents/trainers/agent_processor.py (113 lines changed)
  6. ml-agents/mlagents/trainers/brain.py (157 lines changed)
  7. ml-agents/mlagents/trainers/brain_conversion_utils.py (50 lines changed)
  8. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (22 lines changed)
  9. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (27 lines changed)
  10. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (14 lines changed)
  11. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (18 lines changed)
  12. ml-agents/mlagents/trainers/demo_loader.py (56 lines changed)
  13. ml-agents/mlagents/trainers/env_manager.py (34 lines changed)
  14. ml-agents/mlagents/trainers/models.py (2 lines changed)
  15. ml-agents/mlagents/trainers/policy.py (6 lines changed)
  16. ml-agents/mlagents/trainers/ppo/models.py (2 lines changed)
  17. ml-agents/mlagents/trainers/ppo/policy.py (20 lines changed)
  18. ml-agents/mlagents/trainers/sac/models.py (2 lines changed)
  19. ml-agents/mlagents/trainers/sac/policy.py (19 lines changed)
  20. ml-agents/mlagents/trainers/simple_env_manager.py (46 lines changed)
  21. ml-agents/mlagents/trainers/subprocess_env_manager.py (51 lines changed)
  22. ml-agents/mlagents/trainers/tests/mock_brain.py (261 lines changed)
  23. ml-agents/mlagents/trainers/tests/test_agent_processor.py (10 lines changed)
  24. ml-agents/mlagents/trainers/tests/test_bcmodule.py (67 lines changed)
  25. ml-agents/mlagents/trainers/tests/test_demo_loader.py (11 lines changed)
  26. ml-agents/mlagents/trainers/tests/test_policy.py (37 lines changed)
  27. ml-agents/mlagents/trainers/tests/test_ppo.py (43 lines changed)
  28. ml-agents/mlagents/trainers/tests/test_reward_signals.py (120 lines changed)
  29. ml-agents/mlagents/trainers/tests/test_sac.py (120 lines changed)
  30. ml-agents/mlagents/trainers/tests/test_simple_rl.py (16 lines changed)
  31. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (4 lines changed)
  32. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (14 lines changed)
  33. ml-agents/mlagents/trainers/tests/test_trajectory.py (66 lines changed)
  34. ml-agents/mlagents/trainers/tf_policy.py (57 lines changed)
  35. ml-agents/mlagents/trainers/trainer_controller.py (3 lines changed)
  36. ml-agents/mlagents/trainers/trajectory.py (39 lines changed)

gym-unity/gym_unity/envs/__init__.py (2 lines changed)


observation (object/list): agent's observation of the current environment
reward (float/list) : amount of reward returned after previous action
done (boolean/list): whether the episode has ended.
info (dict): contains auxiliary diagnostic information, including BrainInfo.
info (dict): contains auxiliary diagnostic information, including BatchedStepResult.
"""
# Use random actions for all other agents in environment.
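For context (not part of this commit), here is a minimal sketch of driving the gym wrapper whose step() contract is documented above. The helper name run_random_episode and the environment path are illustrative placeholders:

from gym_unity.envs import UnityEnv

def run_random_episode(env: UnityEnv) -> float:
    # Roll out one episode with random actions; the info dict returned by
    # step() carries the underlying BatchedStepResult, as documented above.
    total_reward = 0.0
    env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        _obs, reward, done, _info = env.step(action)
        total_reward += reward
    return total_reward

# Hypothetical usage; "path/to/UnityBuild" stands in for a built Unity environment:
# env = UnityEnv("path/to/UnityBuild", use_visual=False, multiagent=False)
# print(run_random_episode(env))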

gym-unity/gym_unity/tests/test_gym.py (30 lines changed)


@mock.patch("gym_unity.envs.UnityEnvironment")
def test_gym_wrapper(mock_env):
mock_brain = create_mock_group_spec()
mock_braininfo = create_mock_vector_step_result()
setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
mock_spec = create_mock_group_spec()
mock_step = create_mock_vector_step_result()
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=False)
assert isinstance(env, UnityEnv)

@mock.patch("gym_unity.envs.UnityEnvironment")
def test_multi_agent(mock_env):
mock_brain = create_mock_group_spec()
mock_braininfo = create_mock_vector_step_result(num_agents=2)
setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
mock_spec = create_mock_group_spec()
mock_step = create_mock_vector_step_result(num_agents=2)
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
with pytest.raises(UnityGymException):
UnityEnv(" ", multiagent=False)

@mock.patch("gym_unity.envs.UnityEnvironment")
def test_branched_flatten(mock_env):
mock_brain = create_mock_group_spec(
mock_spec = create_mock_group_spec(
mock_braininfo = create_mock_vector_step_result(num_agents=1)
setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
mock_step = create_mock_vector_step_result(num_agents=1)
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=False, flatten_branched=True)
assert isinstance(env.action_space, spaces.Discrete)

@pytest.mark.parametrize("use_uint8", [True, False], ids=["float", "uint8"])
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_gym_wrapper_visual(mock_env, use_uint8):
mock_brain = create_mock_group_spec(number_visual_observations=1)
mock_braininfo = create_mock_vector_step_result(number_visual_observations=1)
setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
mock_spec = create_mock_group_spec(number_visual_observations=1)
mock_step = create_mock_vector_step_result(number_visual_observations=1)
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=True, multiagent=False, uint8_visual=use_uint8)
assert isinstance(env, UnityEnv)

def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
"""
Creates a mock BrainInfo with vector observations. Imitates constant
Creates a mock BatchedStepResult with vector observations. Imitates constant
:int num_agents: Number of "agents" to imitate in your BrainInfo values.
:int num_agents: Number of "agents" to imitate in your BatchedStepResult values.
"""
obs = [np.array([num_agents * [1, 2, 3]])]
if number_visual_observations:

def setup_mock_unityenvironment(mock_env, mock_spec, mock_result):
"""
Takes a mock UnityEnvironment and adds the appropriate properties, defined by the mock
BrainParameters and BrainInfo.
GroupSpec and BatchedStepResult.
:Mock mock_env: A mock UnityEnvironment, usually empty.
:Mock mock_spec: An AgentGroupSpec object that specifies the params of this environment.

ml-agents-envs/mlagents_envs/base_env.py (16 lines changed)


self.max_step: np.ndarray = max_step
self.agent_id: np.ndarray = agent_id
self.action_mask: Optional[List[np.ndarray]] = action_mask
self._agent_id_to_index: Optional[Dict[int, int]] = None
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
def contains_agent(self, agent_id: AgentId) -> bool:
@property
def agent_id_to_index(self) -> Dict[AgentId, int]:
"""
Returns the index of the agent_id in this BatchedStepResult, and
-1 if agent_id is not in this BatchedStepResult.
:param agent_id: The id of the agent
:returns: The index of the agent_id, and -1 if not found.
"""
return agent_id in self._agent_id_to_index
return self._agent_id_to_index
def contains_agent(self, agent_id: AgentId) -> bool:
return agent_id in self.agent_id_to_index
def get_agent_step_result(self, agent_id: AgentId) -> StepResult:
"""

ml-agents/mlagents/trainers/action_info.py (3 lines changed)


from typing import NamedTuple, Any, Dict, List
import numpy as np
from mlagents_envs.base_env import AgentId
ActionInfoOutputs = Dict[str, np.ndarray]

value: Any
outputs: ActionInfoOutputs
agents: List[str]
agent_ids: List[AgentId]

ml-agents/mlagents/trainers/agent_processor.py (113 lines changed)


import sys
import numpy as np
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.env_manager import get_global_agent_id
T = TypeVar("T")

:param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
"""
self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_brain_info: Dict[str, BrainInfo] = {}
self.last_step_result: Dict[str, BatchedStepResult] = {}
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}

self.behavior_id = behavior_id
def add_experiences(
self, curr_info: BrainInfo, previous_action: ActionInfo
self,
batched_step_result: BatchedStepResult,
worker_id: int,
previous_action: ActionInfo,
:param curr_info: current BrainInfo.
:param previous_action: The return value of the Policy's get_action method.
:param batched_step_result: current BatchedStepResult.
:param previous_action: The outputs of the Policy's get_action method.
"""
take_action_outputs = previous_action.outputs
if take_action_outputs:

"Policy/Learning Rate", take_action_outputs["learning_rate"]
)
for agent_id in previous_action.agents:
self.last_take_action_outputs[agent_id] = take_action_outputs
# Make unique agent_ids that are global across workers
action_global_agent_ids = [
get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
]
for global_id in action_global_agent_ids:
self.last_take_action_outputs[global_id] = take_action_outputs
# Store the environment reward
tmp_environment_reward = curr_info.rewards
for agent_idx, agent_id in enumerate(curr_info.agents):
stored_info = self.last_brain_info.get(agent_id, None)
for _id in np.nditer(batched_step_result.agent_id): # Explicit numpy iteration
local_id = int(
_id
) # Needed for mypy to pass since ndarray has no content type
curr_agent_step = batched_step_result.get_agent_step_result(local_id)
global_id = get_global_agent_id(worker_id, local_id)
stored_step = self.last_step_result.get(global_id, None)
agent_id, None
global_id, None
if stored_info is not None and stored_take_action_outputs is not None:
prev_idx = stored_info.agents.index(agent_id)
obs = []
if not stored_info.local_done[prev_idx]:
for i, _ in enumerate(stored_info.visual_observations):
obs.append(stored_info.visual_observations[i][prev_idx])
if self.policy.use_vec_obs:
obs.append(stored_info.vector_observations[prev_idx])
if stored_step is not None and stored_take_action_outputs is not None:
# We know the step is from the same worker, so use the local agent id.
stored_agent_step = stored_step.get_agent_step_result(local_id)
idx = stored_step.agent_id_to_index[local_id]
obs = stored_agent_step.obs
if not stored_agent_step.done:
memory = self.policy.retrieve_memories([agent_id])[0, :]
memory = self.policy.retrieve_memories([global_id])[0, :]
done = curr_info.local_done[agent_idx]
max_step = curr_info.max_reached[agent_idx]
done = curr_agent_step.done
max_step = curr_agent_step.max_step
action = stored_take_action_outputs["action"][prev_idx]
action = stored_take_action_outputs["action"][idx]
action_pre = stored_take_action_outputs["pre_action"][prev_idx]
action_pre = stored_take_action_outputs["pre_action"][idx]
action_probs = stored_take_action_outputs["log_probs"][prev_idx]
action_masks = stored_info.action_masks[prev_idx]
prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]
action_probs = stored_take_action_outputs["log_probs"][idx]
action_mask = stored_agent_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[
0, :
]
reward=tmp_environment_reward[agent_idx],
reward=curr_agent_step.reward,
action_mask=action_masks,
action_mask=action_mask,
self.experience_buffers[agent_id].append(experience)
self.episode_rewards[agent_id] += tmp_environment_reward[agent_idx]
self.experience_buffers[global_id].append(experience)
self.episode_rewards[global_id] += curr_agent_step.reward
curr_info.local_done[agent_idx]
curr_agent_step.done
len(self.experience_buffers[agent_id])
len(self.experience_buffers[global_id])
) and len(self.experience_buffers[agent_id]) > 0:
) and len(self.experience_buffers[global_id]) > 0:
next_obs = []
for i, _ in enumerate(curr_info.visual_observations):
next_obs.append(curr_info.visual_observations[i][agent_idx])
if self.policy.use_vec_obs:
next_obs.append(curr_info.vector_observations[agent_idx])
next_obs = curr_agent_step.obs
steps=self.experience_buffers[agent_id],
agent_id=agent_id,
steps=self.experience_buffers[global_id],
agent_id=global_id,
self.experience_buffers[agent_id] = []
if curr_info.local_done[agent_idx]:
self.experience_buffers[global_id] = []
if curr_agent_step.done:
self.episode_rewards.get(agent_id, 0),
self.episode_rewards.get(global_id, 0),
self.episode_steps.get(agent_id, 0),
self.episode_steps.get(global_id, 0),
del self.episode_steps[agent_id]
del self.episode_rewards[agent_id]
elif not curr_info.local_done[agent_idx]:
self.episode_steps[agent_id] += 1
del self.episode_steps[global_id]
del self.episode_rewards[global_id]
elif not curr_agent_step.done:
self.episode_steps[global_id] += 1
self.last_brain_info[agent_id] = curr_info
self.last_step_result[global_id] = batched_step_result
previous_action.agents, take_action_outputs["action"]
previous_action.agent_ids, take_action_outputs["action"]
)
def publish_trajectory_queue(
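The id handling above can be summarized in a short sketch (illustrative only; the helper name global_rewards is made up): iterate the numpy agent_id array, convert each entry to a plain int for per-agent lookups, and key everything by the worker-scoped global agent id.

from typing import Dict
import numpy as np
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.env_manager import get_global_agent_id

def global_rewards(
    batched_step_result: BatchedStepResult, worker_id: int
) -> Dict[str, float]:
    rewards: Dict[str, float] = {}
    if batched_step_result.n_agents() == 0:
        return rewards  # np.nditer does not accept zero-sized arrays
    for _id in np.nditer(batched_step_result.agent_id):  # explicit numpy iteration
        local_id = int(_id)  # ndarray scalars need converting for lookups and dict keys
        global_id = get_global_agent_id(worker_id, local_id)
        rewards[global_id] = batched_step_result.get_agent_step_result(local_id).reward
    return rewards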

ml-agents/mlagents/trainers/brain.py (157 lines changed)


import logging
import numpy as np
from mlagents_envs.communicator_objects.observation_pb2 import ObservationProto
from mlagents_envs.timers import timed
from mlagents_envs import rpc_utils
from typing import Dict, List, NamedTuple, Collection
from typing import List, NamedTuple
logger = logging.getLogger("mlagents.trainers")

vector_action_space_type=brain_param_proto.vector_action_space_type,
)
return brain_params
class BrainInfo:
def __init__(
self,
visual_observation,
vector_observation,
reward=None,
agents=None,
local_done=None,
max_reached=None,
action_mask=None,
):
"""
Describes experience at current step of all agents linked to a brain.
"""
self.visual_observations = visual_observation
self.vector_observations = vector_observation
self.rewards = reward
self.local_done = local_done
self.max_reached = max_reached
self.agents = agents
self.action_masks = action_mask
@staticmethod
@timed
def from_agent_proto(
worker_id: int,
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
brain_params: BrainParameters,
) -> "BrainInfo":
"""
Converts list of agent infos to BrainInfo.
"""
vis_obs = BrainInfo._process_visual_observations(brain_params, agent_info_list)
total_num_actions = sum(brain_params.vector_action_space_size)
mask_actions = np.ones(
(len(agent_info_list), total_num_actions), dtype=np.float32
)
for agent_index, agent_info in enumerate(agent_info_list):
if agent_info.action_mask is not None:
if len(agent_info.action_mask) == total_num_actions:
mask_actions[agent_index, :] = [
0 if agent_info.action_mask[k] else 1
for k in range(total_num_actions)
]
if any(np.isnan(x.reward) for x in agent_info_list):
logger.warning(
"An agent had a NaN reward for brain " + brain_params.brain_name
)
vector_obs = BrainInfo._process_vector_observations(
brain_params, agent_info_list
)
agents = [f"${worker_id}-{x.id}" for x in agent_info_list]
brain_info = BrainInfo(
visual_observation=vis_obs,
vector_observation=vector_obs,
reward=[x.reward if not np.isnan(x.reward) else 0 for x in agent_info_list],
agents=agents,
local_done=[x.done for x in agent_info_list],
max_reached=[x.max_step_reached for x in agent_info_list],
action_mask=mask_actions,
)
return brain_info
@staticmethod
def _process_visual_observations(
brain_params: BrainParameters,
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> List[np.ndarray]:
visual_observation_protos: List[List[ObservationProto]] = []
# Grab the visual observations - need this together so we can iterate with the camera observations
for agent in agent_info_list:
agent_vis: List[ObservationProto] = []
for proto_obs in agent.observations:
is_visual = len(proto_obs.shape) == 3
if is_visual:
agent_vis.append(proto_obs)
visual_observation_protos.append(agent_vis)
vis_obs: List[np.ndarray] = []
for i in range(brain_params.number_visual_observations):
obs = [
rpc_utils.observation_to_np_array(
agent_obs[i], brain_params.camera_resolutions[i]
)
for agent_obs in visual_observation_protos
]
vis_obs += [obs]
return vis_obs
@staticmethod
def _process_vector_observations(
brain_params: BrainParameters,
agent_info_list: Collection[
AgentInfoProto
], # pylint: disable=unsubscriptable-object
) -> np.ndarray:
if len(agent_info_list) == 0:
vector_obs = np.zeros(
(0, brain_params.vector_observation_space_size), dtype=np.float32
)
else:
stacked_obs = []
has_nan = False
has_inf = False
for agent_info in agent_info_list:
vec_obs = [
obs for obs in agent_info.observations if len(obs.shape) == 1
]
# Concatenate vector obs
proto_vector_obs: List[float] = []
for vo in vec_obs:
# TODO consider itertools.chain here
proto_vector_obs.extend(vo.float_data.data)
np_obs = np.array(proto_vector_obs, dtype=np.float32)
# Check for NaNs or infs in the observations
# If there's a NaN in the observations, the dot() result will be NaN
# If there's an Inf (either sign) then the result will be Inf
# See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background
# Note that very large values (larger than sqrt(float_max)) will result in an Inf value here
# This is OK though, worst case it results in an unnecessary (but harmless) nan_to_num call.
d = np.dot(np_obs, np_obs)
has_nan = has_nan or np.isnan(d)
has_inf = has_inf or not np.isfinite(d)
stacked_obs.append(np_obs)
vector_obs = np.array(stacked_obs, dtype=np.float32)
# If we have any NaNs or Infs, use np.nan_to_num to replace these with finite values
if has_nan or has_inf:
vector_obs = np.nan_to_num(vector_obs)
if has_nan:
logger.warning(
f"An agent had a NaN observation for brain {brain_params.brain_name}"
)
return vector_obs
# Renaming of dictionary of brain name to BrainInfo for clarity
AllBrainInfo = Dict[str, BrainInfo]

ml-agents/mlagents/trainers/brain_conversion_utils.py (50 lines changed)


from mlagents.trainers.brain import BrainInfo, BrainParameters, CameraResolution
from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents_envs.base_env import AgentGroupSpec
def step_result_to_brain_info(
step_result: BatchedStepResult,
group_spec: AgentGroupSpec,
agent_id_prefix: int = None,
) -> BrainInfo:
n_agents = step_result.n_agents()
vis_obs_indices = []
vec_obs_indices = []
for index, observation in enumerate(step_result.obs):
if len(observation.shape) == 2:
vec_obs_indices.append(index)
elif len(observation.shape) == 4:
vis_obs_indices.append(index)
else:
raise UnityEnvironmentException(
"Invalid input received from the environment, the observation should "
"either be a vector of float or a PNG image"
)
if len(vec_obs_indices) == 0:
vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
else:
vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices], axis=1)
vis_obs = [step_result.obs[i] for i in vis_obs_indices]
mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
if group_spec.is_action_discrete():
mask = np.ones(
(n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
)
if step_result.action_mask is not None:
mask = 1 - np.concatenate(step_result.action_mask, axis=1)
if agent_id_prefix is None:
agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
else:
agent_ids = [f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id]
return BrainInfo(
vis_obs,
vec_obs,
list(step_result.reward),
agent_ids,
list(step_result.done),
list(step_result.max_step),
mask,
)
def group_spec_to_brain_parameters(
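Since group_spec_to_brain_parameters survives this change (the trainers still derive legacy BrainParameters from the new AgentGroupSpec), here is a small sketch of that direction. It is not part of the diff; the behavior name "MyBehavior" is made up, and the spec construction mirrors the dummy spec built in test_policy.py below:

from mlagents_envs.base_env import AgentGroupSpec
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters

# One 8-dimensional vector observation, continuous actions of size 2.
spec = AgentGroupSpec([(8,)], "continuous", 2)
brain_params = group_spec_to_brain_parameters("MyBehavior", spec)

print(brain_params.brain_name)                     # expected: MyBehavior
print(brain_params.vector_observation_space_size)  # expected: 8
print(brain_params.vector_action_space_size)       # expected: [2]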

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (22 lines changed)


from mlagents.tf_utils import tf
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel

self.strength = strength
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate(
self, current_info: BrainInfo, action: np.array, next_info: BrainInfo
) -> RewardSignalResult:
"""
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.
:param action: the action that was taken between the two infos
:param next_info: The BrainInfo from the next timestep.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
return RewardSignalResult(
self.strength * np.zeros(len(current_info.agents), dtype=np.float32),
np.zeros(len(current_info.agents), dtype=np.float32),
)
Evaluates the reward for the data present in the Dict mini_batch. Note the distinction between
this method and evaluate(), which takes in two BrainInfos. This reflects the different data formats (i.e. from the Buffer
vs. before being placed into the Buffer). Use this when evaluating a reward function drawn straight from a
Buffer.
Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
function drawn straight from a Buffer.
:param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
when drawing from the update buffer.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
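For reference (not in the diff), a mini_batch in the Buffer's Dict[str, np.ndarray] format might look like the following; the key names are the ones used by the reward signals in this commit, and the sizes are arbitrary:

import numpy as np

batch_size, obs_size, act_size = 4, 8, 2
mini_batch = {
    "vector_obs": np.zeros((batch_size, obs_size), dtype=np.float32),
    "next_vector_in": np.zeros((batch_size, obs_size), dtype=np.float32),
    "actions": np.zeros((batch_size, act_size), dtype=np.float32),
    "done": np.zeros((batch_size, 1), dtype=np.float32),
    "environment_rewards": np.ones(batch_size, dtype=np.float32),
}
# Visual observations would add "visual_obs0"/"next_visual_obs0" entries.
# reward_signal.evaluate_batch(mini_batch) returns a RewardSignalResult of
# (scaled_reward, unscaled_reward) arrays of length batch_size.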

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (27 lines changed)


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.tf_policy import TFPolicy

"Losses/Curiosity Inverse Loss": "curiosity_inverse_loss",
}
self.has_updated = False
def evaluate(
self, current_info: BrainInfo, action: np.array, next_info: BrainInfo
) -> RewardSignalResult:
"""
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.
:param next_info: The BrainInfo from the next timestep.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
if len(current_info.agents) == 0:
return RewardSignalResult([], [])
mini_batch: Dict[str, np.array] = {}
# Construct the batch and use evaluate_batch
mini_batch["actions"] = action
mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
for i in range(len(current_info.visual_observations)):
mini_batch["visual_obs%d" % i] = current_info.visual_observations[i]
mini_batch["next_visual_obs%d" % i] = next_info.visual_observations[i]
if self.policy.use_vec_obs:
mini_batch["vector_obs"] = current_info.vector_observations
mini_batch["next_vector_in"] = next_info.vector_observations
result = self.evaluate_batch(mini_batch)
return result
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (14 lines changed)


from typing import Any, Dict, List
import numpy as np
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

"""
param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)
def evaluate(
self, current_info: BrainInfo, action: np.array, next_info: BrainInfo
) -> RewardSignalResult:
"""
Evaluates the reward for the agents present in current_info given the next_info
:param current_info: The current BrainInfo.
:param next_info: The BrainInfo from the next timestep.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
unscaled_reward = np.array(next_info.rewards, dtype=np.float32)
scaled_reward = self.strength * unscaled_reward
return RewardSignalResult(scaled_reward, unscaled_reward)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (18 lines changed)


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel

"Policy/GAIL Policy Estimate": "gail_policy_estimate",
"Policy/GAIL Expert Estimate": "gail_expert_estimate",
}
def evaluate(
self, current_info: BrainInfo, action: np.array, next_info: BrainInfo
) -> RewardSignalResult:
if len(current_info.agents) == 0:
return RewardSignalResult([], [])
mini_batch: Dict[str, np.array] = {}
# Construct the batch
mini_batch["actions"] = action
mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
for i, obs in enumerate(current_info.visual_observations):
mini_batch["visual_obs%d" % i] = obs
if self.policy.use_vec_obs:
mini_batch["vector_obs"] = current_info.vector_observations
result = self.evaluate_batch(mini_batch)
return result
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {

ml-agents/mlagents/trainers/demo_loader.py (56 lines changed)


from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.brain import BrainParameters, BrainInfo
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
from mlagents.trainers.trajectory import SplitObservations
from mlagents_envs.rpc_utils import (
agent_group_spec_from_proto,
batched_step_result_from_proto,
)
from mlagents_envs.base_env import AgentGroupSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,

@timed
def make_demo_buffer(
pair_infos: List[AgentInfoActionPairProto],
brain_params: BrainParameters,
group_spec: AgentGroupSpec,
sequence_length: int,
) -> AgentBuffer:
# Create and populate buffer using experiences

if idx > len(pair_infos) - 2:
break
next_pair_info = pair_infos[idx + 1]
current_brain_info = BrainInfo.from_agent_proto(
0, [current_pair_info.agent_info], brain_params
current_step_info = batched_step_result_from_proto(
[current_pair_info.agent_info], group_spec
next_brain_info = BrainInfo.from_agent_proto(
0, [next_pair_info.agent_info], brain_params
next_step_info = batched_step_result_from_proto(
[next_pair_info.agent_info], group_spec
)
previous_action = (
np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0

pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
)
demo_raw_buffer["done"].append(next_brain_info.local_done[0])
demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
for i in range(brain_params.number_visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(
current_brain_info.visual_observations[i][0]
)
if brain_params.vector_observation_space_size > 0:
demo_raw_buffer["vector_obs"].append(
current_brain_info.vector_observations[0]
)
agent_id = current_step_info.agent_id[0]
current_agent_step_info = current_step_info.get_agent_step_result(agent_id)
next_agent_step_info = next_step_info.get_agent_step_result(agent_id)
demo_raw_buffer["done"].append(next_agent_step_info.done)
demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
split_obs = SplitObservations.from_observations(current_agent_step_info.obs)
for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
if next_brain_info.local_done[0]:
if next_step_info.done:
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
)

:param sequence_length: Length of trajectories to fill buffer.
:return:
"""
brain_params, info_action_pair, _ = load_demonstration(file_path)
demo_buffer = make_demo_buffer(info_action_pair, brain_params, sequence_length)
group_spec, info_action_pair, _ = load_demonstration(file_path)
demo_buffer = make_demo_buffer(info_action_pair, group_spec, sequence_length)
brain_params = group_spec_to_brain_parameters("DemoBrain", group_spec)
return brain_params, demo_buffer

"The demonstration file or directory {} does not exist.".format(file_path)
)
brain_params = None
group_spec = None
brain_param_proto = None
info_action_pairs = []
total_expected = 0

if obs_decoded > 1:
agent_info_action = AgentInfoActionPairProto()
agent_info_action.ParseFromString(data[pos : pos + next_pos])
if brain_params is None:
brain_params = BrainParameters.from_proto(
if group_spec is None:
group_spec = agent_group_spec_from_proto(
brain_param_proto, agent_info_action.agent_info
)
info_action_pairs.append(agent_info_action)

obs_decoded += 1
if not brain_params:
if not group_spec:
return brain_params, info_action_pairs, total_expected
return group_spec, info_action_pairs, total_expected
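A usage sketch of the new return values (not part of the diff; "path/to/3DBall.demo" is a placeholder for a recorded demonstration file):

from mlagents.trainers.demo_loader import demo_to_buffer, load_demonstration

# load_demonstration now yields the AgentGroupSpec instead of BrainParameters.
group_spec, info_action_pairs, total_expected = load_demonstration("path/to/3DBall.demo")
print(group_spec.observation_shapes, len(info_action_pairs), total_expected)

# demo_to_buffer still returns legacy BrainParameters, now derived from the
# spec via group_spec_to_brain_parameters, alongside the filled AgentBuffer.
brain_params, demo_buffer = demo_to_buffer("path/to/3DBall.demo", 1)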

ml-agents/mlagents/trainers/env_manager.py (34 lines changed)


from abc import ABC, abstractmethod
from typing import List, Dict, NamedTuple, Iterable
from mlagents.trainers.brain import AllBrainInfo, BrainParameters
from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec, AgentGroup
from mlagents.trainers.brain import BrainParameters
AllStepResult = Dict[AgentGroup, BatchedStepResult]
AllGroupSpec = Dict[AgentGroup, AgentGroupSpec]
def get_global_agent_id(worker_id: int, agent_id: int) -> str:
"""
Create an agent id that is unique across environment workers using the worker_id.
"""
return f"${worker_id}-{agent_id}"
current_all_brain_info: AllBrainInfo
brain_name_to_action_info: Dict[str, ActionInfo]
current_all_step_result: AllStepResult
worker_id: int
brain_name_to_action_info: Dict[AgentGroup, ActionInfo]
def name_behavior_ids(self) -> Iterable[str]:
return self.current_all_brain_info.keys()
def name_behavior_ids(self) -> Iterable[AgentGroup]:
return self.current_all_step_result.keys()
@staticmethod
def empty(worker_id: int) -> "EnvironmentStep":
return EnvironmentStep({}, worker_id, {})
self.policies: Dict[str, Policy] = {}
self.policies: Dict[AgentGroup, Policy] = {}
def set_policy(self, brain_name: str, policy: Policy) -> None:
def set_policy(self, brain_name: AgentGroup, policy: Policy) -> None:
self.policies[brain_name] = policy
@abstractmethod

@property
@abstractmethod
def external_brains(self) -> Dict[str, BrainParameters]:
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
def get_properties(self) -> Dict[str, float]:
def get_properties(self) -> Dict[AgentGroup, float]:
pass
@abstractmethod
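A small sketch (not in the diff) of the reshaped EnvironmentStep, which now carries the worker_id, and of the global agent id helper defined here:

from mlagents.trainers.env_manager import EnvironmentStep, get_global_agent_id

# An empty step for worker 3, as the env managers now initialize previous_step.
step = EnvironmentStep.empty(3)
assert step.worker_id == 3
assert step.current_all_step_result == {}

# Global agent ids join the worker id and the local (per-environment) agent id.
assert get_global_agent_id(3, 42) == "$3-42"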

ml-agents/mlagents/trainers/models.py (2 lines changed)


) -> tf.Tensor:
"""
Creates image input op.
:param camera_parameters: Parameters for visual observation from BrainInfo.
:param camera_parameters: Parameters for visual observation.
:param name: Desired name of input op.
:return: input op.
"""

ml-agents/mlagents/trainers/policy.py (6 lines changed)


from abc import ABC, abstractmethod
from mlagents.trainers.brain import BrainInfo
from mlagents_envs.base_env import BatchedStepResult
def get_action(self, brain_info: BrainInfo) -> ActionInfo:
def get_action(
self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
pass
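To make the new signature concrete, here is a hypothetical RandomPolicy (not part of the codebase) that satisfies the updated get_action interface shown above:

import numpy as np
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.action_info import ActionInfo

class RandomPolicy:
    # Illustrative stand-in: get_action now receives a BatchedStepResult and the
    # worker_id instead of a BrainInfo.
    def __init__(self, act_size: int = 2):
        self.act_size = act_size

    def get_action(
        self, batched_step_result: BatchedStepResult, worker_id: int = 0
    ) -> ActionInfo:
        n = batched_step_result.n_agents()
        # The real TFPolicy uses worker_id to build global agent ids for memory
        # and previous-action lookups; this toy policy keeps no such state.
        actions = np.random.uniform(-1, 1, (n, self.act_size)).astype(np.float32)
        values = np.zeros(n, dtype=np.float32)
        return ActionInfo(actions, values, {}, list(batched_step_result.agent_id))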

ml-agents/mlagents/trainers/ppo/models.py (2 lines changed)


"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.
:param brain: BrainInfo used to generate specific network graph.
:param brain: brain parameters used to generate specific network graph.
:param lr: Learning rate.
:param lr_schedule: Learning rate decay schedule.
:param h_size: Size of hidden layers

ml-agents/mlagents/trainers/ppo/policy.py (20 lines changed)


import logging
import numpy as np
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, List
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel

self.update_dict.update(self.reward_signals[reward_signal].update_dict)
@timed
def evaluate(self, brain_info):
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, Any]:
:param brain_info: BrainInfo object containing inputs.
:param batched_step_result: BatchedStepResult object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
self.model.batch_size: len(brain_info.vector_observations),
self.model.batch_size: batched_step_result.n_agents(),
self.model.sequence_length: 1,
}
epsilon = None

brain_info.agents
global_agent_ids
feed_dict[self.model.memory_in] = self.retrieve_memories(brain_info.agents)
feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
size=(len(brain_info.vector_observations), self.model.act_size[0])
size=(batched_step_result.n_agents(), self.model.act_size[0])
feed_dict = self.fill_eval_dict(feed_dict, brain_info)
feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
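A sketch of how a caller would drive the new two-argument evaluate(). The helper name evaluate_with_global_ids is illustrative, and the global id format follows get_global_agent_id in env_manager.py; the policy argument is a PPOPolicy (or SACPolicy):

from typing import Any, Dict, List
from mlagents_envs.base_env import BatchedStepResult

def evaluate_with_global_ids(
    policy, batched_step: BatchedStepResult, worker_id: int = 0
) -> Dict[str, Any]:
    # evaluate() now takes the step data plus the worker-scoped global agent
    # ids, which it uses for recurrent memory lookups.
    global_ids: List[str] = [
        f"${worker_id}-{int(agent_id)}" for agent_id in batched_step.agent_id
    ]
    return policy.evaluate(batched_step, global_ids)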

ml-agents/mlagents/trainers/sac/models.py (2 lines changed)


"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.
:param brain: BrainInfo used to generate specific network graph.
:param brain: Brain parameters used to generate specific network graph.
:param lr: Learning rate.
:param lr_schedule: Learning rate decay schedule.
:param h_size: Size of hidden layers

ml-agents/mlagents/trainers/sac/policy.py (19 lines changed)


import logging
from typing import Dict, Any, Optional, Mapping
from typing import Dict, Any, Optional, Mapping, List
from mlagents.trainers.brain import BrainInfo, BrainParameters
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.sac.models import SACModel
from mlagents.trainers.tf_policy import TFPolicy

self, self.model, reward_signal, config
)
def evaluate(self, brain_info: BrainInfo) -> Dict[str, np.ndarray]:
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, np.ndarray]:
:param brain_info: BrainInfo object containing inputs.
:param batched_step_result: BatchedStepResult object containing inputs.
self.model.batch_size: len(brain_info.vector_observations),
self.model.batch_size: batched_step_result.n_agents(),
brain_info.agents
global_agent_ids
feed_dict[self.model.memory_in] = self.retrieve_memories(brain_info.agents)
feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, brain_info)
feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out

ml-agents/mlagents/trainers/simple_env_manager.py (46 lines changed)


from typing import Dict, List
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.base_env import BaseEnv, AgentGroup
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents.trainers.brain import BrainParameters, AllBrainInfo
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.brain_conversion_utils import (
step_result_to_brain_info,
group_spec_to_brain_parameters,
)
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
class SimpleEnvManager(EnvManager):

super().__init__()
self.shared_float_properties = float_prop_channel
self.env = env
self.previous_step: EnvironmentStep = EnvironmentStep({}, {})
self.previous_step: EnvironmentStep = EnvironmentStep.empty(0)
self.previous_all_action_info: Dict[str, ActionInfo] = {}
def step(self) -> List[EnvironmentStep]:

for brain_name, action_info in all_action_info.items():
self.env.set_actions(brain_name, action_info.action)
self.env.step()
all_brain_info = self._generate_all_brain_info()
step_brain_info = all_brain_info
all_step_result = self._generate_all_results()
step_info = EnvironmentStep(step_brain_info, self.previous_all_action_info)
step_info = EnvironmentStep(all_step_result, 0, self.previous_all_action_info)
self, config: Dict[str, float] = None
self, config: Dict[AgentGroup, float] = None
all_brain_info = self._generate_all_brain_info()
self.previous_step = EnvironmentStep(all_brain_info, {})
all_step_result = self._generate_all_results()
self.previous_step = EnvironmentStep(all_step_result, 0, {})
def external_brains(self) -> Dict[str, BrainParameters]:
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
result = {}
for brain_name in self.env.get_agent_groups():
result[brain_name] = group_spec_to_brain_parameters(

@property
def get_properties(self) -> Dict[str, float]:
def get_properties(self) -> Dict[AgentGroup, float]:
return self.shared_float_properties.get_property_dict_copy()
def close(self):

def _take_step(self, last_step: EnvironmentStep) -> Dict[str, ActionInfo]:
def _take_step(self, last_step: EnvironmentStep) -> Dict[AgentGroup, ActionInfo]:
for brain_name, brain_info in last_step.current_all_brain_info.items():
for brain_name, step_info in last_step.current_all_step_result.items():
brain_info
step_info,
0, # As there is only one worker, we assign the worker_id to 0.
def _generate_all_brain_info(self) -> AllBrainInfo:
all_brain_info = {}
def _generate_all_results(self) -> AllStepResult:
all_step_result: AllStepResult = {}
all_brain_info[brain_name] = step_result_to_brain_info(
self.env.get_step_result(brain_name),
self.env.get_agent_group_spec(brain_name),
)
return all_brain_info
all_step_result[brain_name] = self.env.get_step_result(brain_name)
return all_step_result
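The same collection pattern, written as a standalone sketch (generate_all_results is an illustrative name, not a function in the repo): the managers now hand BatchedStepResult objects to the trainers directly, with no BrainInfo conversion.

from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.env_manager import AllStepResult

def generate_all_results(env: BaseEnv) -> AllStepResult:
    # Collect the current BatchedStepResult for every agent group, mirroring the
    # _generate_all_results helpers in both environment managers.
    all_step_result: AllStepResult = {}
    for group_name in env.get_agent_groups():
        all_step_result[group_name] = env.get_step_result(group_name)
    return all_step_result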

ml-agents/mlagents/trainers/subprocess_env_manager.py (51 lines changed)


from multiprocessing import Process, Pipe, Queue
from multiprocessing.connection import Connection
from queue import Empty as EmptyQueueException
from mlagents_envs.base_env import BaseEnv
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.base_env import BaseEnv, AgentGroup
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
from mlagents_envs.timers import (
TimerNode,
timed,

)
from mlagents.trainers.brain import AllBrainInfo, BrainParameters
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.side_channel.float_properties_channel import FloatPropertiesChannel
from mlagents_envs.side_channel.engine_configuration_channel import (

from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents.trainers.brain_conversion_utils import (
step_result_to_brain_info,
group_spec_to_brain_parameters,
)
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
logger = logging.getLogger("mlagents.trainers")

class StepResponse(NamedTuple):
all_brain_info: AllBrainInfo
all_step_result: AllStepResult
timer_root: Optional[TimerNode]

self.worker_id = worker_id
self.conn = conn
self.previous_step: EnvironmentStep = EnvironmentStep({}, {})
self.previous_step: EnvironmentStep = EnvironmentStep.empty(worker_id)
self.previous_all_action_info: Dict[str, ActionInfo] = {}
self.waiting = False

def _send_response(cmd_name, payload):
parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload))
def _generate_all_brain_info() -> AllBrainInfo:
all_brain_info = {}
def _generate_all_results() -> AllStepResult:
all_step_result: AllStepResult = {}
all_brain_info[brain_name] = step_result_to_brain_info(
env.get_step_result(brain_name),
env.get_agent_group_spec(brain_name),
worker_id,
)
return all_brain_info
all_step_result[brain_name] = env.get_step_result(brain_name)
return all_step_result
def external_brains():
result = {}

if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)
env.step()
all_brain_info = _generate_all_brain_info()
all_step_result = _generate_all_results()
step_response = StepResponse(all_brain_info, get_timer_root())
step_response = StepResponse(all_step_result, get_timer_root())
step_queue.put(EnvironmentResponse("step", worker_id, step_response))
reset_timers()
elif cmd.name == "external_brains":

for k, v in cmd.payload.items():
shared_float_properties.set_property(k, v)
env.reset()
all_brain_info = _generate_all_brain_info()
_send_response("reset", all_brain_info)
all_step_result = _generate_all_results()
_send_response("reset", all_step_result)
elif cmd.name == "close":
break
except (KeyboardInterrupt, UnityCommunicationException, UnityTimeOutException):

ew.send("reset", config)
# Next (synchronously) collect the reset observations from each worker in sequence
for ew in self.env_workers:
ew.previous_step = EnvironmentStep(ew.recv().payload, {})
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {})
def external_brains(self) -> Dict[str, BrainParameters]:
def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
def get_properties(self) -> Dict[str, float]:
def get_properties(self) -> Dict[AgentGroup, float]:
self.env_workers[0].send("get_properties")
return self.env_workers[0].recv().payload

payload: StepResponse = step.payload
env_worker = self.env_workers[step.worker_id]
new_step = EnvironmentStep(
payload.all_brain_info, env_worker.previous_all_action_info
payload.all_step_result,
step.worker_id,
env_worker.previous_all_action_info,
)
step_infos.append(new_step)
env_worker.previous_step = new_step

return step_infos
@timed
def _take_step(self, last_step: EnvironmentStep) -> Dict[str, ActionInfo]:
def _take_step(self, last_step: EnvironmentStep) -> Dict[AgentGroup, ActionInfo]:
for brain_name, brain_info in last_step.current_all_brain_info.items():
for brain_name, batch_step_result in last_step.current_all_step_result.items():
brain_info
batch_step_result, last_step.worker_id
)
return all_action_info

ml-agents/mlagents/trainers/tests/mock_brain.py (261 lines changed)


from unittest import mock
from typing import List
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import BatchedStepResult
def create_mock_brainparams(

return mock_brain()
def create_mock_braininfo(
num_agents=1,
num_vector_observations=0,
num_vis_observations=0,
num_vector_acts=2,
discrete=False,
num_discrete_branches=1,
):
def create_mock_batchedstep(
num_agents: int = 1,
num_vector_observations: int = 0,
num_vis_observations: int = 0,
action_shape: List[int] = None,
discrete: bool = False,
) -> BatchedStepResult:
Creates a mock BrainInfo with observations. Imitates constant
Creates a mock BatchedStepResult with observations. Imitates constant
:int num_agents: Number of "agents" to imitate in your BrainInfo values.
:int num_agents: Number of "agents" to imitate.
mock_braininfo = mock.Mock()
if action_shape is None:
action_shape = [2]
mock_braininfo.return_value.visual_observations = num_vis_observations * [
np.ones((num_agents, 84, 84, 3), dtype=np.float32)
]
mock_braininfo.return_value.vector_observations = np.array(
num_agents * [num_vector_observations * [1]], dtype=np.float32
)
if discrete:
mock_braininfo.return_value.previous_vector_actions = np.array(
num_agents * [num_discrete_branches * [0.5]], dtype=np.float32
obs_list = []
for _ in range(num_vis_observations):
obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
if num_vector_observations > 1:
obs_list.append(
np.array(num_agents * [num_vector_observations * [1]], dtype=np.float32)
mock_braininfo.return_value.action_masks = np.array(
num_agents * [num_vector_acts * [1.0]], dtype=np.float32
)
else:
mock_braininfo.return_value.previous_vector_actions = np.array(
num_agents * [num_vector_acts * [0.5]], dtype=np.float32
)
mock_braininfo.return_value.memories = np.ones((num_agents, 8), dtype=np.float32)
mock_braininfo.return_value.rewards = num_agents * [1.0]
mock_braininfo.return_value.local_done = num_agents * [False]
mock_braininfo.return_value.max_reached = num_agents * [100]
mock_braininfo.return_value.action_masks = num_agents * [num_vector_acts * [1.0]]
mock_braininfo.return_value.agents = range(0, num_agents)
return mock_braininfo()
action_mask = None
if discrete:
action_mask = [
np.array(num_agents * [action_size * [False]])
for action_size in action_shape
]
reward = np.array(num_agents * [1.0], dtype=np.float32)
done = np.array(num_agents * [False], dtype=np.bool)
max_step = np.array(num_agents * [False], dtype=np.bool)
agent_id = np.arange(num_agents, dtype=np.int32)
return BatchedStepResult(obs_list, reward, done, max_step, agent_id, action_mask)
def create_batchedstep_from_brainparams(
brain_params: BrainParameters, num_agents: int = 1
) -> BatchedStepResult:
return create_mock_batchedstep(
num_agents=num_agents,
num_vector_observations=brain_params.vector_observation_space_size,
num_vis_observations=brain_params.number_visual_observations,
action_shape=brain_params.vector_action_space_size,
discrete=brain_params.vector_action_space_type == "discrete",
)
def setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo):
def make_fake_trajectory(
length: int,
max_step_complete: bool = False,
vec_obs_size: int = 1,
num_vis_obs: int = 1,
action_space: List[int] = None,
memory_size: int = 10,
is_discrete: bool = True,
) -> Trajectory:
Takes a mock UnityEnvironment and adds the appropriate properties, defined by the mock
BrainParameters and BrainInfo.
Makes a fake trajectory of length length. If max_step_complete,
the trajectory is terminated by a max step rather than a done.
"""
if action_space is None:
action_space = [2]
steps_list = []
for _i in range(length - 1):
obs = []
for _j in range(num_vis_obs):
obs.append(np.ones((84, 84, 3), dtype=np.float32))
obs.append(np.ones(vec_obs_size, dtype=np.float32))
reward = 1.0
done = False
if is_discrete:
action_size = len(action_space)
else:
action_size = action_space[0]
action = np.zeros(action_size, dtype=np.float32)
action_probs = np.ones(action_size, dtype=np.float32)
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[[False for _ in range(branch)] for branch in action_space]
if is_discrete
else None
)
prev_action = np.ones(action_size, dtype=np.float32)
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"
behavior_id = "test_brain"
experience = AgentExperience(
obs=obs,
reward=reward,
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
steps_list.append(experience)
last_experience = AgentExperience(
obs=obs,
reward=reward,
done=not max_step_complete,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step_complete,
memory=memory,
)
steps_list.append(last_experience)
return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
)
:Mock mock_env: A mock UnityEnvironment, usually empty.
:Mock mock_brain: A mock Brain object that specifies the params of this environment.
:Mock mock_braininfo: A mock BrainInfo object that will be returned at each step and reset.
"""
brain_name = mock_brain.brain_name
mock_env.return_value.academy_name = "MockAcademy"
mock_env.return_value.brains = {brain_name: mock_brain}
mock_env.return_value.external_brain_names = [brain_name]
mock_env.return_value.reset.return_value = {brain_name: mock_braininfo}
mock_env.return_value.step.return_value = {brain_name: mock_braininfo}
def simulate_rollout(
length: int,
brain_params: BrainParameters,
memory_size: int = 10,
exclude_key_list: List[str] = None,
) -> AgentBuffer:
vec_obs_size = brain_params.vector_observation_space_size
num_vis_obs = brain_params.number_visual_observations
action_space = brain_params.vector_action_space_size
is_discrete = brain_params.vector_action_space_type == "discrete"
def simulate_rollout(env, policy, buffer_init_samples, exclude_key_list=None):
brain_info_list = []
for _ in range(buffer_init_samples):
brain_info_list.append(env.step()[env.external_brain_names[0]])
buffer = create_buffer(brain_info_list, policy.brain, policy.sequence_length)
trajectory = make_fake_trajectory(
length,
vec_obs_size=vec_obs_size,
num_vis_obs=num_vis_obs,
action_space=action_space,
memory_size=memory_size,
is_discrete=is_discrete,
)
buffer = trajectory.to_agentbuffer()
# If a key_list was given, remove those keys
if exclude_key_list:
for key in exclude_key_list:

def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
buffer = AgentBuffer()
update_buffer = AgentBuffer()
# Make a buffer
for idx, experience in enumerate(brain_infos):
if idx > len(brain_infos) - 2:
break
current_brain_info = experience
next_brain_info = brain_infos[idx + 1]
buffer.last_brain_info = current_brain_info
buffer["done"].append(next_brain_info.local_done[0])
buffer["rewards"].append(next_brain_info.rewards[0])
for i in range(brain_params.number_visual_observations):
buffer["visual_obs%d" % i].append(
current_brain_info.visual_observations[i][0]
)
buffer["next_visual_obs%d" % i].append(
current_brain_info.visual_observations[i][0]
)
if brain_params.vector_observation_space_size > 0:
buffer["vector_obs"].append(current_brain_info.vector_observations[0])
buffer["next_vector_in"].append(current_brain_info.vector_observations[0])
fake_action_size = len(brain_params.vector_action_space_size)
if brain_params.vector_action_space_type == "continuous":
fake_action_size = brain_params.vector_action_space_size[0]
buffer["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer["masks"].append(1.0)
buffer["advantages"].append(1.0)
if brain_params.vector_action_space_type == "discrete":
buffer["action_probs"].append(
np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
)
else:
buffer["action_probs"].append(
np.ones(buffer["actions"][0].shape, dtype=np.float32)
)
buffer["actions_pre"].append(
np.ones(buffer["actions"][0].shape, dtype=np.float32)
)
buffer["action_mask"].append(
np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
)
buffer["memory"].append(np.ones(memory_size, dtype=np.float32))
buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=sequence_length
)
return update_buffer
def setup_mock_env_and_brains(
mock_env,
def setup_mock_brain(
num_agents=12,
discrete_action_space=None,
vector_action_space=None,
vector_obs_space=8,

else vector_action_space,
vector_observation_space_size=vector_obs_space,
)
mock_braininfo = create_mock_braininfo(
num_agents=num_agents,
num_vector_observations=vector_obs_space,
num_vector_acts=sum(
discrete_action_space if use_discrete else vector_action_space
),
discrete=use_discrete,
num_discrete_branches=len(discrete_action_space),
)
else:
mock_brain = create_mock_brainparams(
vector_action_space_type="discrete" if use_discrete else "continuous",

vector_observation_space_size=0,
number_visual_observations=1,
)
mock_braininfo = create_mock_braininfo(
num_agents=num_agents,
num_vis_observations=1,
num_vector_acts=sum(
discrete_action_space if use_discrete else vector_action_space
),
discrete=use_discrete,
num_discrete_branches=len(discrete_action_space),
)
setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
env = mock_env()
return env, mock_brain, mock_braininfo
return mock_brain
def create_mock_3dball_brain():
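A usage sketch for the new test helper (not in the diff; assumes the ml-agents test modules are importable):

import mlagents.trainers.tests.mock_brain as mb

# Two agents, one 8-dim vector observation and one visual observation,
# continuous actions of size 2 (create_mock_batchedstep replaces the old
# create_mock_braininfo helper).
mock_step = mb.create_mock_batchedstep(
    num_agents=2,
    num_vector_observations=8,
    num_vis_observations=1,
    action_shape=[2],
    discrete=False,
)
assert mock_step.n_agents() == 2
assert len(mock_step.obs) == 2  # one visual array plus one stacked vector array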

ml-agents/mlagents/trainers/tests/test_agent_processor.py (10 lines changed)


"pre_action": [0.1, 0.1],
"log_probs": [0.1, 0.1],
}
mock_braininfo = mb.create_mock_braininfo(
mock_step = mb.create_mock_batchedstep(
num_vector_acts=2,
action_shape=[2],
num_vis_observations=num_vis_obs,
)
fake_action_info = ActionInfo(

agents=mock_braininfo.agents,
agent_ids=mock_step.agent_id,
processor.add_experiences(mock_braininfo, ActionInfo([], [], {}, []))
processor.add_experiences(mock_step, 0, ActionInfo([], [], {}, []))
processor.add_experiences(mock_braininfo, fake_action_info)
processor.add_experiences(mock_step, 0, fake_action_info)
# Assert that two trajectories have been added to the Trainer
assert len(tqueue.put.call_args_list) == 2

ml-agents/mlagents/trainers/tests/test_bcmodule.py (67 lines changed)


from unittest import mock
import pytest
import mlagents.trainers.tests.mock_brain as mb

)
def create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, use_rnn, demo_file
):
mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
env = mock_env()
model_path = env.external_brain_names[0]
trainer_config["model_path"] = model_path
def create_policy_with_bc_mock(mock_brain, trainer_config, use_rnn, demo_file):
# model_path = env.external_brain_names[0]
trainer_config["model_path"] = "testpath"
trainer_config["keep_checkpoints"] = 3
trainer_config["use_recurrent"] = use_rnn
trainer_config["behavioral_cloning"]["demo_path"] = (

if trainer_config["trainer"] == "ppo"
else SACPolicy(0, mock_brain, trainer_config, False, False)
)
return env, policy
return policy
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_defaults(mock_env):
def test_bcmodule_defaults():
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)
policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
env.close()
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)
policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
env.close()
# Test with continuous control env and vector actions

@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_update(mock_env, trainer_config):
def test_bcmodule_update(trainer_config):
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)
policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
env.close()
# Test with constant pretraining learning rate

@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_constant_lr_update(mock_env, trainer_config):
def test_bcmodule_constant_lr_update(trainer_config):
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "test.demo"
)
policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
stats = policy.bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)

@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_rnn_update(mock_env, trainer_config):
def test_bcmodule_rnn_update(trainer_config):
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, True, "test.demo"
)
policy = create_policy_with_bc_mock(mock_brain, trainer_config, True, "test.demo")
env.close()
# Test with discrete control and visual observations

@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_dc_visual_update(mock_env, trainer_config):
def test_bcmodule_dc_visual_update(trainer_config):
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, False, "testdcvis.demo"
policy = create_policy_with_bc_mock(
mock_brain, trainer_config, False, "testdcvis.demo"
env.close()
# Test with discrete control, visual observations and RNN

@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_bcmodule_rnn_dc_update(mock_env, trainer_config):
def test_bcmodule_rnn_dc_update(trainer_config):
env, policy = create_policy_with_bc_mock(
mock_env, mock_brain, trainer_config, True, "testdcvis.demo"
policy = create_policy_with_bc_mock(
mock_brain, trainer_config, True, "testdcvis.demo"
env.close()
if __name__ == "__main__":

ml-agents/mlagents/trainers/tests/test_demo_loader.py (11 lines changed)


import os
import numpy as np
from mlagents.trainers.demo_loader import load_demonstration, demo_to_buffer

brain_parameters, pair_infos, total_expected = load_demonstration(
group_spec, pair_infos, total_expected = load_demonstration(
assert brain_parameters.brain_name == "Ball3DBrain"
assert brain_parameters.vector_observation_space_size == 8
assert np.sum(group_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1)

def test_load_demo_dir():
path_prefix = os.path.dirname(os.path.abspath(__file__))
brain_parameters, pair_infos, total_expected = load_demonstration(
group_spec, pair_infos, total_expected = load_demonstration(
assert brain_parameters.brain_name == "3DBall"
assert brain_parameters.vector_observation_space_size == 8
assert np.sum(group_spec.observation_shapes[0]) == 8
assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1)

37
ml-agents/mlagents/trainers/tests/test_policy.py


from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.brain import BrainInfo
from mlagents_envs.base_env import BatchedStepResult, AgentGroupSpec
from mlagents.trainers.action_info import ActionInfo
from unittest.mock import MagicMock
import numpy as np

def test_take_action_returns_empty_with_no_agents():
test_seed = 3
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
no_agent_brain_info = BrainInfo([], [], [], agents=[])
result = policy.get_action(no_agent_brain_info)
# Doesn't really matter what this is
dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
no_agent_step = BatchedStepResult.empty(dummy_groupspec)
result = policy.get_action(no_agent_step)
assert result == ActionInfo([], [], {}, [])

policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
brain_info_with_agents = BrainInfo(
[], [], [], agents=["an-agent-id"], local_done=[False]
step_with_agents = BatchedStepResult(
[],
np.array([], dtype=np.float32),
np.array([False], dtype=np.bool),
np.array([], dtype=np.bool),
np.array([0]),
None,
result = policy.get_action(brain_info_with_agents)
assert result == ActionInfo(None, None, {}, ["an-agent-id"])
result = policy.get_action(step_with_agents, worker_id=0)
assert result == ActionInfo(None, None, {}, [0])
def test_take_action_returns_action_info_when_available():

"value": np.array([1.1], dtype=np.float32),
}
policy.evaluate = MagicMock(return_value=policy_eval_out)
brain_info_with_agents = BrainInfo(
[], [], [], agents=["an-agent-id"], local_done=[False]
step_with_agents = BatchedStepResult(
[],
np.array([], dtype=np.float32),
np.array([False], dtype=np.bool),
np.array([], dtype=np.bool),
np.array([0]),
None,
result = policy.get_action(brain_info_with_agents)
result = policy.get_action(step_with_agents)
policy_eval_out["action"],
policy_eval_out["value"],
policy_eval_out,
["an-agent-id"],
policy_eval_out["action"], policy_eval_out["value"], policy_eval_out, [0]
)
assert result == expected
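Both tests above exercise the new TFPolicy.get_action signature: it takes a BatchedStepResult plus a worker_id and reports agent ids rather than BrainInfo agent strings. A condensed sketch of the two paths, reusing this file's basic_mock_brain() and basic_params() helpers (observation shape and dtypes are illustrative):

import numpy as np
from unittest.mock import MagicMock
from mlagents_envs.base_env import AgentGroupSpec, BatchedStepResult
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.action_info import ActionInfo

policy = TFPolicy(3, basic_mock_brain(), basic_params())  # helpers defined in this file
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()

# No agents in the step: get_action short-circuits to an empty ActionInfo.
dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
assert policy.get_action(BatchedStepResult.empty(dummy_groupspec)) == ActionInfo([], [], {}, [])

# One agent with id 0: the mocked evaluate() output is propagated into the ActionInfo.
step = BatchedStepResult(
    [np.zeros((1, 1), dtype=np.float32)],   # obs
    np.array([0.0], dtype=np.float32),      # rewards
    np.array([False], dtype=np.bool),       # done flags
    np.array([False], dtype=np.bool),       # max_step flags
    np.array([0], dtype=np.int32),          # agent ids
    None,                                   # action mask
)
result = policy.get_action(step, worker_id=0)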

43
ml-agents/mlagents/trainers/tests/test_ppo.py


from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.brain_conversion_utils import (
step_result_to_brain_info,
group_spec_to_brain_parameters,
)
from mlagents.trainers.brain_conversion_utils import group_spec_to_brain_parameters
@pytest.fixture

env = UnityEnvironment(" ")
env.reset()
brain_name = env.get_agent_groups()[0]
brain_info = step_result_to_brain_info(
env.get_step_result(brain_name), env.get_agent_group_spec(brain_name)
)
batched_step = env.get_step_result(brain_name)
brain_params = group_spec_to_brain_parameters(
brain_name, env.get_agent_group_spec(brain_name)
)

trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
run_out = policy.evaluate(brain_info)
run_out = policy.evaluate(batched_step, list(batched_step.agent_id))
assert run_out["action"].shape == (3, 2)
env.close()

max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
for key, val in run_out.items():

assert trainer.step == step_count
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
env, mock_brain, _ = mb.setup_mock_env_and_brains(
mock_env,
def test_trainer_update_policy(dummy_config, use_discrete):
mock_brain = mb.setup_mock_brain(
num_agents=NUM_AGENTS,
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,

policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
buffer["extrinsic_rewards"] = buffer["rewards"]
buffer["extrinsic_returns"] = buffer["rewards"]
buffer["extrinsic_value_estimates"] = buffer["rewards"]
buffer["curiosity_rewards"] = buffer["rewards"]
buffer["curiosity_returns"] = buffer["rewards"]
buffer["curiosity_value_estimates"] = buffer["rewards"]
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
buffer["extrinsic_returns"] = buffer["environment_rewards"]
buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
buffer["curiosity_rewards"] = buffer["environment_rewards"]
buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
trainer.update_buffer = buffer
trainer._update_policy()

max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
trajectory_queue.put(trajectory)
trainer.advance()

max_step_complete=False,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
trajectory_queue.put(trajectory)
trainer.advance()

max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
# Change half of the obs to 0
for i in range(3):

max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
trainer._process_trajectory(trajectory)
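Two helper changes drive most of the churn above: mock_brain.simulate_rollout is now driven by the brain mock alone (no environment), and the raw reward column it produces is named environment_rewards. A sketch of preparing such a buffer for _update_policy, assuming the trainer, mock_brain and BUFFER_INIT_SAMPLES set up earlier in this file:

import mlagents.trainers.tests.mock_brain as mb

# No mocked UnityEnvironment is needed any more.
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)

# Derive the reward-signal columns the PPO update expects from the raw rewards.
for prefix in ("extrinsic", "curiosity"):
    buffer[prefix + "_rewards"] = buffer["environment_rewards"]
    buffer[prefix + "_returns"] = buffer["environment_rewards"]
    buffer[prefix + "_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]

trainer.update_buffer = buffer
trainer._update_policy()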

120
ml-agents/mlagents/trainers/tests/test_reward_signals.py


from unittest import mock
import numpy as np
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.sac.policy import SACPolicy

VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
BATCH_SIZE = 12
mock_env, trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
env, mock_brain, _ = mb.setup_mock_env_and_brains(
mock_env,
mock_brain = mb.setup_mock_brain(
num_agents=NUM_AGENTS,
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,

model_path = env.external_brain_names[0]
model_path = "testpath"
trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)

else:
policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
return env, policy
return policy
def reward_signal_eval(env, policy, reward_signal_name):
brain_infos = env.reset()
brain_info = brain_infos[env.external_brain_names[0]]
next_brain_info = env.step()[env.external_brain_names[0]]
def reward_signal_eval(policy, reward_signal_name):
buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain)
action = np.ones((len(brain_info.agents), policy.num_branches), dtype=np.float32)
rsig_result = policy.reward_signals[reward_signal_name].evaluate(
brain_info, action, next_brain_info
)
assert rsig_result.scaled_reward.shape == (NUM_AGENTS,)
assert rsig_result.unscaled_reward.shape == (NUM_AGENTS,)
rsig_result = policy.reward_signals[reward_signal_name].evaluate_batch(buffer)
assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)
def reward_signal_update(env, policy, reward_signal_name):
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
def reward_signal_update(policy, reward_signal_name):
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
policy.model, buffer.make_mini_batch(0, 10), 2
)

@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_gail_cc(mock_env, trainer_config, gail_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, gail_dummy_config, False, False, False
)
reward_signal_eval(env, policy, "gail")
reward_signal_update(env, policy, "gail")
def test_gail_cc(trainer_config, gail_dummy_config):
policy = create_policy_mock(trainer_config, gail_dummy_config, False, False, False)
reward_signal_eval(policy, "gail")
reward_signal_update(policy, "gail")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_gail_dc_visual(mock_env, trainer_config, gail_dummy_config):
def test_gail_dc_visual(trainer_config, gail_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, gail_dummy_config, False, True, True
)
reward_signal_eval(env, policy, "gail")
reward_signal_update(env, policy, "gail")
policy = create_policy_mock(trainer_config, gail_dummy_config, False, True, True)
reward_signal_eval(policy, "gail")
reward_signal_update(policy, "gail")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_gail_rnn(mock_env, trainer_config, gail_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, gail_dummy_config, True, False, False
)
reward_signal_eval(env, policy, "gail")
reward_signal_update(env, policy, "gail")
def test_gail_rnn(trainer_config, gail_dummy_config):
policy = create_policy_mock(trainer_config, gail_dummy_config, True, False, False)
reward_signal_eval(policy, "gail")
reward_signal_update(policy, "gail")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_curiosity_cc(mock_env, trainer_config, curiosity_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, curiosity_dummy_config, False, False, False
def test_curiosity_cc(trainer_config, curiosity_dummy_config):
policy = create_policy_mock(
trainer_config, curiosity_dummy_config, False, False, False
reward_signal_eval(env, policy, "curiosity")
reward_signal_update(env, policy, "curiosity")
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_curiosity_dc(mock_env, trainer_config, curiosity_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, curiosity_dummy_config, False, True, False
def test_curiosity_dc(trainer_config, curiosity_dummy_config):
policy = create_policy_mock(
trainer_config, curiosity_dummy_config, False, True, False
reward_signal_eval(env, policy, "curiosity")
reward_signal_update(env, policy, "curiosity")
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_curiosity_visual(mock_env, trainer_config, curiosity_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, curiosity_dummy_config, False, False, True
def test_curiosity_visual(trainer_config, curiosity_dummy_config):
policy = create_policy_mock(
trainer_config, curiosity_dummy_config, False, False, True
reward_signal_eval(env, policy, "curiosity")
reward_signal_update(env, policy, "curiosity")
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_curiosity_rnn(mock_env, trainer_config, curiosity_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, curiosity_dummy_config, True, False, False
def test_curiosity_rnn(trainer_config, curiosity_dummy_config):
policy = create_policy_mock(
trainer_config, curiosity_dummy_config, True, False, False
reward_signal_eval(env, policy, "curiosity")
reward_signal_update(env, policy, "curiosity")
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_extrinsic(mock_env, trainer_config, curiosity_dummy_config):
env, policy = create_policy_mock(
mock_env, trainer_config, curiosity_dummy_config, False, False, False
def test_extrinsic(trainer_config, curiosity_dummy_config):
policy = create_policy_mock(
trainer_config, curiosity_dummy_config, False, False, False
reward_signal_eval(env, policy, "extrinsic")
reward_signal_update(env, policy, "extrinsic")
reward_signal_eval(policy, "extrinsic")
reward_signal_update(policy, "extrinsic")
if __name__ == "__main__":

120
ml-agents/mlagents/trainers/tests/test_sac.py


from unittest import mock
import pytest
import yaml

from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

NUM_AGENTS = 12
def create_sac_policy_mock(mock_env, dummy_config, use_rnn, use_discrete, use_visual):
env, mock_brain, _ = mb.setup_mock_env_and_brains(
mock_env,
def create_sac_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
mock_brain = mb.setup_mock_brain(
num_agents=NUM_AGENTS,
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,

model_path = env.external_brain_names[0]
model_path = "testmodel"
return env, policy
return policy
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_sac_cc_policy(mock_env, dummy_config):
def test_sac_cc_policy(dummy_config):
env, policy = create_sac_policy_mock(
mock_env, dummy_config, use_rnn=False, use_discrete=False, use_visual=False
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
brain_infos = env.reset()
brain_info = brain_infos[env.external_brain_names[0]]
run_out = policy.evaluate(brain_info)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
env.close()
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_sac_update_reward_signals(mock_env, dummy_config, discrete):
def test_sac_update_reward_signals(dummy_config, discrete):
# Test evaluate
tf.reset_default_graph()
# Add a Curiosity module

dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
env, policy = create_sac_policy_mock(
mock_env, dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
update_buffer = mb.simulate_rollout(
env, policy, BUFFER_INIT_SAMPLES, exclude_key_list=["advantages", "actions_pre"]
)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["curiosity_rewards"] = update_buffer["rewards"]
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
update_buffer["curiosity_rewards"] = update_buffer["environment_rewards"]
env.close()
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_sac_dc_policy(mock_env, dummy_config):
def test_sac_dc_policy(dummy_config):
env, policy = create_sac_policy_mock(
mock_env, dummy_config, use_rnn=False, use_discrete=True, use_visual=False
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=True, use_visual=False
brain_infos = env.reset()
brain_info = brain_infos[env.external_brain_names[0]]
run_out = policy.evaluate(brain_info)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
env.close()
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_sac_visual_policy(mock_env, dummy_config):
def test_sac_visual_policy(dummy_config):
env, policy = create_sac_policy_mock(
mock_env, dummy_config, use_rnn=False, use_discrete=True, use_visual=True
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=True, use_visual=True
brain_infos = env.reset()
brain_info = brain_infos[env.external_brain_names[0]]
run_out = policy.evaluate(brain_info)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
@mock.patch("mlagents_envs.environment.UnityEnvironment")
def test_sac_rnn_policy(mock_env, dummy_config):
def test_sac_rnn_policy(dummy_config):
env, policy = create_sac_policy_mock(
mock_env, dummy_config, use_rnn=True, use_discrete=True, use_visual=False
policy = create_sac_policy_mock(
dummy_config, use_rnn=True, use_discrete=True, use_visual=False
brain_infos = env.reset()
brain_info = brain_infos[env.external_brain_names[0]]
run_out = policy.evaluate(brain_info)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
policy.update(update_buffer, num_sequences=2)
env.close()
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
update_buffer = AgentBuffer()
buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
run_out = policy.update(
update_buffer,
num_sequences=update_buffer.num_experiences // policy.sequence_length,
)
def test_sac_model_cc_vector():

def test_sac_save_load_buffer(tmpdir, dummy_config):
env, mock_brain, _ = mb.setup_mock_env_and_brains(
mock.Mock(),
mock_brain = mb.setup_mock_brain(
num_agents=NUM_AGENTS,
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,

policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.update_buffer = mb.simulate_rollout(
env, trainer.policy, BUFFER_INIT_SAMPLES
)
trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
buffer_len = trainer.update_buffer.num_experiences
trainer.save_model(mock_brain.brain_name)

trainer.subscribe_trajectory_queue(trajectory_queue)
trajectory = make_fake_trajectory(
length=15, max_step_complete=True, vec_obs_size=6, num_vis_obs=0, action_space=2
length=15,
max_step_complete=True,
vec_obs_size=6,
num_vis_obs=0,
action_space=[2],
)
trajectory_queue.put(trajectory)
trainer.advance()

max_step_complete=False,
vec_obs_size=6,
num_vis_obs=0,
action_space=2,
action_space=[2],
)
trajectory_queue.put(trajectory)
trainer.advance()
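For SAC the evaluation path now runs against a fabricated BatchedStepResult, and the RNN variant resequences the rollout into fixed-length sequences before updating. A sketch of both steps with this file's constants and the policy returned by create_sac_policy_mock:

import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.buffer import AgentBuffer

# Evaluate against a synthetic step instead of env.reset()/env.step().
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))

# RNN case: rebuild the buffer into sequences of length policy.sequence_length.
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
update_buffer = AgentBuffer()
buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
policy.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // policy.sequence_length,
)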

16
ml-agents/mlagents/trainers/tests/test_simple_rl.py


m_reward = np.array([reward], dtype=np.float32)
m_done = np.array([done], dtype=np.bool)
m_agent_id = np.array([0], dtype=np.int32)
action_mask = self._generate_mask()
m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
def _generate_mask(self):
if self.discrete:
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
else:
action_mask = None
return action_mask
def _reset_agent(self):
self.position = 0.0
self.step_count = 0

m_reward = np.array([0], dtype=np.float32)
m_done = np.array([False], dtype=np.bool)
m_agent_id = np.array([0], dtype=np.int32)
action_mask = self._generate_mask()
m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
)
@property

4
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


env.recv.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(
manager.env_workers[i].previous_step.current_all_brain_info, i
manager.env_workers[i].previous_step.current_all_step_result, i
)
assert res == list(map(lambda ew: ew.previous_step, manager.env_workers))

manager.step_queue.get_nowait.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(
manager.env_workers[i].previous_step.current_all_brain_info, i
manager.env_workers[i].previous_step.current_all_step_result, i
)
assert res == [
manager.env_workers[0].previous_step,

14
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


action_info_dict = {brain_name: MagicMock()}
brain_info_dict = {brain_name: Mock()}
old_step_info = EnvironmentStep(brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, action_info_dict)
old_step_info = EnvironmentStep(brain_info_dict, 0, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, 0, action_info_dict)
trainer_mock._is_ready_update = MagicMock(return_value=True)
env_mock = MagicMock()

manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
new_step_info.current_all_brain_info[brain_name],
new_step_info.current_all_step_result[brain_name],
0,
new_step_info.brain_name_to_action_info[brain_name],
)

action_info_dict = {brain_name: MagicMock()}
brain_info_dict = {brain_name: Mock()}
old_step_info = EnvironmentStep(brain_info_dict, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, action_info_dict)
old_step_info = EnvironmentStep(brain_info_dict, 0, action_info_dict)
new_step_info = EnvironmentStep(brain_info_dict, 0, action_info_dict)
trainer_mock._is_ready_update = MagicMock(return_value=False)

env_mock.step.assert_called_once()
manager_mock = tc.managers[brain_name]
manager_mock.add_experiences.assert_called_once_with(
new_step_info.current_all_brain_info[brain_name],
new_step_info.current_all_step_result[brain_name],
0,
new_step_info.brain_name_to_action_info[brain_name],
)
trainer_mock.advance.assert_called_once()
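EnvironmentStep now carries the worker id as its second field, and the controller hands it to the agent manager's add_experiences alongside the BatchedStepResult. A minimal sketch of the asserted call shape (brain_name, the mocks and the manager object are placeholders; EnvironmentStep is assumed here to live in mlagents.trainers.env_manager):

from unittest.mock import MagicMock, Mock
from mlagents.trainers.env_manager import EnvironmentStep

brain_name = "testbrain"                      # placeholder behavior id
step_result_dict = {brain_name: Mock()}       # one BatchedStepResult per behavior
action_info_dict = {brain_name: MagicMock()}

# worker_id (0 here) is now part of the step record itself.
step_info = EnvironmentStep(step_result_dict, 0, action_info_dict)

# The trainer controller forwards (step result, worker id, action info) to the manager.
manager.add_experiences(
    step_info.current_all_step_result[brain_name],
    step_info.worker_id,
    step_info.brain_name_to_action_info[brain_name],
)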

66
ml-agents/mlagents/trainers/tests/test_trajectory.py


import numpy as np
import pytest
from mlagents.trainers.trajectory import AgentExperience, Trajectory, SplitObservations
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
def make_fake_trajectory(
length: int,
max_step_complete: bool = False,
vec_obs_size: int = VEC_OBS_SIZE,
num_vis_obs: int = 1,
action_space: int = ACTION_SIZE,
) -> Trajectory:
"""
Makes a fake trajectory of length length. If max_step_complete,
the trajectory is terminated by a max step rather than a done.
"""
steps_list = []
for _i in range(length - 1):
obs = []
for _j in range(num_vis_obs):
obs.append(np.ones((84, 84, 3), dtype=np.float32))
obs.append(np.ones(vec_obs_size, dtype=np.float32))
reward = 1.0
done = False
action = np.zeros(action_space, dtype=np.float32)
action_probs = np.ones(action_space, dtype=np.float32)
action_pre = np.zeros(action_space, dtype=np.float32)
action_mask = np.ones(action_space, dtype=np.float32)
prev_action = np.ones(action_space, dtype=np.float32)
max_step = False
memory = np.ones(10, dtype=np.float32)
agent_id = "test_agent"
behavior_id = "test_brain"
experience = AgentExperience(
obs=obs,
reward=reward,
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
steps_list.append(experience)
last_experience = AgentExperience(
obs=obs,
reward=reward,
done=not max_step_complete,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
max_step=max_step_complete,
memory=memory,
)
steps_list.append(last_experience)
return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
)
@pytest.mark.parametrize("num_visual_obs", [0, 1, 2])
@pytest.mark.parametrize("num_vec_obs", [0, 1])
def test_split_obs(num_visual_obs, num_vec_obs):

"environment_rewards",
]
wanted_keys = set(wanted_keys)
trajectory = make_fake_trajectory(length=length)
trajectory = make_fake_trajectory(
length=length, vec_obs_size=VEC_OBS_SIZE, action_space=[ACTION_SIZE]
)
agentbuffer = trajectory.to_agentbuffer()
seen_keys = set()
for key, field in agentbuffer.items():

57
ml-agents/mlagents/trainers/tf_policy.py


from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.env_manager import get_global_agent_id
from mlagents_envs.base_env import BatchedStepResult
logger = logging.getLogger("mlagents.trainers")

)
self.saver.restore(self.sess, ckpt.model_checkpoint_path)
def evaluate(self, brain_info: BrainInfo) -> Dict[str, Any]:
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
) -> Dict[str, Any]:
:param brain_info: BrainInfo input to network.
:param batched_step_result: BatchedStepResult input to network.
def get_action(self, brain_info: BrainInfo) -> ActionInfo:
def get_action(
self, batched_step_result: BatchedStepResult, worker_id: int = 0
) -> ActionInfo:
:param brain_info: A dictionary of brain names and BrainInfo from environment.
:param batched_step_result: A dictionary of brain names and BatchedStepResult from environment.
:param worker_id: In parallel environment training, the unique id of the environment worker that
the BatchedStepResult came from. Used to construct a globally unique id for each agent.
if len(brain_info.agents) == 0:
if batched_step_result.n_agents() == 0:
for agent, done in zip(brain_info.agents, brain_info.local_done)
for agent, done in zip(
batched_step_result.agent_id, batched_step_result.done
)
if done
]

run_out = self.evaluate(brain_info) # pylint: disable=assignment-from-no-return
self.save_memories(brain_info.agents, run_out.get("memory_out"))
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))
for agent_id in batched_step_result.agent_id
] # For 1-D array, the iterator order is correct.
run_out = self.evaluate( # pylint: disable=assignment-from-no-return
batched_step_result, global_agent_ids
)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
agents=brain_info.agents,
agent_ids=batched_step_result.agent_id,
)
def update(self, mini_batch, num_sequences):

run_out = dict(zip(list(out_dict.keys()), network_out))
return run_out
def fill_eval_dict(self, feed_dict, brain_info):
for i, _ in enumerate(brain_info.visual_observations):
feed_dict[self.model.visual_in[i]] = brain_info.visual_observations[i]
def fill_eval_dict(self, feed_dict, batched_step_result):
vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
for i, _ in enumerate(vec_vis_obs.visual_observations):
feed_dict[self.model.visual_in[i]] = vec_vis_obs.visual_observations[i]
feed_dict[self.model.vector_in] = brain_info.vector_observations
feed_dict[self.model.vector_in] = vec_vis_obs.vector_observations
feed_dict[self.model.action_masks] = brain_info.action_masks
mask = np.ones(
(
batched_step_result.n_agents(),
np.sum(self.brain.vector_action_space_size),
),
dtype=np.float32,
)
if batched_step_result.action_mask is not None:
mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
feed_dict[self.model.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):
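The one behavioral subtlety in fill_eval_dict above is the action-mask handling: the model wants 1 for an available action, while BatchedStepResult flags unavailable ones, so the default is an all-ones mask that is only inverted when the step actually supplies a mask. A standalone restatement (the function name is illustrative; vector_action_space_size plays the role of self.brain.vector_action_space_size):

import numpy as np

def build_action_mask(batched_step_result, vector_action_space_size):
    # Default: every branch action is available for every agent.
    mask = np.ones(
        (batched_step_result.n_agents(), np.sum(vector_action_space_size)),
        dtype=np.float32,
    )
    if batched_step_result.action_mask is not None:
        # The step result marks unavailable actions, so invert to get 1 = allowed.
        mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
    return mask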

3
ml-agents/mlagents/trainers/trainer_controller.py


)
continue
self.managers[name_behavior_id].add_experiences(
step_info.current_all_brain_info[name_behavior_id],
step_info.current_all_step_result[name_behavior_id],
step_info.worker_id,
step_info.brain_name_to_action_info.get(
name_behavior_id, ActionInfo([], [], {}, [])
),

39
ml-agents/mlagents/trainers/trajectory.py


"""
vis_obs_list: List[np.ndarray] = []
vec_obs_list: List[np.ndarray] = []
last_obs = None
if len(observation.shape) == 1:
# Obs could be batched or single
if len(observation.shape) == 1 or len(observation.shape) == 2:
if len(observation.shape) == 3:
if len(observation.shape) == 3 or len(observation.shape) == 4:
vec_obs = (
np.concatenate(vec_obs_list, axis=0)
if len(vec_obs_list) > 0
else np.array([], dtype=np.float32)
)
last_obs = observation
if last_obs is not None:
is_batched = len(last_obs.shape) == 2 or len(last_obs.shape) == 4
if is_batched:
vec_obs = (
np.concatenate(vec_obs_list, axis=1)
if len(vec_obs_list) > 0
else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
)
else:
vec_obs = (
np.concatenate(vec_obs_list, axis=0)
if len(vec_obs_list) > 0
else np.array([], dtype=np.float32)
)
else:
vec_obs = []
return SplitObservations(
vector_observations=vec_obs, visual_observations=vis_obs_list
)

agent_buffer_trajectory["actions"].append(exp.action)
agent_buffer_trajectory["action_probs"].append(exp.action_probs)
# Store action masks if necessary. Eventually these will be
# None for continuous actions
# Store action masks if necessary. Note that 1 means active, while
# in AgentExperience False means active.
mask = 1 - np.concatenate(exp.action_mask)
agent_buffer_trajectory["action_mask"].append(mask, padding_value=1)
else:
# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
exp.action_mask, padding_value=1
np.ones(exp.action_probs.shape, dtype=np.float32), padding_value=1
)
agent_buffer_trajectory["prev_action"].append(exp.prev_action)
