
Action Model (#4580)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>
/fix-conflict-base-env
GitHub, 4 years ago
Current commit: 3c96a3a2
43 files changed, with 1337 insertions and 1004 deletions
  1. .github/workflows/pytest.yml (4)
  2. ml-agents-envs/mlagents_envs/base_env.py (93)
  3. ml-agents-envs/mlagents_envs/rpc_utils.py (2)
  4. ml-agents/mlagents/trainers/agent_processor.py (26)
  5. ml-agents/mlagents/trainers/demo_loader.py (5)
  6. ml-agents/mlagents/trainers/env_manager.py (21)
  7. ml-agents/mlagents/trainers/policy/policy.py (17)
  8. ml-agents/mlagents/trainers/policy/tf_policy.py (24)
  9. ml-agents/mlagents/trainers/policy/torch_policy.py (54)
  10. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6)
  11. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5)
  12. ml-agents/mlagents/trainers/ppo/trainer.py (2)
  13. ml-agents/mlagents/trainers/sac/optimizer_torch.py (319)
  14. ml-agents/mlagents/trainers/simple_env_manager.py (3)
  15. ml-agents/mlagents/trainers/subprocess_env_manager.py (7)
  16. ml-agents/mlagents/trainers/tests/mock_brain.py (20)
  17. ml-agents/mlagents/trainers/tests/simple_test_envs.py (49)
  18. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66)
  19. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (114)
  20. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2)
  21. ml-agents/mlagents/trainers/tests/test_agent_processor.py (27)
  22. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  23. ml-agents/mlagents/trainers/tests/test_trajectory.py (4)
  24. ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2)
  25. ml-agents/mlagents/trainers/tests/torch/test_networks.py (78)
  26. ml-agents/mlagents/trainers/tests/torch/test_policy.py (13)
  27. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (28)
  28. ml-agents/mlagents/trainers/tests/torch/test_sac.py (3)
  29. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (118)
  30. ml-agents/mlagents/trainers/tests/torch/test_utils.py (47)
  31. ml-agents/mlagents/trainers/torch/components/bc/module.py (35)
  32. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (75)
  33. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6)
  34. ml-agents/mlagents/trainers/torch/distributions.py (19)
  35. ml-agents/mlagents/trainers/torch/networks.py (183)
  36. ml-agents/mlagents/trainers/torch/utils.py (240)
  37. ml-agents/mlagents/trainers/trajectory.py (25)
  38. ml-agents/mlagents/trainers/tests/torch/test_action_model.py (81)
  39. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (122)
  40. ml-agents/mlagents/trainers/torch/action_flattener.py (44)
  41. ml-agents/mlagents/trainers/torch/action_log_probs.py (108)
  42. ml-agents/mlagents/trainers/torch/action_model.py (184)
  43. ml-agents/mlagents/trainers/torch/agent_action.py (58)

.github/workflows/pytest.yml (4)


- 'gym-unity/**'
- 'test_constraints*.txt'
- 'test_requirements.txt'
- '.github/workflows/pytest.yml'
push:
branches: [master]

run: python -c "import sys; print(sys.version)"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
# pin pip to workaround https://github.com/pypa/pip/issues/9180
python -m pip install pip==20.2
python -m pip install --upgrade setuptools
python -m pip install --progress-bar=off -e ./ml-agents-envs -c ${{ matrix.pip_constraints }}
python -m pip install --progress-bar=off -e ./ml-agents -c ${{ matrix.pip_constraints }}

ml-agents-envs/mlagents_envs/base_env.py (93)


)
class ActionTuple:
class _ActionTupleBase(ABC):
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
An object whose fields correspond to action data of continuous and discrete
spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
def __init__(
self,
continuous: Optional[np.ndarray] = None,
discrete: Optional[np.ndarray] = None,
):
self._continuous: Optional[np.ndarray] = None
self._discrete: Optional[np.ndarray] = None
if continuous is not None:
self.add_continuous(continuous)
if discrete is not None:
self.add_discrete(discrete)
@property
def continuous(self) -> np.ndarray:
return self._continuous
@property
def discrete(self) -> np.ndarray:
return self._discrete
def add_continuous(self, continuous: np.ndarray) -> None:
if self._discrete is None:
self._discrete = np.zeros(
(continuous.shape[0], 0), dtype=self.discrete_dtype
)
if discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
def add_discrete(self, discrete: np.ndarray) -> None:
if discrete.dtype != self.discrete_dtype:
discrete = discrete.astype(self.discrete_dtype, copy=False)
if self._continuous is None:
self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
def continuous(self) -> np.ndarray:
return self._continuous
@abstractmethod
def discrete_dtype(self) -> np.dtype:
pass
@property
def discrete(self) -> np.ndarray:
return self._discrete
@staticmethod
def create_continuous(continuous: np.ndarray) -> "ActionTuple":
discrete = np.zeros((continuous.shape[0], 0), dtype=np.int32)
return ActionTuple(continuous, discrete)
class ActionTuple(_ActionTupleBase):
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
@staticmethod
def create_discrete(discrete: np.ndarray) -> "ActionTuple":
continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
return ActionTuple(continuous, discrete)
@property
def discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete action.
"""
return np.int32
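
A minimal usage sketch (not part of the diff) of the optional-field construction shown above, assuming only the constructor and padding behavior visible in this hunk: shapes are (n_agents, size), and whichever field is omitted is padded with a zero-width array of the matching dtype.

import numpy as np
from mlagents_envs.base_env import ActionTuple

# Three agents, two continuous actions and one discrete branch.
continuous = np.zeros((3, 2), dtype=np.float32)
discrete = np.zeros((3, 1), dtype=np.int32)

hybrid = ActionTuple(continuous=continuous, discrete=discrete)
cont_only = ActionTuple(continuous=continuous)  # discrete padded to shape (3, 0)
disc_only = ActionTuple(discrete=discrete)      # continuous padded to shape (3, 0)

assert cont_only.discrete.shape == (3, 0)
assert disc_only.continuous.shape == (3, 0)
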
class ActionSpec(NamedTuple):

for a number of agents.
:param n_agents: The number of agents that will have actions generated
"""
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
_continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def random_action(self, n_agents: int) -> ActionTuple:
"""

"""
continuous = np.random.uniform(
_continuous = np.random.uniform(
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
discrete = np.column_stack(
_discrete = np.column_stack(
[
np.random.randint(
0,

for i in range(self.discrete_size)
]
)
return ActionTuple(continuous, discrete)
return ActionTuple(continuous=_continuous, discrete=_discrete)
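
A short sketch (not part of the diff) of how the zero-filled and random helpers above return an ActionTuple for a hybrid spec. The ActionSpec(continuous_size, discrete_branches) construction mirrors the call used in simple_test_envs.py below; the name empty_action for the zero-filled helper is assumed from the released mlagents_envs API.

import numpy as np
from mlagents_envs.base_env import ActionSpec

# Two continuous actions and two discrete branches with 3 and 2 options.
spec = ActionSpec(2, (3, 2))

empty = spec.empty_action(n_agents=4)
rand = spec.random_action(n_agents=4)

assert empty.continuous.shape == (4, 2) and empty.continuous.dtype == np.float32
assert empty.discrete.shape == (4, 2) and empty.discrete.dtype == np.int32
assert int(rand.discrete[:, 0].max()) < 3  # each branch samples in [0, branch_size)
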
def _validate_action(
self, actions: ActionTuple, n_agents: int, name: str

for the correct number of agents and ensures the type.
"""
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
if actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
if actions.discrete.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a discrete input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "

ml-agents-envs/mlagents_envs/rpc_utils.py (2)


from mlagents_envs.base_env import (
BehaviorSpec,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
)

ml-agents/mlagents/trainers/agent_processor.py (26)


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
ActionTuple,
DecisionSteps,
DecisionStep,
TerminalSteps,

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.behavior_id_utils import get_global_agent_id

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
stored_action_probs = stored_take_action_outputs["log_probs"]
log_probs_tuple = LogProbsTuple(
continuous=stored_action_probs.continuous[idx],
discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

action=action,
action_probs=action_probs,
action=action_tuple,
action_probs=log_probs_tuple,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
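
A brief sketch (not part of the diff) of the per-agent slicing performed above: the batched ActionTuple and LogProbsTuple returned by the policy are split into single-agent rows before being stored in the AgentExperience.

import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

# Batched outputs for 2 agents with 3 continuous actions and 1 discrete branch.
stored_actions = ActionTuple(
    continuous=np.zeros((2, 3), dtype=np.float32),
    discrete=np.zeros((2, 1), dtype=np.int32),
)
stored_log_probs = LogProbsTuple(
    continuous=np.zeros((2, 3), dtype=np.float32),
    discrete=np.zeros((2, 1), dtype=np.float32),
)

idx = 0  # position of one agent within the batch
action_tuple = ActionTuple(
    continuous=stored_actions.continuous[idx],
    discrete=stored_actions.discrete[idx],
)
log_probs_tuple = LogProbsTuple(
    continuous=stored_log_probs.continuous[idx],
    discrete=stored_log_probs.discrete[idx],
)
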

ml-agents/mlagents/trainers/demo_loader.py (5)


for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
if behavior_spec.action_spec.is_continuous():
# TODO: update to read from the new proto format
if behavior_spec.action_spec.continuous_size > 0:
else:
if behavior_spec.action_spec.discrete_size > 0:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions
)

ml-agents/mlagents/trainers/env_manager.py (21)


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (

BehaviorName,
ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

from mlagents_envs.logging_util import get_logger
from mlagents_envs.exception import UnityActionException
AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]]
AllGroupSpec = Dict[BehaviorName, BehaviorSpec]

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple(continuous, discrete)
else:
action_tuple = ActionTuple.create_continuous(continuous)
elif "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple.create_discrete(discrete)
else:
raise UnityActionException(
"The action dict must contain entries for either continuous_action or discrete_action."
)
return action_tuple

ml-agents/mlagents/trainers/policy/policy.py (17)


from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings

self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException("Trainers do not support mixed action spaces.")
self.act_size = (
list(self.behavior_spec.action_spec.discrete_branches)
if self.behavior_spec.action_spec.is_discrete()

) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]

)
def save_previous_action(
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
self, agent_ids: List[str], action_tuple: ActionTuple
if action_dict is None or "discrete_action" not in action_dict:
return
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = self.make_empty_previous_action(len(agent_ids))

ml-agents/mlagents/trainers/policy/tf_policy.py (24)


from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import DecisionSteps, ActionTuple, BehaviorSpec
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers import __version__

reparameterize,
condition_sigma_on_obs,
)
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException(
"TensorFlow does not support mixed action spaces. Please run with the Torch framework."
)
# for ghost trainer save/load snapshots
self.assign_phs: List[tf.Tensor] = []
self.assign_ops: List[tf.Operation] = []

self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
log_probs_tuple = LogProbsTuple()
if self.behavior_spec.action_spec.is_continuous():
log_probs_tuple.add_continuous(run_out["log_probs"])
else:
log_probs_tuple.add_discrete(run_out["log_probs"])
run_out["log_probs"] = log_probs_tuple
action_tuple = ActionTuple()
run_out["action"] = {"continuous_action": run_out["action"]}
action_tuple.add_continuous(run_out["action"])
run_out["action"] = {"discrete_action": run_out["action"]}
action_tuple.add_discrete(run_out["action"])
run_out["action"] = action_tuple
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
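
A sketch (not part of the diff) of the wrapping above: the TF policy still supports a single action space, so exactly one field of each tuple is filled via add_continuous or add_discrete while the other stays zero-width. The array shapes are illustrative.

import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

# Raw outputs of a purely continuous TF policy: 2 agents, 3 actions each.
raw_action = np.zeros((2, 3), dtype=np.float32)
raw_log_probs = np.zeros((2, 3), dtype=np.float32)

action_tuple = ActionTuple()
action_tuple.add_continuous(raw_action)

log_probs_tuple = LogProbsTuple()
log_probs_tuple.add_continuous(raw_log_probs)

run_out = {"action": action_tuple, "log_probs": log_probs_tuple}
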

ml-agents/mlagents/trainers/policy/torch_policy.py (54)


GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(

:param masks: Loss masks for RNN, else None.
:param memories: Input memories when using RNN, else None.
:param seq_len: Sequence length when using RNN.
:return: Tuple of actions, log probabilities (dependent on all_log_probs), entropies, and
output memories, all as Torch Tensors.
:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
)
else:
# If we're using LSTM, we need to execute the values to get the critic memories
dists, _, memories = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
action_list, dists
)
actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec, all_logs_list
actions, log_probs, entropies, _, memories = self.actor_critic.get_action_stats_and_value(
vec_obs, vis_obs, masks, memories, seq_len
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return (actions, log_probs, entropy_sum, memories)
return (actions, log_probs, entropies, memories)
def evaluate_actions(
self,

memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
dists, value_heads, _ = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
action_list = actions.to_tensor_list()
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
action_list, dists
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
vec_obs, vis_obs, actions, masks, memories, seq_len
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec
)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads
return log_probs, entropies, value_heads
@timed
def evaluate(

action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple
action_dict["continuous_action"] if self.use_continuous_act else None
action_tuple.continuous if self.use_continuous_act else None
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["log_probs"] = log_probs.to_log_probs_tuple()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6)


self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
else:
feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
class TorchPPOOptimizer(TorchOptimizer):

vis_obs.append(vis_ob)
else:
vis_obs = []
log_probs, entropy, values = self.policy.evaluate_actions(
vec_obs,
vis_obs,

ml-agents/mlagents/trainers/ppo/trainer.py (2)


behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=behavior_spec.action_spec.is_continuous(),
separate_critic=behavior_spec.action_spec.continuous_size > 0,
)
return policy

ml-agents/mlagents/trainers/sac/optimizer_torch.py (319)


import numpy as np
from typing import Dict, List, Mapping, cast, Tuple, Optional
from typing import Dict, List, Mapping, NamedTuple, cast, Tuple, Optional
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack

action_spec: ActionSpec,
):
super().__init__()
self.action_spec = action_spec
if self.action_spec.is_continuous():
self.act_size = self.action_spec.continuous_size
num_value_outs = 1
num_action_ins = self.act_size
num_value_outs = max(sum(action_spec.discrete_branches), 1)
num_action_ins = int(action_spec.continuous_size)
else:
self.act_size = self.action_spec.discrete_branches
num_value_outs = sum(self.act_size)
num_action_ins = 0
self.q1_network = ValueNetwork(
stream_names,
observation_shapes,

)
return q1_out, q2_out
class TargetEntropy(NamedTuple):
discrete: List[float] = [] # One per branch
continuous: float = 0.0
class LogEntCoef(nn.Module):
def __init__(self, discrete, continuous):
super().__init__()
self.discrete = discrete
self.continuous = continuous
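
A self-contained sketch (not the PR's implementation) of the pattern introduced above: TargetEntropy holds one target per discrete branch plus one for the continuous block, and LogEntCoef is an nn.Module so both coefficient tensors can be optimized together through a single call to .parameters(). The branch sizes, entropy scale, and learning rate below are illustrative.

from typing import List, NamedTuple
import numpy as np
import torch
from torch import nn

class TargetEntropy(NamedTuple):
    discrete: List[float] = []  # one target entropy per discrete branch
    continuous: float = 0.0

class LogEntCoef(nn.Module):
    def __init__(self, discrete: nn.Parameter, continuous: nn.Parameter):
        super().__init__()
        self.discrete = discrete
        self.continuous = continuous

init_entcoef = 1.0
discrete_branches = (3, 2)  # illustrative hybrid action spec
continuous_size = 2

log_ent_coef = LogEntCoef(
    discrete=nn.Parameter(
        torch.log(torch.as_tensor([init_entcoef] * len(discrete_branches)))
    ),
    continuous=nn.Parameter(
        torch.log(torch.as_tensor([init_entcoef] * continuous_size))
    ),
)
target_entropy = TargetEntropy(
    discrete=[0.2 * np.log(b).astype(np.float32) for b in discrete_branches],
    continuous=-1.0 * continuous_size,  # scale factor omitted for brevity
)
# Both coefficient blocks are updated by one optimizer.
entropy_optimizer = torch.optim.Adam(log_ent_coef.parameters(), lr=3e-4)
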
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
super().__init__(policy, trainer_params)
hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)

self.policy = policy
self.act_size = policy.act_size
policy_network_settings = policy.network_settings
self.tau = hyperparameters.tau

name: int(not self.reward_signals[name].ignore_done)
for name in self.stream_names
}
self._action_spec = self.policy.behavior_spec.action_spec
self.policy.behavior_spec.action_spec,
self._action_spec,
)
self.target_network = ValueNetwork(

self.policy.actor_critic.critic, self.target_network, 1.0
)
self._log_ent_coef = torch.nn.Parameter(
torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))),
# We create one entropy coefficient per action, whether discrete or continuous.
_disc_log_ent_coef = torch.nn.Parameter(
torch.log(
torch.as_tensor(
[self.init_entcoef] * len(self._action_spec.discrete_branches)
)
),
if self.policy.use_continuous_act:
self.target_entropy = torch.as_tensor(
-1
* self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
else:
self.target_entropy = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self.act_size
]
_cont_log_ent_coef = torch.nn.Parameter(
torch.log(
torch.as_tensor([self.init_entcoef] * self._action_spec.continuous_size)
),
requires_grad=True,
)
self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef
)
_cont_target = (
-1
* self.continuous_target_entropy_scale
* np.prod(self._action_spec.continuous_size).astype(np.float32)
)
_disc_target = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self._action_spec.discrete_branches
]
self.target_entropy = TorchSACOptimizer.TargetEntropy(
continuous=_cont_target, discrete=_disc_target
)
self.policy.actor_critic.distribution.parameters()
self.policy.actor_critic.action_model.parameters()
)
value_params = list(self.value_network.parameters()) + list(
self.policy.actor_critic.critic.parameters()

value_params, lr=hyperparameters.learning_rate
)
self.entropy_optimizer = torch.optim.Adam(
[self._log_ent_coef], lr=hyperparameters.learning_rate
self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate
)
self._move_to_device(default_device())

q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
for name in values.keys():
if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)
_branched_q2p = ModelUtils.break_into_branches(
q2p_out[name] * action_probs, self.act_size
)
_q1p_mean = torch.mean(
torch.stack(
[
torch.sum(_br, dim=1, keepdim=True)
for _br in _branched_q1p
]
),
dim=0,
)
_q2p_mean = torch.mean(
torch.stack(
[
torch.sum(_br, dim=1, keepdim=True)
for _br in _branched_q2p
]
),
dim=0,
)
_cont_ent_coef = self._log_ent_coef.continuous.exp()
_disc_ent_coef = self._log_ent_coef.discrete.exp()
for name in values.keys():
if self._action_spec.discrete_size <= 0:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
disc_action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
)
_branched_q2p = ModelUtils.break_into_branches(
q2p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
)
_q1p_mean = torch.mean(
torch.stack(
[torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p]
),
dim=0,
)
_q2p_mean = torch.mean(
torch.stack(
[torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p]
),
dim=0,
)
min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
if not discrete:
if self._action_spec.discrete_size <= 0:
_ent_coef * log_probs.continuous_tensor, dim=1
_cont_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

disc_log_probs = log_probs.all_discrete_tensor
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True)
for i, _lp in enumerate(branched_per_action_ent)
]
)

branched_ent_bonus, axis=0
)
# Add continuous entropy bonus to minimum Q
if self._action_spec.continuous_size > 0:
torch.sum(
_cont_ent_coef * log_probs.continuous_tensor,
dim=1,
keepdim=True,
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup.squeeze()),
loss_masks,

log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
_cont_ent_coef = _cont_ent_coef.exp()
_disc_ent_coef = _disc_ent_coef.exp()
if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
else:
action_probs = log_probs.all_discrete_tensor.exp()
batch_policy_loss = 0
if self._action_spec.discrete_size > 0:
disc_log_probs = log_probs.all_discrete_tensor
disc_action_probs = disc_log_probs.exp()
log_probs.all_discrete_tensor * action_probs, self.act_size
disc_log_probs * disc_action_probs, self._action_spec.discrete_branches
mean_q1 * action_probs, self.act_size
mean_q1 * disc_action_probs, self._action_spec.discrete_branches
torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp - _qt, dim=1, keepdim=False)
for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)

batch_policy_loss = torch.squeeze(branched_policy_loss)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
batch_policy_loss += torch.sum(branched_policy_loss, dim=1)
all_mean_q1 = torch.sum(disc_action_probs * mean_q1, dim=1)
else:
all_mean_q1 = mean_q1
if self._action_spec.continuous_size > 0:
cont_log_probs = log_probs.continuous_tensor
batch_policy_loss += torch.mean(
_cont_ent_coef * cont_log_probs - all_mean_q1.unsqueeze(1), dim=1
)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor
if not discrete:
with torch.no_grad():
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)
else:
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
entropy_loss = 0
if self._action_spec.discrete_size > 0:
# Break discrete log probs into separate branches
disc_log_probs = log_probs.all_discrete_tensor
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
branched_per_action_ent, self.target_entropy
branched_per_action_ent, self.target_entropy.discrete
)
],
axis=1,

)
entropy_loss = -1 * ModelUtils.masked_mean(
torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks
entropy_loss += -1 * ModelUtils.masked_mean(
torch.mean(_disc_ent_coef * target_current_diff, axis=1), loss_masks
)
if self._action_spec.continuous_size > 0:
with torch.no_grad():
cont_log_probs = log_probs.continuous_tensor
target_current_diff = torch.sum(
cont_log_probs + self.target_entropy.continuous, dim=1
)
# We update all the _cont_ent_coef as one block
entropy_loss += -1 * ModelUtils.masked_mean(
torch.mean(_cont_ent_coef) * target_current_diff, loss_masks
)
return entropy_loss

) -> Dict[str, torch.Tensor]:
condensed_q_output = {}
onehot_actions = ModelUtils.actions_to_onehot(discrete_actions, self.act_size)
onehot_actions = ModelUtils.actions_to_onehot(
discrete_actions, self._action_spec.discrete_branches
)
branched_q = ModelUtils.break_into_branches(item, self.act_size)
branched_q = ModelUtils.break_into_branches(
item, self._action_spec.discrete_branches
)
only_action_qs = torch.stack(
[
torch.sum(_act * _q, dim=1, keepdim=True)

value_estimates, _ = self.policy.actor_critic.critic_pass(
vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
)
if self.policy.use_continuous_act:
squeezed_actions = actions.continuous_tensor
# Only need grad for q1, as that is used for policy.
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
cont_sampled_actions = sampled_actions.continuous_tensor
cont_actions = actions.continuous_tensor
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
cont_sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
cont_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
if self._action_spec.discrete_size > 0:
disc_actions = actions.discrete_tensor
q1_stream = self._condense_q_streams(q1_out, disc_actions)
q2_stream = self._condense_q_streams(q2_out, disc_actions)
else:
else:
# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(

sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
use_discrete = not self.policy.use_continuous_act
dones = ModelUtils.list_to_tensor(batch["done"])
q1_loss, q2_loss = self.sac_q_loss(

log_probs, value_estimates, q1p_out, q2p_out, masks, use_discrete
log_probs, value_estimates, q1p_out, q2p_out, masks
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete)
entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete)
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
entropy_loss = self.sac_entropy_loss(log_probs, masks)
total_value_loss = q1_loss + q2_loss + value_loss

"Losses/Value Loss": value_loss.item(),
"Losses/Q1 Loss": q1_loss.item(),
"Losses/Q2 Loss": q2_loss.item(),
"Policy/Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef)).item(),
"Policy/Discrete Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.discrete)
).item(),
"Policy/Continuous Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.continuous)
).item(),
"Policy/Learning Rate": decay_lr,
}

ml-agents/mlagents/trainers/simple_env_manager.py (3)


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.set_actions(brain_name, action_info.action)
self.env.step()
all_step_result = self._generate_all_results()
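
A sketch (not part of the diff) of the simplified dispatch above: ActionInfo.action now carries an ActionTuple, so the manager forwards it straight to set_actions without the dict-to-tuple conversion removed here. FakeEnv is a hypothetical stand-in for the UnityEnvironment used by the real manager.

import numpy as np
from mlagents_envs.base_env import ActionTuple

class FakeEnv:
    # Hypothetical stand-in; the real manager calls UnityEnvironment.set_actions.
    def set_actions(self, behavior_name: str, action: ActionTuple) -> None:
        print(behavior_name, action.continuous.shape, action.discrete.shape)

env = FakeEnv()
all_actions = {
    "Brain?team=0": ActionTuple(continuous=np.zeros((2, 1), dtype=np.float32))
}
for brain_name, action in all_actions.items():
    env.set_actions(brain_name, action)
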

ml-agents/mlagents/trainers/subprocess_env_manager.py (7)


if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
if len(action_info.agent_ids) > 0:
env.set_actions(brain_name, action_info.action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

ml-agents/mlagents/trainers/tests/mock_brain.py (20)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)

steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
else:
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action = ActionTuple(
continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
)
action_probs = LogProbsTuple(
continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
)
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

ml-agents/mlagents/trainers/tests/simple_test_envs.py (49)


OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
STEP_SIZE = 0.2
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1

def __init__(
self,
brain_names,
use_discrete,
action_size=1,
action_sizes=(1, 0),
self.discrete = use_discrete
if use_discrete:
action_spec = ActionSpec.create_discrete(
tuple(2 for _ in range(action_size))
)
else:
action_spec = ActionSpec.create_continuous(action_size)
continuous_action_size, discrete_action_size = action_sizes
discrete_tuple = tuple(2 for _ in range(discrete_action_size))
action_spec = ActionSpec(continuous_action_size, discrete_tuple)
self.total_action_size = (
continuous_action_size + discrete_action_size
) # to set the goals/positions
self.action_spec = action_spec
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
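
A small sketch (not part of the diff) of the new action_sizes convention used by the test environments: a (continuous_size, discrete_size) pair, where every discrete branch gets two options, exactly as in the constructor above.

from mlagents_envs.base_env import ActionSpec

action_sizes = (1, 2)  # 1 continuous action, 2 discrete branches
continuous_action_size, discrete_action_size = action_sizes
discrete_tuple = tuple(2 for _ in range(discrete_action_size))
action_spec = ActionSpec(continuous_action_size, discrete_tuple)

assert action_spec.continuous_size == 1
assert action_spec.discrete_size == 2
assert action_spec.discrete_branches == (2, 2)
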

def _take_action(self, name: str) -> bool:
deltas = []
_act = self.action[name]
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

return done
def _generate_mask(self):
if self.discrete:
action_mask = None
if self.action_spec.discrete_size > 0:
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.array(
2 * self.action_spec.discrete_size * [False], dtype=np.bool
)
else:
action_mask = None
return action_mask
def _compute_reward(self, name: str, done: bool) -> float:

def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.positions[name] = [0.0 for _ in range(self.total_action_size)]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1

class MemoryEnvironment(SimpleEnvironment):
def __init__(self, brain_names, use_discrete, step_size=0.2):
super().__init__(brain_names, use_discrete, step_size=step_size)
def __init__(self, brain_names, action_sizes=(1, 0), step_size=0.2):
super().__init__(brain_names, action_sizes=action_sizes, step_size=step_size)
# Number of steps to reveal the goal for. Lower is harder. Should be
# less than 1/step_size to force agent to use memory
self.num_show_steps = 2

def __init__(
self,
brain_names,
use_discrete,
action_sizes=(1, 0),
use_discrete,
action_sizes=action_sizes,
)
self.demonstration_protos: Dict[str, List[AgentInfoActionPairProto]] = {}
self.n_demos = n_demos

def step(self) -> None:
super().step()
for name in self.names:
if self.discrete:
if self.action_spec.discrete_size > 0:
action = self.action[name].discrete
else:
action = self.action[name].continuous

self.reset()
for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
if self.action_spec.discrete_size > 0:
self.action[name] = ActionTuple(
np.array([], dtype=np.float32),
np.array(

ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66)


dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

use_visual=False,
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if use_discrete:
n_agents = len(buffer["discrete_log_probs"])
buffer["discrete_log_probs"].reset_field()
for _ in range(n_agents):
buffer["discrete_log_probs"].append(
np.ones(
int(sum(mock_behavior_spec.action_spec.discrete_branches)),
dtype=np.float32,
)
)
else:
n_agents = len(buffer["continuous_log_probs"])
buffer["continuous_log_probs"].reset_field()
for _ in range(n_agents):
buffer["continuous_log_probs"].append(
np.ones(
mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
)
)
trainer.update_buffer = buffer
trainer._update_policy()

ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (114)


assert all(reward > success_threshold for reward in processed_rewards)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(
PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_ppo(num_visual, use_discrete):
def test_visual_ppo(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_ppo(action_sizes):
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
PPO_TF_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(
SAC_TF_CONFIG,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_sac(num_visual, use_discrete):
def test_visual_sac(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
max_steps=200,
framework=FrameworkType.TENSORFLOW,
)
# The number of steps is pretty small for these encoders

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.2 if use_discrete else 0.5
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_sac(action_sizes):
step_size = 0.2 if action_sizes == (0, 1) else 0.5
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
[BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TF_CONFIG.network_settings,

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.

)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,

_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
def record_demo(action_sizes, num_visual=0, num_vector=1):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,

env.solve()
continuous_size, discrete_size = action_sizes
use_discrete = True if discrete_size > 0 else False
agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
brain_param_proto = BrainParametersProto(

return record_demo
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
def test_gail(simple_record, action_sizes, trainer_config):
demo_path = simple_record(action_sizes)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_ppo(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_sac(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)

ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2)


behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
policy_eval_out = {
"action": {"continuous_action": np.array([1.0], dtype=np.float32)},
"action": np.array([[1.0]], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

ml-agents/mlagents/trainers/tests/test_agent_processor.py (27)


AgentManagerQueue,
)
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import ActionSpec
from mlagents_envs.base_env import ActionSpec, ActionTuple
def create_mock_policy():

)
fake_action_outputs = {
"action": {"continuous_action": [0.1, 0.1]},
"action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
"log_probs": {"continuous_log_probs": [0.1, 0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

fake_action_info = ActionInfo(
action={"continuous_action": [0.1, 0.1]},
action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": {"continuous_action": [0.1]},
"action": ActionTuple(continuous=np.array([[0.1]])),
"log_probs": {"continuous_log_probs": [0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

done=True,
)
fake_action_info = ActionInfo(
action={"continuous_action": [0.1]},
action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], {"continuous_action": [0.1]})
mock.call([get_global_agent_id(_ep, 0)], fake_action_outputs["action"])
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": {"continuous_action": [0.1]},
"action": ActionTuple(continuous=np.array([[0.1]])),
"log_probs": {"continuous_log_probs": [0.1]},
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

action={"continuous_action": [0.1]},
action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)


@pytest.mark.parametrize("num_envs", [1, 4])
def test_subprocess_env_endtoend(num_envs):
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
env = SimpleEnvironment(["1D"], action_sizes=(0, 1))
return env
env_manager = SubprocessEnvManager(

ml-agents/mlagents/trainers/tests/test_trajectory.py (4)


"done",
"actions_pre",
"continuous_action",
"action_probs",
"discrete_action",
"continuous_log_probs",
"discrete_log_probs",
"action_mask",
"prev_action",
"environment_rewards",

ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2)


optimizer = torch.optim.Adam(gauss_dist.parameters(), lr=3e-3)
for _ in range(50):
dist_inst = gauss_dist(sample_embedding)[0]
dist_inst = gauss_dist(sample_embedding)
if tanh_squash:
assert isinstance(dist_inst, TanhGaussianDistInstance)
else:

ml-agents/mlagents/trainers/tests/torch/test_networks.py (78)


from mlagents.trainers.torch.networks import (
NetworkBody,
ValueNetwork,
SimpleActor,
from mlagents.trainers.torch.distributions import (
GaussianDistInstance,
CategoricalDistInstance,
)
from mlagents_envs.base_env import ActionSpec

assert _out[0] == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_actor(use_discrete):
obs_size = 4
network_settings = NetworkSettings()
obs_shapes = [(obs_size,)]
act_size = [2]
if use_discrete:
masks = torch.ones((1, 1))
action_spec = ActionSpec.create_discrete(tuple(act_size))
else:
masks = None
action_spec = ActionSpec.create_continuous(act_size[0])
actor = SimpleActor(obs_shapes, network_settings, action_spec)
# Test get_dist
sample_obs = torch.ones((1, obs_size))
dists, _ = actor.get_dists([sample_obs], [], masks=masks)
for dist in dists:
if use_discrete:
assert isinstance(dist, CategoricalDistInstance)
else:
assert isinstance(dist, GaussianDistInstance)
# Test sample_actions
actions = actor.sample_action(dists)
for act in actions:
if use_discrete:
assert act.shape == (1, 1)
else:
assert act.shape == (1, act_size[0])
# Test forward
actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
[sample_obs], [], masks=masks
)
for act in actions:
# This is different from above for ONNX export
if use_discrete:
assert act.shape == tuple(act_size)
else:
assert act.shape == (act_size[0], 1)
assert mem_size == 0
assert is_cont == int(not use_discrete)
assert act_size_vec == torch.tensor(act_size)
@pytest.mark.parametrize("ac_type", [SharedActorCritic, SeparateActorCritic])
@pytest.mark.parametrize("lstm", [True, False])
def test_actor_critic(ac_type, lstm):

)
obs_shapes = [(obs_size,)]
act_size = [2]
act_size = 2
mask = torch.ones([1, act_size * 2])
action_spec = ActionSpec.create_continuous(act_size[0])
# action_spec = ActionSpec.create_continuous(act_size[0])
action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))

else:
assert value_out[stream].shape == (1,)
# Test get_dist_and_value
dists, value_out, mem_out = actor.get_dist_and_value(
[sample_obs], [], memories=memories
# Test get action stats and_value
action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
[sample_obs], [], memories=memories, masks=mask
if lstm:
assert action.continuous_tensor.shape == (64, 2)
else:
assert action.continuous_tensor.shape == (1, 2)
assert len(action.discrete_list) == 2
for _disc in action.discrete_list:
if lstm:
assert _disc.shape == (64, 1)
else:
assert _disc.shape == (1, 1)
for dist in dists:
assert isinstance(dist, GaussianDistInstance)
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)

13
ml-agents/mlagents/trainers/tests/torch/test_policy.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"]["discrete_action"].shape == (
NUM_AGENTS,
len(DISCRETE_ACTION_SPACE),
)
run_out["action"].discrete.shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
assert run_out["action"]["continuous_action"].shape == (
NUM_AGENTS,
VECTOR_ACTION_SPACE,
)
assert run_out["action"].continuous.shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

28
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

3
ml-agents/mlagents/trainers/tests/torch/test_sac.py


"Losses/Value Loss",
"Losses/Q1 Loss",
"Losses/Q2 Loss",
"Policy/Entropy Coeff",
"Policy/Continuous Entropy Coeff",
"Policy/Discrete Entropy Coeff",
"Policy/Learning Rate",
]
for stat in required_stats:

118
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py


SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)

check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_ppo(num_visual, use_discrete):
def test_visual_ppo(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_ppo(action_sizes):
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
PPO_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=6000
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_sac(num_visual, use_discrete):
def test_visual_sac(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.2 if use_discrete else 0.5
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_sac(action_sizes):
step_size = 0.2 if action_sizes == (0, 1) else 0.5
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
[BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings,

check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.

)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,

check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
def record_demo(action_sizes, num_visual=0, num_vector=1):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,

agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
brain_param_proto = BrainParametersProto(
vector_action_size=[2] if use_discrete else [1],
vector_action_size=[2] if action_sizes else [1],
vector_action_space_type=discrete if use_discrete else continuous,
vector_action_space_type=discrete if action_sizes else continuous,
action_type = "Discrete" if use_discrete else "Continuous"
action_type = "Discrete" if action_sizes else "Continuous"
demo_path_name = "1DTest" + action_type + ".demo"
demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)

@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
def test_gail(simple_record, action_sizes, trainer_config):
demo_path = simple_record(action_sizes)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_ppo(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_sac(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
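A note on the parameterization used throughout these tests: the old use_discrete flag is replaced by an action_sizes tuple, which from the substitutions above appears to read as (continuous action size, number of discrete branches). A minimal sketch, assuming the test helpers from this branch are importable:

from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment

# (continuous size, discrete branches): the two cases the old use_discrete flag covered, plus a hybrid one.
continuous_env = SimpleEnvironment(["1D"], action_sizes=(1, 0))  # 1 continuous action, no discrete
discrete_env = SimpleEnvironment(["1D"], action_sizes=(0, 1))    # no continuous, 1 discrete branch
hybrid_env = SimpleEnvironment(["1D"], action_sizes=(1, 1))      # one of each, as exercised in test_hybrid.py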

47
ml-agents/mlagents/trainers/tests/torch/test_utils.py


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.torch.distributions import (
CategoricalDistInstance,
GaussianDistInstance,
)
def test_min_visual_size():

]
for res, exp in zip(oh_actions, expected_result):
assert torch.equal(res, exp)
def test_get_probs_and_entropy():
# Test continuous
# Add two dists to the list. This isn't done in the code but we'd like to support it.
dist_list = [
GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
]
action_list = [torch.zeros((1, 2)), torch.zeros((1, 2))]
log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
for lp in log_probs:
assert lp.shape == (1, 2)
assert entropies.shape == (1, 2, 2)
assert all_probs == []
for log_prob in log_probs:
# Log prob of standard normal at 0
for lp in log_prob.flatten():
assert lp == pytest.approx(-0.919, abs=0.01)
for ent in entropies.flatten():
# entropy of standard normal at 0
assert ent == pytest.approx(1.42, abs=0.01)
# Test discrete
# Add two dists to the list.
act_size = 2
test_prob = torch.tensor(
[[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)]
) # High prob for first action
dist_list = [CategoricalDistInstance(test_prob), CategoricalDistInstance(test_prob)]
action_list = [torch.tensor([0]), torch.tensor([1])]
log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
for all_prob in all_probs:
assert all_prob.shape == (1, act_size)
assert entropies.shape == (1, len(dist_list))
# Make sure the first action has a higher probability than the others.
assert log_probs[0] > log_probs[1]
def test_masked_mean():

35
ml-agents/mlagents/trainers/torch/components/bc/module.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
class BCModule:

log_probs: ActionLogProbs,
expert_actions: torch.Tensor,
) -> torch.Tensor:
if self.policy.use_continuous_act:
bc_loss = torch.nn.functional.mse_loss(
selected_actions.continuous_tensor, expert_actions
bc_loss = 0
if self.policy.behavior_spec.action_spec.continuous_size > 0:
bc_loss += torch.nn.functional.mse_loss(
selected_actions.continuous_tensor, expert_actions.continuous_tensor
else:
if self.policy.behavior_spec.action_spec.discrete_size > 0:
one_hot_expert_actions = ModelUtils.actions_to_onehot(
expert_actions.discrete_tensor,
self.policy.behavior_spec.action_spec.discrete_branches,
)
bc_loss = torch.mean(
bc_loss += torch.mean(
torch.stack(
[
torch.sum(

)
for log_prob_branch, expert_actions_branch in zip(
log_prob_branches, expert_actions
log_prob_branches, one_hot_expert_actions
)
]
)

"""
vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(
mini_batch_demo["continuous_action"]
)
else:
raw_expert_actions = ModelUtils.list_to_tensor(
mini_batch_demo["discrete_action"], dtype=torch.long
)
expert_actions = ModelUtils.actions_to_onehot(
raw_expert_actions, self.policy.act_size
)
expert_actions = AgentAction.from_dict(mini_batch_demo)
if self.policy.behavior_spec.action_spec.discrete_size > 0:
act_masks = ModelUtils.list_to_tensor(
np.ones(
(

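The hunk above extends the behavioral-cloning loss to hybrid action spaces: a mean-squared error on the continuous part plus a per-branch cross-entropy term computed from the policy's discrete log probs against the one-hot expert actions. A minimal plain-PyTorch sketch of that combined loss (standalone toy tensors, not the BCModule code itself):

import torch
import torch.nn.functional as F

# Toy batch: 4 demos, 2 continuous dims, one discrete branch with 3 options (illustrative only).
pred_continuous = torch.zeros((4, 2), requires_grad=True)
expert_continuous = torch.randn(4, 2)
branch_logits = torch.randn(4, 3, requires_grad=True)
branch_log_probs = torch.log_softmax(branch_logits, dim=1)        # policy log probs for the branch
expert_one_hot = F.one_hot(torch.tensor([0, 2, 1, 0]), num_classes=3).float()

bc_loss = F.mse_loss(pred_continuous, expert_continuous)          # continuous part
bc_loss = bc_loss + torch.mean(                                   # discrete part: NLL of the expert choice
    -torch.sum(branch_log_probs * expert_one_hot, dim=1)
)
bc_loss.backward()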
75
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


import numpy as np
from typing import Dict
from typing import Dict, NamedTuple
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer

from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener
from mlagents.trainers.torch.utils import ModelUtils
class ActionPredictionTuple(NamedTuple):
continuous: torch.Tensor
discrete: torch.Tensor
class CuriosityRewardProvider(BaseRewardProvider):

specs.observation_shapes, state_encoder_settings
)
self._action_flattener = ModelUtils.ActionFlattener(self._action_spec)
self._action_flattener = ActionFlattener(self._action_spec)
self.inverse_model_action_prediction = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256),
linear_layer(256, self._action_flattener.flattened_size),
self.inverse_model_action_encoding = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256)
if self._action_spec.continuous_size > 0:
self.continuous_action_prediction = linear_layer(
256, self._action_spec.continuous_size
)
if self._action_spec.discrete_size > 0:
self.discrete_action_prediction = linear_layer(
256, sum(self._action_spec.discrete_branches)
)
self.forward_model_next_state_prediction = torch.nn.Sequential(
LinearEncoder(
settings.encoding_size + self._action_flattener.flattened_size, 1, 256

)
return hidden
def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor:
def predict_action(self, mini_batch: AgentBuffer) -> ActionPredictionTuple:
"""
In the continuous case, returns the predicted action.
In the discrete case, returns the logits.

)
hidden = self.inverse_model_action_prediction(inverse_model_input)
if self._action_spec.is_continuous():
return hidden
else:
continuous_pred = None
discrete_pred = None
hidden = self.inverse_model_action_encoding(inverse_model_input)
if self._action_spec.continuous_size > 0:
continuous_pred = self.continuous_action_prediction(hidden)
if self._action_spec.discrete_size > 0:
raw_discrete_pred = self.discrete_action_prediction(hidden)
hidden, self._action_spec.discrete_branches
raw_discrete_pred, self._action_spec.discrete_branches
return torch.cat(branches, dim=1)
discrete_pred = torch.cat(branches, dim=1)
return ActionPredictionTuple(continuous_pred, discrete_pred)
def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
"""

actions = AgentAction.from_dict(mini_batch)
if self._action_spec.is_continuous():
action = actions.continuous_tensor
else:
action = torch.cat(
ModelUtils.actions_to_onehot(
actions.discrete_tensor, self._action_spec.discrete_branches
),
dim=1,
)
flattened_action = self._action_flattener.forward(actions)
(self.get_current_state(mini_batch), action), dim=1
(self.get_current_state(mini_batch), flattened_action), dim=1
)
return self.forward_model_next_state_prediction(forward_model_input)

"""
predicted_action = self.predict_action(mini_batch)
actions = AgentAction.from_dict(mini_batch)
if self._action_spec.is_continuous():
sq_difference = (actions.continuous_tensor - predicted_action) ** 2
_inverse_loss = 0
if self._action_spec.continuous_size > 0:
sq_difference = (
actions.continuous_tensor - predicted_action.continuous
) ** 2
return torch.mean(
_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
sq_difference,
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),

else:
if self._action_spec.discrete_size > 0:
true_action = torch.cat(
ModelUtils.actions_to_onehot(
actions.discrete_tensor, self._action_spec.discrete_branches

cross_entropy = torch.sum(
-torch.log(predicted_action + self.EPSILON) * true_action, dim=1
-torch.log(predicted_action.discrete + self.EPSILON) * true_action,
dim=1,
return torch.mean(
_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
cross_entropy,
ModelUtils.list_to_tensor(

)[1]
)
return _inverse_loss
def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor:
"""

6
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


)
from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType

vis_encode_type=EncoderType.SIMPLE,
memory=None,
)
self._action_flattener = ModelUtils.ActionFlattener(specs.action_spec)
self._action_flattener = ActionFlattener(specs.action_spec)
unencoded_size = (
self._action_flattener.flattened_size + 1 if settings.use_actions else 0
) # +1 is for dones

19
ml-agents/mlagents/trainers/torch/distributions.py


"""
pass
@abc.abstractmethod
def exported_model_output(self) -> torch.Tensor:
"""
Returns the tensor to be exported to ONNX for the distribution
"""
pass
class DiscreteDistInstance(DistInstance):
@abc.abstractmethod

def entropy(self):
return 0.5 * torch.log(2 * math.pi * math.e * self.std + EPSILON)
def exported_model_output(self):
return self.sample()
class TanhGaussianDistInstance(GaussianDistInstance):

return torch.log(self.probs)
def entropy(self):
return -torch.sum(self.probs * torch.log(self.probs), dim=-1)
return -torch.sum(self.probs * torch.log(self.probs), dim=-1).unsqueeze(-1)
def exported_model_output(self):
return self.all_log_prob()
class GaussianDistribution(nn.Module):

# verified version of Barracuda (1.0.2).
log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
if self.tanh_squash:
return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
return [GaussianDistInstance(mu, torch.exp(log_sigma))]
return GaussianDistInstance(mu, torch.exp(log_sigma))
class MultiCategoricalDistribution(nn.Module):

183
ml-agents/mlagents/trainers/torch/networks.py


from mlagents.torch_utils import torch, nn
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
DistInstance,
)
from mlagents.trainers.torch.action_model import ActionModel
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads

else 0
)
self.visual_processors, self.vector_processors, encoder_input_size = ModelUtils.create_input_processors(
(
self.visual_processors,
self.vector_processors,
encoder_input_size,
) = ModelUtils.create_input_processors(
observation_shapes,
self.h_size,
network_settings.vis_encode_type,

pass
@abc.abstractmethod
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
"""
Takes a List of Distribution instances and samples an action from each.
"""
pass
@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
"""
Returns distributions from this Actor, from which actions can be sampled.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.
:return: A Tuple of a List of action distribution instances, and memories.
Memories will be None if not using memory.
"""
pass
@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],

pass
@abc.abstractmethod
def get_dist_and_value(
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],

) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
"""
Returns distributions, from which actions can be sampled, and value estimates.
If memory is enabled, return the memories as well.

:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.
:return: A Tuple of a List of action distribution instances, a Dict of reward signal
:return: A Tuple of AgentAction, ActionLogProbs, entropies, Dict of reward signal
name to value estimate, and memories. Memories will be None if not using memory.
"""
pass

else:
self.encoding_size = network_settings.hidden_units
if self.action_spec.is_continuous():
self.distribution = GaussianDistribution(
self.encoding_size,
self.action_spec.continuous_size,
conditional_sigma=conditional_sigma,
tanh_squash=tanh_squash,
)
else:
self.distribution = MultiCategoricalDistribution(
self.encoding_size, self.action_spec.discrete_branches
)
self.action_model = ActionModel(
self.encoding_size,
action_spec,
conditional_sigma=conditional_sigma,
tanh_squash=tanh_squash,
)
@property
def memory_size(self) -> int:

self.network_body.update_normalization(vector_obs)
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
actions = []
for action_dist in dists:
action = action_dist.sample()
actions.append(action)
return actions
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)
else:
dists = self.distribution(encoding, masks)
return dists, memories
def forward(
self,
vec_inputs: List[torch.Tensor],

"""
Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
"""
dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
if self.action_spec.is_continuous():
action_list = self.sample_action(dists)
action_out = torch.stack(action_list, dim=-1)
else:
action_out = torch.cat([dist.all_log_prob() for dist in dists], dim=1)
encoding, memories_out = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=1
)
# TODO: How this is written depends on how the inference model is structured
action_out = self.action_model.get_action_out(encoding, masks)
return (
action_out,
self.version_number,

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,
network_settings,

)
return self.value_heads(encoding), memories_out
def get_dist_and_value(
def get_stats_and_value(
actions: AgentAction,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
if self.action_spec.is_continuous():
dists = self.distribution(encoding)
else:
dists = self.distribution(encoding, masks=masks)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
return dists, value_outputs, memories
return action, log_probs, entropies, value_outputs, memories
class SeparateActorCritic(SimpleActor, ActorCritic):

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
# Give the Actor only half the memories. Note we previously validated
# that memory_size must be a multiple of 4.
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,

memories_out = None
return value_outputs, memories_out
def get_dist_and_value(
def get_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
else:
critic_mem = None
actor_mem = None
encoding, actor_mem_outs = self.network_body(
vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],

) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)

dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
memories=actor_mem,
sequence_length=sequence_length,
masks=masks,
encoding, actor_mem_outs = self.network_body(
vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)

mem_out = None
return dists, value_outputs, mem_out
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
super().update_normalization(vector_obs)
self.critic.network_body.update_normalization(vector_obs)
return action, log_probs, entropies, value_outputs, mem_out
class GlobalSteps(nn.Module):

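The networks.py changes above replace get_dists/get_dist_and_value with get_action_stats_and_value, which returns the sampled AgentAction, its ActionLogProbs, the summed entropy, the value estimates, and memories. A hedged sketch mirroring test_networks.py (non-recurrent, continuous-only case; names as used on this branch):

import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.networks import SeparateActorCritic

obs_size, act_size = 4, 2
actor = SeparateActorCritic(
    [(obs_size,)],                           # observation shapes
    NetworkSettings(),                       # default, non-recurrent settings
    ActionSpec.create_continuous(act_size),
    ["extrinsic"],                           # value stream names
)
sample_obs = torch.ones((1, obs_size))
action, log_probs, entropies, values, _ = actor.get_action_stats_and_value([sample_obs], [])
print(action.continuous_tensor.shape)        # torch.Size([1, 2])
print(values["extrinsic"].shape)             # torch.Size([1]) for the non-LSTM case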
240
ml-agents/mlagents/trainers/torch/utils.py


from typing import List, Optional, Tuple, NamedTuple, Dict
from typing import List, Optional, Tuple
from mlagents.torch_utils import torch, nn
import numpy as np

)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance
class AgentAction(NamedTuple):
"""
A NamedTuple containing the tensor for continuous actions and list of tensors for
discrete actions. Utility functions provide numpy <=> tensor conversions to be
sent as actions to the environment manager as well as used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to continuous actions
:param discrete_list: List of Torch tensors each corresponding to discrete actions
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
@property
def discrete_tensor(self):
"""
Returns the discrete action list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous action
and an entry corresponding to the discrete action. "continuous_action" and
"discrete_action" are added to the agents buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_action"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_action"] = ModelUtils.to_numpy(
self.discrete_tensor[:, 0, :]
)
return array_dict
def to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the AgentAction as a flat List of torch Tensors. This will be removed
when the ActionModel is merged.
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list += (
self.discrete_list
) # Note this is different for ActionLogProbs
return tensor_list
@staticmethod
def create(
tensor_list: List[torch.Tensor], action_spec: ActionSpec
) -> "AgentAction":
"""
A static method that converts a list of torch Tensors into an AgentAction using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = tensor_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = tensor_list[_offset:]
return AgentAction(continuous, discrete)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff["discrete_action"], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return AgentAction(continuous, discrete)
class ActionLogProbs(NamedTuple):
"""
A NamedTuple containing the tensor for continuous log probs and list of tensors for
discrete log probs of individual actions as well as all the log probs for an entire branch.
Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
:param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were
sampled.
:param all_discrete_list: List of Torch tensors each corresponding to all log probs of
a discrete action branch, even the discrete actions that were not sampled. all_discrete_list is a list of Tensors,
each corresponding to the log probabilities of one discrete branch.
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
all_discrete_list: Optional[List[torch.Tensor]]
@property
def discrete_tensor(self):
"""
Returns the discrete log probs list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
@property
def all_discrete_tensor(self):
"""
Returns the discrete log probs of each branch as a tensor
"""
return torch.cat(self.all_discrete_list, dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous log probs
and an entry corresponding to the discrete log probs. "continuous_log_probs" and
"discrete_log_probs" are added to the agents buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
return array_dict
def _to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This
is private and serves as a utility for self.flatten()
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(
self.discrete_tensor
) # Note this is different for AgentActions
return tensor_list
def flatten(self) -> torch.Tensor:
"""
A utility method that returns all log probs in ActionLogProbs as a flattened tensor.
This is useful for algorithms like PPO which can treat all log probs in the same way.
"""
return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def create(
log_prob_list: List[torch.Tensor],
action_spec: ActionSpec,
all_log_prob_list: List[torch.Tensor] = None,
) -> "ActionLogProbs":
"""
A static method that converts a list of torch Tensors into an ActionLogProbs using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = log_prob_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = log_prob_list[_offset:]
return ActionLogProbs(continuous, discrete, all_log_prob_list)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
class ModelUtils:

EncoderType.NATURE_CNN: 36,
EncoderType.RESNET: 15,
}
class ActionFlattener:
def __init__(self, action_spec: ActionSpec):
self._specs = action_spec
@property
def flattened_size(self) -> int:
if self._specs.is_continuous():
return self._specs.continuous_size
else:
return sum(self._specs.discrete_branches)
def forward(self, action: AgentAction) -> torch.Tensor:
if self._specs.is_continuous():
return action.continuous_tensor
else:
return torch.cat(
ModelUtils.actions_to_onehot(
torch.as_tensor(action.discrete_tensor, dtype=torch.long),
self._specs.discrete_branches,
),
dim=1,
)
@staticmethod
def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None:

for i in range(num_partitions):
res += [data[(partitions == i).nonzero().squeeze(1)]]
return res
@staticmethod
def get_probs_and_entropy(
action_list: List[torch.Tensor], dists: List[DistInstance]
) -> Tuple[List[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]:
log_probs_list = []
all_probs_list = []
entropies_list = []
for action, action_dist in zip(action_list, dists):
log_prob = action_dist.log_prob(action)
log_probs_list.append(log_prob)
entropies_list.append(action_dist.entropy())
if isinstance(action_dist, DiscreteDistInstance):
all_probs_list.append(action_dist.all_log_prob())
entropies = torch.stack(entropies_list, dim=-1)
if not all_probs_list:
entropies = entropies.squeeze(-1)
return log_probs_list, entropies, all_probs_list
@staticmethod
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:

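As documented above, AgentAction and ActionLogProbs are reconstructed from the flat buffer keys "continuous_action"/"discrete_action" and "continuous_log_probs"/"discrete_log_probs"; these classes now live in their own modules (agent_action.py and action_log_probs.py, shown later in this diff). A small sketch of the round trip, assuming this branch of ml-agents is installed and using a hypothetical buffer slice:

import numpy as np
from mlagents.trainers.torch.agent_action import AgentAction

# Hypothetical flat buffer slice: 3 agents, 2 continuous dims, 2 discrete branches.
buffer = {
    "continuous_action": np.zeros((3, 2), dtype=np.float32),
    "discrete_action": np.zeros((3, 2), dtype=np.int64),
}
action = AgentAction.from_dict(buffer)
print(action.continuous_tensor.shape)    # torch.Size([3, 2])
print(len(action.discrete_list))         # 2 tensors, one per branch, each of shape (3,)
print(action.discrete_tensor.shape)      # torch.Size([3, 2]) after stacking the branches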
25
ml-agents/mlagents/trainers/trajectory.py


from typing import List, NamedTuple, Dict
from typing import List, NamedTuple
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
class AgentExperience(NamedTuple):

action: Dict[str, np.ndarray]
action_probs: Dict[str, np.ndarray]
action: ActionTuple
action_probs: LogProbsTuple
action_pre: np.ndarray # TODO: Remove this
action_mask: np.ndarray
prev_action: np.ndarray

agent_buffer_trajectory["actions_pre"].append(exp.action_pre)
# Adds the log prob and action of continuous/discrete separately
for act_type, act_array in exp.action.items():
agent_buffer_trajectory[act_type].append(act_array)
for log_type, log_array in exp.action_probs.items():
agent_buffer_trajectory[log_type].append(log_array)
agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
agent_buffer_trajectory["continuous_log_probs"].append(
exp.action_probs.continuous
)
agent_buffer_trajectory["discrete_log_probs"].append(
exp.action_probs.discrete
)
# Store action masks if necessary. Note that 1 means active, while
# in AgentExperience False means active.

# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
if "discrete_action" in exp.action:
action_shape = exp.action["discrete_action"].shape
else:
action_shape = exp.action["continuous_action"].shape
action_shape = exp.action.discrete.shape
agent_buffer_trajectory["action_mask"].append(
np.ones(action_shape, dtype=np.float32), padding_value=1
)

81
ml-agents/mlagents/trainers/tests/torch/test_action_model.py


import pytest
from mlagents.torch_utils import torch
from mlagents.trainers.torch.action_model import ActionModel, DistInstances
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.distributions import (
GaussianDistInstance,
CategoricalDistInstance,
)
from mlagents_envs.base_env import ActionSpec
def create_action_model(inp_size, act_size):
mask = torch.ones([1, act_size * 2])
action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
action_model = ActionModel(inp_size, action_spec)
return action_model, mask
def test_get_dists():
inp_size = 4
act_size = 2
action_model, masks = create_action_model(inp_size, act_size)
sample_inp = torch.ones((1, inp_size))
dists = action_model._get_dists(sample_inp, masks=masks)
assert isinstance(dists.continuous, GaussianDistInstance)
assert len(dists.discrete) == 2
for _dist in dists.discrete:
assert isinstance(_dist, CategoricalDistInstance)
def test_sample_action():
inp_size = 4
act_size = 2
action_model, masks = create_action_model(inp_size, act_size)
sample_inp = torch.ones((1, inp_size))
dists = action_model._get_dists(sample_inp, masks=masks)
agent_action = action_model._sample_action(dists)
assert agent_action.continuous_tensor.shape == (1, 2)
assert len(agent_action.discrete_list) == 2
for _disc in agent_action.discrete_list:
assert _disc.shape == (1, 1)
def test_get_probs_and_entropy():
inp_size = 4
act_size = 2
action_model, masks = create_action_model(inp_size, act_size)
_continuous_dist = GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2)))
act_size = 2
test_prob = torch.tensor([[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)])
_discrete_dist_list = [
CategoricalDistInstance(test_prob),
CategoricalDistInstance(test_prob),
]
dist_tuple = DistInstances(_continuous_dist, _discrete_dist_list)
agent_action = AgentAction(
torch.zeros((1, 2)), [torch.tensor([0]), torch.tensor([1])]
)
log_probs, entropies = action_model._get_probs_and_entropy(agent_action, dist_tuple)
assert log_probs.continuous_tensor.shape == (1, 2)
assert len(log_probs.discrete_list) == 2
for _disc in log_probs.discrete_list:
assert _disc.shape == (1,)
assert len(log_probs.all_discrete_list) == 2
for _disc in log_probs.all_discrete_list:
assert _disc.shape == (1, 2)
for clp in log_probs.continuous_tensor[0]:
# Log prob of standard normal at 0
assert clp == pytest.approx(-0.919, abs=0.01)
assert log_probs.discrete_list[0] > log_probs.discrete_list[1]
for ent, val in zip(entropies[0], [1.4189, 1.4189, 0.6191, 0.6191]):
assert ent == pytest.approx(val, abs=0.01)

122
ml-agents/mlagents/trainers/tests/torch/test_hybrid.py


import attr
import pytest
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,
MemoryEnvironment,
)
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains
BRAIN_NAME = "1D"
PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
)
config = attr.evolve(
PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=10000,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("num_visual", [1, 2])
def test_hybrid_visual_ppo(num_visual):
env = SimpleEnvironment(
[BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})
def test_hybrid_recurrent_ppo():
env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
new_network_settings = attr.evolve(
PPO_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=512,
)
config = attr.evolve(
PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=3000,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_sac(action_size):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
buffer_size=50000,
batch_size=256,
buffer_init_steps=2000,
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=5000
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("num_visual", [1, 2])
def test_hybrid_visual_sac(num_visual):
env = SimpleEnvironment(
[BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
buffer_size=50000,
batch_size=128,
learning_rate=3.0e-4,
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=3000
)
check_environment_trains(env, {BRAIN_NAME: config})
def test_hybrid_recurrent_sac():
env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
batch_size=256,
learning_rate=1e-3,
buffer_init_steps=1000,
steps_per_update=2,
)
config = attr.evolve(
SAC_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=2000,
)
check_environment_trains(env, {BRAIN_NAME: config})

44
ml-agents/mlagents/trainers/torch/action_flattener.py


from typing import List
from mlagents.torch_utils import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.utils import ModelUtils
class ActionFlattener:
def __init__(self, action_spec: ActionSpec):
"""
A torch module that creates the flattened form of an AgentAction object.
The flattened form is the continuous action concatenated with the
concatenated one hot encodings of the discrete actions.
:param action_spec: An ActionSpec that describes the action space dimensions
"""
self._specs = action_spec
@property
def flattened_size(self) -> int:
"""
The flattened size is the continuous size plus the sum of the branch sizes
since discrete actions are encoded as one hots.
"""
return self._specs.continuous_size + sum(self._specs.discrete_branches)
def forward(self, action: AgentAction) -> torch.Tensor:
"""
Returns a tensor corresponding to the flattened action
:param action: An AgentAction object
"""
action_list: List[torch.Tensor] = []
if self._specs.continuous_size > 0:
action_list.append(action.continuous_tensor)
if self._specs.discrete_size > 0:
flat_discrete = torch.cat(
ModelUtils.actions_to_onehot(
torch.as_tensor(action.discrete_tensor, dtype=torch.long),
self._specs.discrete_branches,
),
dim=1,
)
action_list.append(flat_discrete)
return torch.cat(action_list, dim=1)
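The flattener above concatenates the continuous action with the one-hot encodings of each discrete branch, which is what the curiosity and GAIL reward providers feed to their encoders. A short usage sketch based on the constructor and forward shown above (toy values):

import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener

spec = ActionSpec(2, (3, 2))              # 2 continuous dims, discrete branches of size 3 and 2
flattener = ActionFlattener(spec)
print(flattener.flattened_size)           # 2 + 3 + 2 = 7

action = AgentAction(
    torch.zeros((1, 2)),                          # continuous part
    [torch.tensor([0]), torch.tensor([1])],       # one sampled index per discrete branch
)
print(flattener.forward(action).shape)    # torch.Size([1, 7]): continuous values, then one-hots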

108
ml-agents/mlagents/trainers/torch/action_log_probs.py


from typing import List, Optional, NamedTuple, Dict
from mlagents.torch_utils import torch
import numpy as np
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import _ActionTupleBase
class LogProbsTuple(_ActionTupleBase):
"""
An object whose fields correspond to the log probs of actions of different types.
Continuous and discrete are numpy arrays
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
@property
def discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete log probability.
"""
return np.float32
class ActionLogProbs(NamedTuple):
"""
A NamedTuple containing the tensor for continuous log probs and list of tensors for
discrete log probs of individual actions as well as all the log probs for an entire branch.
Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
:param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were
sampled.
:param all_discrete_list: List of Torch tensors each corresponding to all log probs of
a discrete action branch, even the discrete actions that were not sampled. all_discrete_list is a list of Tensors,
each corresponding to the log probabilities of one discrete branch.
"""
continuous_tensor: torch.Tensor
discrete_list: Optional[List[torch.Tensor]]
all_discrete_list: Optional[List[torch.Tensor]]
@property
def discrete_tensor(self):
"""
Returns the discrete log probs list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
@property
def all_discrete_tensor(self):
"""
Returns the discrete log probs of each branch as a tensor
"""
return torch.cat(self.all_discrete_list, dim=1)
def to_log_probs_tuple(self) -> LogProbsTuple:
"""
Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise,
LogProbsTuple uses a default.
"""
log_probs_tuple = LogProbsTuple()
if self.continuous_tensor is not None:
continuous = ModelUtils.to_numpy(self.continuous_tensor)
log_probs_tuple.add_continuous(continuous)
if self.discrete_list is not None:
discrete = ModelUtils.to_numpy(self.discrete_tensor)
log_probs_tuple.add_discrete(discrete)
return log_probs_tuple
def _to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This
is private and serves as a utility for self.flatten()
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(self.discrete_tensor)
return tensor_list
def flatten(self) -> torch.Tensor:
"""
A utility method that returns all log probs in ActionLogProbs as a flattened tensor.
This is useful for algorithms like PPO which can treat all log probs in the same way.
"""
return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
# This will keep discrete_list = None which enables flatten()
if discrete_tensor.shape[1] > 0:
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
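flatten() above is what lets PPO treat a hybrid action space uniformly: the continuous log probs and the per-branch discrete log probs are concatenated into a single tensor before the ratio is computed. A small sketch of the shapes involved (toy values, one agent):

import torch
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

log_probs = ActionLogProbs(
    torch.zeros((1, 2)),                        # continuous log probs (2 dims)
    [torch.zeros((1,)), torch.zeros((1,))],     # sampled log prob for each of 2 discrete branches
    None,                                       # all_discrete_list is optional
)
flat = log_probs.flatten()                      # concatenates continuous and stacked discrete log probs
print(flat.shape)                               # torch.Size([1, 4])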

184
ml-agents/mlagents/trainers/torch/action_model.py


from typing import List, Tuple, NamedTuple, Optional
from mlagents.torch_utils import torch, nn
from mlagents.trainers.torch.distributions import (
DistInstance,
DiscreteDistInstance,
GaussianDistribution,
MultiCategoricalDistribution,
)
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents_envs.base_env import ActionSpec
EPSILON = 1e-7 # Small value to avoid divide by zero
class DistInstances(NamedTuple):
"""
A NamedTuple with fields corresponding to the DistInstance objects
output by continuous and discrete distributions, respectively. Discrete distributions
output a list of DistInstance objects whereas continuous distributions output a single
DistInstance object.
"""
continuous: Optional[DistInstance]
discrete: Optional[List[DiscreteDistInstance]]
class ActionModel(nn.Module):
def __init__(
self,
hidden_size: int,
action_spec: ActionSpec,
conditional_sigma: bool = False,
tanh_squash: bool = False,
):
"""
A torch module that represents the action space of a policy. The ActionModel may contain
a continuous distribution, a discrete distribution or both where construction depends on
the action_spec. The ActionModel uses the encoded input of the network body to parameterize
these distributions. The forward method of this module outputs the action, log probs,
and entropies given the encoding from the network body.
:params hidden_size: Size of the input to the ActionModel.
:params action_spec: The ActionSpec defining the action space dimensions and distributions.
:params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state.
:params tanh_squash: Whether to squash the output of a Gaussian with the tanh function.
"""
super().__init__()
self.encoding_size = hidden_size
self.action_spec = action_spec
self._continuous_distribution = None
self._discrete_distribution = None
if self.action_spec.continuous_size > 0:
self._continuous_distribution = GaussianDistribution(
self.encoding_size,
self.action_spec.continuous_size,
conditional_sigma=conditional_sigma,
tanh_squash=tanh_squash,
)
if self.action_spec.discrete_size > 0:
self._discrete_distribution = MultiCategoricalDistribution(
self.encoding_size, self.action_spec.discrete_branches
)
def _sample_action(self, dists: DistInstances) -> AgentAction:
"""
Samples actions from a DistInstances tuple
:params dists: The DistInstances tuple
:return: An AgentAction corresponding to the actions sampled from the DistInstances
"""
continuous_action: Optional[torch.Tensor] = None
discrete_action: Optional[List[torch.Tensor]] = None
# This checks None because mypy complains otherwise
if dists.continuous is not None:
continuous_action = dists.continuous.sample()
if dists.discrete is not None:
discrete_action = []
for discrete_dist in dists.discrete:
discrete_action.append(discrete_dist.sample())
return AgentAction(continuous_action, discrete_action)
def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances:
"""
Creates a DistInstances tuple using the continuous and discrete distributions
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:return: A DistInstances tuple
"""
continuous_dist: Optional[DistInstance] = None
discrete_dist: Optional[List[DiscreteDistInstance]] = None
# This checks None because mypy complains otherwise
if self._continuous_distribution is not None:
continuous_dist = self._continuous_distribution(inputs)
if self._discrete_distribution is not None:
discrete_dist = self._discrete_distribution(inputs, masks)
return DistInstances(continuous_dist, discrete_dist)
def _get_probs_and_entropy(
self, actions: AgentAction, dists: DistInstances
) -> Tuple[ActionLogProbs, torch.Tensor]:
"""
Computes the log probabilities of the actions given distributions and entropies of
the given distributions.
:params actions: The AgentAction
:params dists: The DistInstances tuple
:return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
"""
entropies_list: List[torch.Tensor] = []
continuous_log_prob: Optional[torch.Tensor] = None
discrete_log_probs: Optional[List[torch.Tensor]] = None
all_discrete_log_probs: Optional[List[torch.Tensor]] = None
# This checks None because mypy complains otherwise
if dists.continuous is not None:
continuous_log_prob = dists.continuous.log_prob(actions.continuous_tensor)
entropies_list.append(dists.continuous.entropy())
if dists.discrete is not None:
discrete_log_probs = []
all_discrete_log_probs = []
for discrete_action, discrete_dist in zip(
actions.discrete_list, dists.discrete # type: ignore
):
discrete_log_prob = discrete_dist.log_prob(discrete_action)
entropies_list.append(discrete_dist.entropy())
discrete_log_probs.append(discrete_log_prob)
all_discrete_log_probs.append(discrete_dist.all_log_prob())
action_log_probs = ActionLogProbs(
continuous_log_prob, discrete_log_probs, all_discrete_log_probs
)
entropies = torch.cat(entropies_list, dim=1)
return action_log_probs, entropies
def evaluate(
self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction
) -> Tuple[ActionLogProbs, torch.Tensor]:
"""
Given actions and encoding from the network body, gets the distributions and
computes the log probabilities and entropies.
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:params actions: The AgentAction
:return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
"""
dists = self._get_dists(inputs, masks)
log_probs, entropies = self._get_probs_and_entropy(actions, dists)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum
def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
"""
Gets the tensors corresponding to the output of the policy network to be used for
inference. Called by the Actor's forward call.
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:return: A single torch tensor: the concatenation of the distribution outputs used for inference
"""
dists = self._get_dists(inputs, masks)
out_list: List[torch.Tensor] = []
# This checks None because mypy complains otherwise
if dists.continuous is not None:
out_list.append(dists.continuous.exported_model_output())
if dists.discrete is not None:
for discrete_dist in dists.discrete:
out_list.append(discrete_dist.exported_model_output())
return torch.cat(out_list, dim=1)
def forward(
self, inputs: torch.Tensor, masks: torch.Tensor
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]:
"""
The forward method of this module. Outputs the action, log probs,
and entropies given the encoding from the network body.
:params inputs: The encoding from the network body
:params masks: Action masks for discrete actions
:return: An AgentAction containing the actions generated by the policy for the given input,
along with the corresponding ActionLogProbs and entropies.
"""
dists = self._get_dists(inputs, masks)
actions = self._sample_action(dists)
log_probs, entropies = self._get_probs_and_entropy(actions, dists)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return (actions, log_probs, entropy_sum)
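For reviewers who want to exercise the new module in isolation, here is a minimal usage sketch. It is illustrative only and not part of the diff: the ActionSpec construction, sizes, and mask shape are assumptions made for the example.
# Minimal sketch (not part of this PR): exercise ActionModel with a hybrid action space.
# Sizes, batch dimension, and mask layout below are made up for illustration.
from mlagents.torch_utils import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.action_model import ActionModel

# 2 continuous dimensions plus two discrete branches with 3 and 2 choices each (assumed spec).
action_spec = ActionSpec(continuous_size=2, discrete_branches=(3, 2))
action_model = ActionModel(hidden_size=64, action_spec=action_spec)

encoding = torch.randn(5, 64)  # stand-in for the network body output, batch of 5
masks = torch.ones(5, sum(action_spec.discrete_branches))  # no discrete actions masked out
actions, log_probs, entropy_sum = action_model(encoding, masks)

# actions is an AgentAction, log_probs is an ActionLogProbs, and entropy_sum is the
# per-batch-element sum of the distribution entropies (see forward/evaluate above).
action_tuple = actions.to_action_tuple()  # numpy actions ready for the environment manager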

58
ml-agents/mlagents/trainers/torch/agent_action.py


from typing import List, Optional, NamedTuple, Dict
from mlagents.torch_utils import torch
import numpy as np
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple
class AgentAction(NamedTuple):
"""
A NamedTuple containing the tensor for continuous actions and list of tensors for
discrete actions. Utility functions provide numpy <=> tensor conversions to be
sent as actions to the environment manager as well as used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to continuous actions
:param discrete_list: List of Torch tensors each corresponding to discrete actions
"""
continuous_tensor: Optional[torch.Tensor]
discrete_list: Optional[List[torch.Tensor]]
@property
def discrete_tensor(self) -> torch.Tensor:
"""
Returns the discrete action list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
def to_action_tuple(self) -> ActionTuple:
"""
Returns this AgentAction as an ActionTuple of numpy arrays, suitable for the environment manager.
"""
action_tuple = ActionTuple()
if self.continuous_tensor is not None:
continuous = ModelUtils.to_numpy(self.continuous_tensor)
action_tuple.add_continuous(continuous)
if self.discrete_list is not None:
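# Each sampled discrete branch is expected to have shape (batch, 1) (as produced by
# ActionModel._sample_action), so the stacked discrete_tensor is (batch, 1, num_branches);
# the [:, 0, :] slice below drops that singleton dimension before converting to numpy.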
discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
action_tuple.add_discrete(discrete)
return action_tuple
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
continuous: Optional[torch.Tensor] = None
discrete: Optional[List[torch.Tensor]] = None
if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff["discrete_action"], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return AgentAction(continuous, discrete)
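As a companion sketch (illustrative, not part of the diff), the buffer round-trip looks roughly like this, using the "continuous_action"/"discrete_action" keys referenced in from_dict; the data and shapes are made up.
# Illustrative only: keys follow from_dict above, the data is fabricated for the example.
import numpy as np
from mlagents.trainers.torch.agent_action import AgentAction

buff = {
    "continuous_action": np.random.rand(4, 2).astype(np.float32),  # batch of 4, 2 continuous dims
    "discrete_action": np.random.randint(0, 3, size=(4, 2)),       # batch of 4, 2 discrete branches
}
agent_action = AgentAction.from_dict(buff)
# agent_action.continuous_tensor has shape (4, 2); agent_action.discrete_list is a list of
# two (4,)-shaped tensors, and discrete_tensor stacks them back into shape (4, 2).
stacked = agent_action.discrete_tensor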