
Action buffer (#4612)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>
/fix-conflict-base-env
GitHub, 4 years ago
Current commit
b853e5ba
38 files changed, 633 insertions(+), 242 deletions(-)
  1. ml-agents-envs/mlagents_envs/base_env.py (109 changes)
  2. ml-agents-envs/mlagents_envs/environment.py (21 changes)
  3. ml-agents-envs/mlagents_envs/tests/test_envs.py (6 changes)
  4. ml-agents-envs/mlagents_envs/tests/test_steps.py (27 changes)
  5. ml-agents/mlagents/trainers/agent_processor.py (12 changes)
  6. ml-agents/mlagents/trainers/buffer.py (2 changes)
  7. ml-agents/mlagents/trainers/demo_loader.py (9 changes)
  8. ml-agents/mlagents/trainers/env_manager.py (22 changes)
  9. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (4 changes)
  10. ml-agents/mlagents/trainers/policy/policy.py (23 changes)
  11. ml-agents/mlagents/trainers/policy/tf_policy.py (9 changes)
  12. ml-agents/mlagents/trainers/policy/torch_policy.py (48 changes)
  13. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (9 changes)
  14. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (12 changes)
  15. ml-agents/mlagents/trainers/sac/optimizer_tf.py (6 changes)
  16. ml-agents/mlagents/trainers/sac/optimizer_torch.py (44 changes)
  17. ml-agents/mlagents/trainers/simple_env_manager.py (3 changes)
  18. ml-agents/mlagents/trainers/subprocess_env_manager.py (5 changes)
  19. ml-agents/mlagents/trainers/tests/mock_brain.py (21 changes)
  20. ml-agents/mlagents/trainers/tests/simple_test_envs.py (32 changes)
  21. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2 changes)
  22. ml-agents/mlagents/trainers/tests/test_agent_processor.py (26 changes)
  23. ml-agents/mlagents/trainers/tests/test_demo_loader.py (10 changes)
  24. ml-agents/mlagents/trainers/tests/test_trajectory.py (2 changes)
  25. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (9 changes)
  26. ml-agents/mlagents/trainers/tests/torch/test_policy.py (29 changes)
  27. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (25 changes)
  28. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (2 changes)
  29. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (11 changes)
  30. ml-agents/mlagents/trainers/tests/torch/test_utils.py (15 changes)
  31. ml-agents/mlagents/trainers/tf/components/bc/module.py (6 changes)
  32. ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py (10 changes)
  33. ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py (17 changes)
  34. ml-agents/mlagents/trainers/torch/components/bc/module.py (27 changes)
  35. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (17 changes)
  36. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6 changes)
  37. ml-agents/mlagents/trainers/torch/utils.py (212 changes)
  38. ml-agents/mlagents/trainers/trajectory.py (25 changes)

109
ml-agents-envs/mlagents_envs/base_env.py


)
class ActionTuple:
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively, and are type checked on construction.
Dimensions are (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
"""
def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
if continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
self._continuous = continuous
if discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
self._discrete = discrete
@property
def continuous(self) -> np.ndarray:
return self._continuous
@property
def discrete(self) -> np.ndarray:
return self._discrete
@staticmethod
def create_continuous(continuous: np.ndarray) -> "ActionTuple":
discrete = np.zeros((continuous.shape[0], 0), dtype=np.int32)
return ActionTuple(continuous, discrete)
@staticmethod
def create_discrete(discrete: np.ndarray) -> "ActionTuple":
continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
return ActionTuple(continuous, discrete)
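For reference, a minimal sketch of how the new ActionTuple is constructed and consumed (the agent count and action sizes below are hypothetical):

import numpy as np
from mlagents_envs.base_env import ActionTuple

# 2 agents, 3 continuous actions and 2 discrete branches (hypothetical sizes).
continuous = np.random.uniform(-1.0, 1.0, size=(2, 3))
discrete = np.array([[0, 1], [1, 0]])
action = ActionTuple(continuous, discrete)
# The constructor coerces the arrays to the expected dtypes.
assert action.continuous.dtype == np.float32
assert action.discrete.dtype == np.int32

# The single-type factories fill the other field with a zero-width array.
cont_only = ActionTuple.create_continuous(continuous)
assert cont_only.discrete.shape == (2, 0)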
class ActionSpec(NamedTuple):
"""
A NamedTuple containing utility functions and information about the action spaces

"""
return len(self.discrete_branches)
def empty_action(self, n_agents: int) -> np.ndarray:
def empty_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to an empty action (all zeros)
Generates ActionTuple corresponding to an empty action (all zeros)
if self.is_continuous():
return np.zeros((n_agents, self.continuous_size), dtype=np.float32)
return np.zeros((n_agents, self.discrete_size), dtype=np.int32)
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
def random_action(self, n_agents: int) -> np.ndarray:
def random_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to a random action (either discrete
Generates ActionTuple corresponding to a random action (either discrete
if self.is_continuous():
action = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
).astype(np.float32)
else:
branch_size = self.discrete_branches
action = np.column_stack(
continuous = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
if self.discrete_size > 0:
discrete = np.column_stack(
branch_size[i], # type: ignore
self.discrete_branches[i], # type: ignore
size=(n_agents),
dtype=np.int32,
)

return action
return ActionTuple(continuous, discrete)
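A short sketch of the updated ActionSpec helpers, which now return an ActionTuple instead of a single np.ndarray (the hybrid spec below is hypothetical):

import numpy as np
from mlagents_envs.base_env import ActionSpec

# 3 continuous actions plus two discrete branches of sizes 2 and 3 (hypothetical).
spec = ActionSpec(3, (2, 3))  # (continuous_size, discrete_branches)
empty = spec.empty_action(5)
assert empty.continuous.shape == (5, 3)
assert empty.discrete.shape == (5, 2)

rand = spec.random_action(5)
assert rand.continuous.dtype == np.float32  # coerced by ActionTuple
assert rand.discrete.dtype == np.int32
assert (rand.continuous >= -1.0).all() and (rand.continuous <= 1.0).all()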
self, actions: np.ndarray, n_agents: int, name: str
) -> np.ndarray:
self, actions: ActionTuple, n_agents: int, name: str
) -> ActionTuple:
if self.continuous_size > 0:
_size = self.continuous_size
else:
_size = self.discrete_size
_expected_shape = (n_agents, _size)
if actions.shape != _expected_shape:
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {actions.continuous.shape}"
)
_expected_shape = (n_agents, self.discrete_size)
if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
f"The behavior {name} needs an input of dimension "
f"The behavior {name} needs a discrete input of dimension "
f"received input of dimension {actions.shape}"
f"received input of dimension {actions.discrete.shape}"
_expected_type = np.float32 if self.is_continuous() else np.int32
if actions.dtype != _expected_type:
actions = actions.astype(_expected_type)
return actions
@staticmethod

"""
@abstractmethod
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionTuple tuple of continuous and/or discrete action.
Actions are np.arrays with dimensions (n_agents, continuous_size) and
(n_agents, discrete_size), respectively.
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
"""
Sets the action for one of the agents in the simulation for the next

:param action: A one dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionTuple tuple of continuous and/or discrete action
Actions are np.arrays with dimensions (1, continuous_size) and
(1, discrete_size), respectively. Note that the initial dimension of 1 is because
this action is meant for a single agent.
"""
@abstractmethod

21
ml-agents-envs/mlagents_envs/environment.py


DecisionSteps,
TerminalSteps,
BehaviorSpec,
ActionTuple,
BehaviorName,
AgentId,
BehaviorMapping,

self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._env_actions: Dict[str, np.ndarray] = {}
self._env_actions: Dict[str, ActionTuple] = {}
self._is_first_message = True
self._update_behavior_specs(aca_output)

f"agent group in the environment"
)
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:
return

self._env_actions[behavior_name] = action
def set_action_for_agent(
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:

agent_id
)
) from ie
self._env_actions[behavior_name][index] = action
if action_spec.continuous_size > 0:
self._env_actions[behavior_name].continuous[index] = action.continuous[0, :]
if action_spec.discrete_size > 0:
self._env_actions[behavior_name].discrete[index] = action.discrete[0, :]
def get_steps(
self, behavior_name: BehaviorName

@timed
def _generate_step_input(
self, vector_action: Dict[str, np.ndarray]
self, vector_action: Dict[str, ActionTuple]
) -> UnityInputProto:
rl_in = UnityRLInputProto()
for b in vector_action:

for i in range(n_agents):
action = AgentActionProto(vector_actions=vector_action[b][i])
# TODO: This check will be removed when the proto supports hybrid actions
if vector_action[b].continuous.shape[1] > 0:
_act = vector_action[b].continuous[i]
else:
_act = vector_action[b].discrete[i]
action = AgentActionProto(vector_actions=_act)
rl_in.agent_actions[b].value.extend([action])
rl_in.command = STEP
rl_in.side_channel = bytes(
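Illustrative use of the updated set_actions / set_action_for_agent API; this is only a sketch and assumes a built Unity environment is available (the executable name below is hypothetical):

import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

env = UnityEnvironment(file_name="MyEnv")  # hypothetical executable
env.reset()
behavior_name = list(env.behavior_specs)[0]
action_spec = env.behavior_specs[behavior_name].action_spec
decision_steps, terminal_steps = env.get_steps(behavior_name)

# One ActionTuple covers the whole batch of agents requesting a decision.
env.set_actions(behavior_name, action_spec.random_action(len(decision_steps)))

# A single agent can be overridden with a (1, size) ActionTuple.
single = ActionTuple(
    np.zeros((1, action_spec.continuous_size), dtype=np.float32),
    np.zeros((1, action_spec.discrete_size), dtype=np.int32),
)
env.set_action_for_agent(behavior_name, decision_steps.agent_id[0], single)
env.step()
env.close()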

6
ml-agents-envs/mlagents_envs/tests/test_envs.py


import pytest
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs.base_env import DecisionSteps, TerminalSteps, ActionTuple
from mlagents_envs.exception import UnityEnvironmentException, UnityActionException
from mlagents_envs.mock_communicator import MockCommunicator

env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents) - 1)
_empty_act = spec.action_spec.empty_action(n_agents)
next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
env.set_actions("RealFakeBrain", next_action)
env.step()
env.close()

27
ml-agents-envs/mlagents_envs/tests/test_steps.py


assert specs.discrete_branches == ()
assert specs.discrete_size == 0
assert specs.continuous_size == 3
assert specs.empty_action(5).shape == (5, 3)
assert specs.empty_action(5).dtype == np.float32
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).shape == (5, 1)
assert specs.empty_action(5).dtype == np.int32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
specs = ActionSpec(3, (3,))
assert specs.continuous_size == 3
assert specs.discrete_branches == (3,)
assert specs.discrete_size == 1
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
def test_action_generator():

zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).continuous
random_action = specs.random_action(4)
print(specs.random_action(4))
random_action = specs.random_action(4).continuous
print(random_action)
assert random_action.dtype == np.float32
assert random_action.shape == (4, action_len)
assert np.min(random_action) >= -1

action_shape = (10, 20, 30)
specs = ActionSpec.create_discrete(action_shape)
zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).discrete
random_action = specs.random_action(4)
random_action = specs.random_action(4).discrete
assert random_action.dtype == np.int32
assert random_action.shape == (4, len(action_shape))
assert np.min(random_action) >= 0

12
ml-agents/mlagents/trainers/agent_processor.py


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
from mlagents_envs.base_env import (
DecisionSteps,

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
action_probs = stored_take_action_outputs["log_probs"][idx]
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(
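The stored model outputs are now dictionaries keyed by action type, so the per-agent row is sliced out key by key. A small standalone illustration (array contents are hypothetical):

import numpy as np

stored_action = {
    "continuous_action": np.zeros((3, 2), dtype=np.float32),
    "discrete_action": np.zeros((3, 1), dtype=np.int32),
}
idx = 1  # index of one agent within the batched output
action = {act_type: act_array[idx] for act_type, act_array in stored_action.items()}
assert action["continuous_action"].shape == (2,)
assert action["discrete_action"].shape == (1,)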

2
ml-agents/mlagents/trainers/buffer.py


class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
"""

9
ml-agents/mlagents/trainers/demo_loader.py


for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
if behavior_spec.action_spec.is_continuous():
demo_raw_buffer["continuous_action"].append(
current_pair_info.action_info.vector_actions
)
else:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(
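Demonstration actions now land in the buffer under "continuous_action" or "discrete_action" rather than a single "actions" key. A minimal sketch of the resulting AgentBuffer usage (values are hypothetical):

import numpy as np
from mlagents.trainers.buffer import AgentBuffer

demo_raw_buffer = AgentBuffer()
# A continuous behavior appends here; a discrete one would use "discrete_action".
demo_raw_buffer["continuous_action"].append(np.zeros(3, dtype=np.float32))
assert len(demo_raw_buffer["continuous_action"]) == 1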

22
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

from mlagents_envs.logging_util import get_logger
from mlagents_envs.exception import UnityActionException
AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]]
AllGroupSpec = Dict[BehaviorName, BehaviorSpec]

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple(continuous, discrete)
else:
action_tuple = ActionTuple.create_continuous(continuous)
elif "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple.create_discrete(discrete)
else:
raise UnityActionException(
"The action dict must contain entries for either continuous_action or discrete_action."
)
return action_tuple
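A quick sketch of the new helper, which converts the trainer-side action dictionary into an ActionTuple for the environment (shapes are hypothetical):

import numpy as np
from mlagents.trainers.env_manager import EnvManager

act_dict = {"continuous_action": np.zeros((2, 3), dtype=np.float32)}
action_tuple = EnvManager.action_tuple_from_numpy_dict(act_dict)
assert action_tuple.continuous.shape == (2, 3)
assert action_tuple.discrete.shape == (2, 0)  # filled in by ActionTuple.create_continuous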

4
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


[self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
)
prev_action = (
batch["actions"][-1] if not self.policy.use_continuous_act else None
batch["discrete_action"][-1]
if not self.policy.use_continuous_act
else None
)
else:
value_estimates = self.sess.run(self.value_heads, feed_dict)

23
ml-agents/mlagents/trainers/policy/policy.py


1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
# This line will be removed in the ActionBuffer change
self.num_branches = (
self.behavior_spec.action_spec.continuous_size
+ self.behavior_spec.action_spec.discrete_size
)
self.previous_action_dict: Dict[str, np.array] = {}
self.previous_action_dict: Dict[str, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
return np.zeros((num_agents, self.num_branches), dtype=np.int)
return np.zeros(
(num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32
)
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
if action_matrix is None:
if action_dict is None or "discrete_action" not in action_dict:
self.previous_action_dict[agent_id] = action_matrix[index, :]
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
action_matrix = self.make_empty_previous_action(len(agent_ids))
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]

9
ml-agents/mlagents/trainers/policy/tf_policy.py


feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)

)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
if "action" in run_out:
if self.behavior_spec.action_spec.is_continuous():
run_out["action"] = {"continuous_action": run_out["action"]}
else:
run_out["action"] = {"discrete_action": run_out["action"]}
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),

48
ml-agents/mlagents/trainers/policy/torch_policy.py


SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
all_log_probs: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param vec_obs: List of vector observations.
:param vis_obs: List of visual observations.

:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
:return: Tuple of actions, log probabilities (dependent on all_log_probs), entropies, and
output memories, all as Torch Tensors.
"""

vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
actions = torch.stack(action_list, dim=-1)
if self.use_continuous_act:
actions = actions[:, :, 0]
else:
actions = actions[:, 0, :]
actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec, all_logs_list
)
return (
actions,
all_logs if all_log_probs else log_probs,
entropy_sum,
memories,
)
return (actions, log_probs, entropy_sum, memories)
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
action_list = actions.to_tensor_list()
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
action_list, dists
)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec
)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads

action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = ModelUtils.to_numpy(action)
run_out["pre_action"] = ModelUtils.to_numpy(action)
# Todo - make pre_action difference
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
run_out["pre_action"] = (
action_dict["continuous_action"] if self.use_continuous_act else None
)
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:
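For orientation, the run_out dictionary built by TorchPolicy.evaluate now nests actions and log probabilities by type. A plain illustration of the resulting structure for a continuous-only behavior with 2 agents and 3 actions (key names follow the tests in this change; shapes and values are hypothetical):

import numpy as np

run_out = {
    "action": {"continuous_action": np.zeros((2, 3), dtype=np.float32)},
    "pre_action": np.zeros((2, 3), dtype=np.float32),  # None for discrete behaviors
    "log_probs": {"continuous_log_probs": np.zeros((2, 3), dtype=np.float32)},
    "entropy": np.zeros((2,), dtype=np.float32),
    "learning_rate": 0.0,
}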

9
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
feed_dict[self.policy.output] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.policy.output] = mini_batch["continuous_action"]
else:
feed_dict[self.policy.output] = mini_batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
if "vector_obs" in mini_batch:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]

12
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class TorchPPOOptimizer(TorchOptimizer):

advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks

log_probs,
ModelUtils.list_to_tensor(batch["action_probs"]),
old_log_probs,
loss_masks,
)
loss = (

6
ml-agents/mlagents/trainers/sac/optimizer_tf.py


feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
if self.policy.use_continuous_act:
feed_dict[self.policy_network.external_action_in] = batch["actions"]
feed_dict[self.policy_network.external_action_in] = batch[
"continuous_action"
]
feed_dict[policy.output] = batch["actions"]
feed_dict[policy.output] = batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[policy.prev_action] = batch["prev_action"]
feed_dict[policy.action_masks] = batch["action_mask"]

44
ml-agents/mlagents/trainers/sac/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.networks import ValueNetwork
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.timers import timed
from mlagents.trainers.exception import UnityTrainerException

def sac_value_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
values: Dict[str, torch.Tensor],
q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],

if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)

for name in values.keys():
with torch.no_grad():
v_backup = min_policy_qs[name] - torch.sum(
_ent_coef * log_probs, dim=1
_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

branched_per_action_ent = ModelUtils.break_into_branches(
log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
# We have to do entropy bonus per action branch
branched_ent_bonus = torch.stack(

def sac_policy_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,

if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
log_probs * action_probs, self.act_size
log_probs.all_discrete_tensor * action_probs, self.act_size
)
branched_q_term = ModelUtils.break_into_branches(
mean_q1 * action_probs, self.act_size

return policy_loss
def sac_entropy_loss(
self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1)
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)

log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
target_current_diff_branched = torch.stack(
[

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,
all_log_probs=not self.policy.use_continuous_act,
squeezed_actions = actions.squeeze(-1)
squeezed_actions = actions.continuous_tensor
sampled_actions,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,

memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions)
q2_stream = self._condense_q_streams(q2_out, actions)
q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(

3
ml-agents/mlagents/trainers/simple_env_manager.py


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
self.env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.step()
all_step_result = self._generate_all_results()

5
ml-agents/mlagents/trainers/subprocess_env_manager.py


all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

21
ml-agents/mlagents/trainers/tests/mock_brain.py


steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

action = np.zeros(action_size, dtype=np.float32)
if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
else:
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

if action_spec.is_discrete()
else None
)
prev_action = np.ones(action_size, dtype=np.float32)
if action_spec.is_discrete():
prev_action = np.ones(action_size, dtype=np.int32)
else:
prev_action = np.ones(action_size, dtype=np.float32)
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"

32
ml-agents/mlagents/trainers/tests/simple_test_envs.py


from mlagents_envs.base_env import (
ActionSpec,
ActionTuple,
BaseEnv,
BehaviorSpec,
DecisionSteps,

else:
action_spec = ActionSpec.create_continuous(action_size)
self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
self.action_spec = action_spec
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}

def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
_act = self.action[name]
if self.action_spec.discrete_size > 0:
for _disc in _act.discrete[0]:
deltas.append(1 if _disc else -1)
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

def step(self) -> None:
super().step()
for name in self.names:
if self.discrete:
action = self.action[name].discrete
else:
action = self.action[name].continuous
self.step_result[name][0], self.step_result[name][1], self.action[name]
self.step_result[name][0], self.step_result[name][1], action
)
self.demonstration_protos[name] = self.demonstration_protos[name][
-self.n_demos :

for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
self.action[name] = ActionTuple(
np.array([], dtype=np.float32),
np.array(
[[1]] if self.goal[name] > 0 else [[0]], dtype=np.int32
),
)
self.action[name] = [[float(self.goal[name])]]
self.action[name] = ActionTuple(
np.array([[float(self.goal[name])]], dtype=np.float32),
np.array([], dtype=np.int32),
)
self.step()

2
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py


behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"action": {"continuous_action": np.array([1.0], dtype=np.float32)},
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

26
ml-agents/mlagents/trainers/tests/test_agent_processor.py


mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
return mock_policy

)
fake_action_outputs = {
"action": [0.1, 0.1],
"action": {"continuous_action": [0.1, 0.1]},
"log_probs": [0.1, 0.1],
"log_probs": {"continuous_log_probs": [0.1, 0.1]},
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

fake_action_info = ActionInfo(
action=[0.1, 0.1],
action={"continuous_action": [0.1, 0.1]},
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,

done=True,
)
fake_action_info = ActionInfo(
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], {"continuous_action": [0.1]})
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
)

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,

fake_action_info = ActionInfo(
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

10
ml-agents/mlagents/trainers/tests/test_demo_loader.py


assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_load_demo_dir():

assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_demo_mismatch():

2
ml-agents/mlagents/trainers/tests/test_trajectory.py


"masks",
"done",
"actions_pre",
"actions",
"continuous_action",
"action_probs",
"action_mask",
"prev_action",

9
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py


with torch.no_grad():
_, log_probs1, _, _ = policy1.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
np.testing.assert_array_equal(log_probs1, log_probs2)
np.testing.assert_array_equal(
log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

29
ml-agents/mlagents/trainers/tests/torch/test_policy.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
run_out["action"]["discrete_action"].shape == (
NUM_AGENTS,
len(DISCRETE_ACTION_SPACE),
)
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
assert run_out["action"]["continuous_action"].shape == (
NUM_AGENTS,
VECTOR_ACTION_SPACE,
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
if policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long)
agent_action = AgentAction.from_dict(buffer)
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])

vec_obs,
vis_obs,
masks=act_masks,
actions=actions,
actions=agent_action,
memories=memories,
seq_len=policy.sequence_length,
)

_size = policy.behavior_spec.action_spec.continuous_size
assert log_probs.shape == (64, _size)
assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)

masks=act_masks,
memories=memories,
seq_len=policy.sequence_length,
all_log_probs=not policy.use_continuous_act,
assert log_probs.shape == (
assert log_probs.all_discrete_tensor.shape == (
assert log_probs.shape == (64, policy.behavior_spec.action_spec.continuous_size)
assert log_probs.continuous_tensor.shape == (
64,
policy.behavior_spec.action_spec.continuous_size,
)
assert entropies.shape == (64,)
if rnn:

25
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

2
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py


for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["actions"][0])
target = torch.tensor(buffer["continuous_action"][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

11
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
]
action = behavior_spec.action_spec.random_action(1)[0, :]
action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if behavior_spec.action_spec.continuous_size > 0:
action["continuous_action"] = action_buffer.continuous
if behavior_spec.action_spec.discrete_size > 0:
action["discrete_action"] = action_buffer.discrete
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act[0, :])
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)

15
ml-agents/mlagents/trainers/tests/torch/test_utils.py


log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert log_probs.shape == (1, 2, 2)
for lp in log_probs:
assert lp.shape == (1, 2)
assert all_probs is None
assert all_probs == []
for log_prob in log_probs.flatten():
for log_prob in log_probs:
assert log_prob == pytest.approx(-0.919, abs=0.01)
for lp in log_prob.flatten():
assert lp == pytest.approx(-0.919, abs=0.01)
for ent in entropies.flatten():
# entropy of standard normal at 0

log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert all_probs.shape == (1, len(dist_list * act_size))
for all_prob in all_probs:
assert all_prob.shape == (1, act_size)
assert log_probs.flatten()[0] > log_probs.flatten()[1]
assert log_probs[0] > log_probs[1]
def test_masked_mean():

6
ml-agents/mlagents/trainers/tf/components/bc/module.py


self.policy.batch_size_ph: n_sequences,