
Action buffer (#4612)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>
/fix-conflict-base-env
GitHub, 4 years ago
Current commit: b853e5ba
38 files changed, 633 insertions and 242 deletions
  1. 109 ml-agents-envs/mlagents_envs/base_env.py
  2. 21 ml-agents-envs/mlagents_envs/environment.py
  3. 6 ml-agents-envs/mlagents_envs/tests/test_envs.py
  4. 27 ml-agents-envs/mlagents_envs/tests/test_steps.py
  5. 12 ml-agents/mlagents/trainers/agent_processor.py
  6. 2 ml-agents/mlagents/trainers/buffer.py
  7. 9 ml-agents/mlagents/trainers/demo_loader.py
  8. 22 ml-agents/mlagents/trainers/env_manager.py
  9. 4 ml-agents/mlagents/trainers/optimizer/tf_optimizer.py
  10. 23 ml-agents/mlagents/trainers/policy/policy.py
  11. 9 ml-agents/mlagents/trainers/policy/tf_policy.py
  12. 48 ml-agents/mlagents/trainers/policy/torch_policy.py
  13. 9 ml-agents/mlagents/trainers/ppo/optimizer_tf.py
  14. 12 ml-agents/mlagents/trainers/ppo/optimizer_torch.py
  15. 6 ml-agents/mlagents/trainers/sac/optimizer_tf.py
  16. 44 ml-agents/mlagents/trainers/sac/optimizer_torch.py
  17. 3 ml-agents/mlagents/trainers/simple_env_manager.py
  18. 5 ml-agents/mlagents/trainers/subprocess_env_manager.py
  19. 21 ml-agents/mlagents/trainers/tests/mock_brain.py
  20. 32 ml-agents/mlagents/trainers/tests/simple_test_envs.py
  21. 2 ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py
  22. 26 ml-agents/mlagents/trainers/tests/test_agent_processor.py
  23. 10 ml-agents/mlagents/trainers/tests/test_demo_loader.py
  24. 2 ml-agents/mlagents/trainers/tests/test_trajectory.py
  25. 9 ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py
  26. 29 ml-agents/mlagents/trainers/tests/torch/test_policy.py
  27. 25 ml-agents/mlagents/trainers/tests/torch/test_ppo.py
  28. 2 ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py
  29. 11 ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
  30. 15 ml-agents/mlagents/trainers/tests/torch/test_utils.py
  31. 6 ml-agents/mlagents/trainers/tf/components/bc/module.py
  32. 10 ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py
  33. 17 ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py
  34. 27 ml-agents/mlagents/trainers/torch/components/bc/module.py
  35. 17 ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py
  36. 6 ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py
  37. 212 ml-agents/mlagents/trainers/torch/utils.py
  38. 25 ml-agents/mlagents/trainers/trajectory.py

109
ml-agents-envs/mlagents_envs/base_env.py


)
class ActionTuple:
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively, and are type-checked on construction.
Dimensions are (n_agents, continuous_size) and (n_agents, discrete_size),
respectively.
"""
def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
if continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
self._continuous = continuous
if discrete.dtype != np.int32:
discrete = discrete.astype(np.int32, copy=False)
self._discrete = discrete
@property
def continuous(self) -> np.ndarray:
return self._continuous
@property
def discrete(self) -> np.ndarray:
return self._discrete
@staticmethod
def create_continuous(continuous: np.ndarray) -> "ActionTuple":
discrete = np.zeros((continuous.shape[0], 0), dtype=np.int32)
return ActionTuple(continuous, discrete)
@staticmethod
def create_discrete(discrete: np.ndarray) -> "ActionTuple":
continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
return ActionTuple(continuous, discrete)
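For reference, a minimal usage sketch of the ActionTuple introduced above, assuming the ml-agents-envs package from this commit is importable; the shapes are illustrative:

import numpy as np
from mlagents_envs.base_env import ActionTuple

# dtype coercion happens on construction, as in the diff above
continuous = np.zeros((3, 2))                      # float64 on purpose
discrete = np.zeros((3, 1), dtype=np.int64)
action = ActionTuple(continuous, discrete)
assert action.continuous.dtype == np.float32
assert action.discrete.dtype == np.int32

# the factory helpers fill the missing component with a zero-width array
assert ActionTuple.create_continuous(continuous).discrete.shape == (3, 0)
assert ActionTuple.create_discrete(discrete).continuous.shape == (3, 0)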
class ActionSpec(NamedTuple):
"""
A NamedTuple containing utility functions and information about the action spaces

"""
return len(self.discrete_branches)
def empty_action(self, n_agents: int) -> np.ndarray:
def empty_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to an empty action (all zeros)
Generates ActionTuple corresponding to an empty action (all zeros)
if self.is_continuous():
return np.zeros((n_agents, self.continuous_size), dtype=np.float32)
return np.zeros((n_agents, self.discrete_size), dtype=np.int32)
continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous, discrete)
def random_action(self, n_agents: int) -> np.ndarray:
def random_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to a random action (either discrete
Generates ActionTuple corresponding to a random action (either discrete
if self.is_continuous():
action = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
).astype(np.float32)
else:
branch_size = self.discrete_branches
action = np.column_stack(
continuous = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
)
discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
if self.discrete_size > 0:
discrete = np.column_stack(
branch_size[i], # type: ignore
self.discrete_branches[i], # type: ignore
size=(n_agents),
dtype=np.int32,
)

return action
return ActionTuple(continuous, discrete)
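A hedged sketch of the updated ActionSpec helpers, which now return an ActionTuple rather than a single array; a hybrid spec with 2 continuous dimensions and one 3-way discrete branch is assumed for illustration:

import numpy as np
from mlagents_envs.base_env import ActionSpec

spec = ActionSpec(2, (3,))          # continuous_size=2, discrete_branches=(3,)

empty = spec.empty_action(4)
assert empty.continuous.shape == (4, 2) and empty.continuous.dtype == np.float32
assert empty.discrete.shape == (4, 1) and empty.discrete.dtype == np.int32

rand = spec.random_action(4)
assert (rand.continuous >= -1).all() and (rand.continuous <= 1).all()
assert (rand.discrete >= 0).all() and (rand.discrete < 3).all()   # valid branch indices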
self, actions: np.ndarray, n_agents: int, name: str
) -> np.ndarray:
self, actions: ActionTuple, n_agents: int, name: str
) -> ActionTuple:
if self.continuous_size > 0:
_size = self.continuous_size
else:
_size = self.discrete_size
_expected_shape = (n_agents, _size)
if actions.shape != _expected_shape:
_expected_shape = (n_agents, self.continuous_size)
if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a continuous input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {actions.continuous.shape}"
)
_expected_shape = (n_agents, self.discrete_size)
if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
f"The behavior {name} needs an input of dimension "
f"The behavior {name} needs a discrete input of dimension "
f"received input of dimension {actions.shape}"
f"received input of dimension {actions.discrete.shape}"
_expected_type = np.float32 if self.is_continuous() else np.int32
if actions.dtype != _expected_type:
actions = actions.astype(_expected_type)
return actions
@staticmethod

"""
@abstractmethod
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionTuple tuple of continuous and/or discrete action.
Actions are np.arrays with dimensions (n_agents, continuous_size) and
(n_agents, discrete_size), respectively.
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
"""
Sets the action for one of the agents in the simulation for the next

:param action: A one dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionTuple tuple of continuous and/or discrete action
Actions are np.arrays with dimensions (1, continuous_size) and
(1, discrete_size), respectively. Note that the initial dimension of 1 is because
this action is meant for a single agent.
"""
@abstractmethod

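To tie the base_env.py changes together, a hedged end-to-end sketch of feeding an ActionTuple to a BaseEnv implementation; the environment setup and the behavior name "MyBehavior" are placeholders, not part of the diff:

import numpy as np
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment()     # assumed: connects to a Unity editor in play mode
env.reset()
decision_steps, terminal_steps = env.get_steps("MyBehavior")
n_agents = len(decision_steps)

# One row per agent that requested a decision this step.
action = ActionTuple(
    np.zeros((n_agents, 2), dtype=np.float32),   # (n_agents, continuous_size)
    np.zeros((n_agents, 1), dtype=np.int32),     # (n_agents, discrete_size)
)
env.set_actions("MyBehavior", action)

# For a single agent the leading dimension stays 1, as the docstring above notes.
single = ActionTuple(np.zeros((1, 2), dtype=np.float32), np.zeros((1, 1), dtype=np.int32))
env.set_action_for_agent("MyBehavior", decision_steps.agent_id[0], single)
env.step()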
21
ml-agents-envs/mlagents_envs/environment.py


DecisionSteps,
TerminalSteps,
BehaviorSpec,
ActionTuple,
BehaviorName,
AgentId,
BehaviorMapping,

self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._env_actions: Dict[str, np.ndarray] = {}
self._env_actions: Dict[str, ActionTuple] = {}
self._is_first_message = True
self._update_behavior_specs(aca_output)

f"agent group in the environment"
)
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:
return

self._env_actions[behavior_name] = action
def set_action_for_agent(
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:

agent_id
)
) from ie
self._env_actions[behavior_name][index] = action
if action_spec.continuous_size > 0:
self._env_actions[behavior_name].continuous[index] = action.continuous[0, :]
if action_spec.discrete_size > 0:
self._env_actions[behavior_name].discrete[index] = action.discrete[0, :]
def get_steps(
self, behavior_name: BehaviorName

@timed
def _generate_step_input(
self, vector_action: Dict[str, np.ndarray]
self, vector_action: Dict[str, ActionTuple]
) -> UnityInputProto:
rl_in = UnityRLInputProto()
for b in vector_action:

for i in range(n_agents):
action = AgentActionProto(vector_actions=vector_action[b][i])
# TODO: This check will be removed when the proto supports hybrid actions
if vector_action[b].continuous.shape[1] > 0:
_act = vector_action[b].continuous[i]
else:
_act = vector_action[b].discrete[i]
action = AgentActionProto(vector_actions=_act)
rl_in.agent_actions[b].value.extend([action])
rl_in.command = STEP
rl_in.side_channel = bytes(

6
ml-agents-envs/mlagents_envs/tests/test_envs.py


import pytest
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs.base_env import DecisionSteps, TerminalSteps, ActionTuple
from mlagents_envs.exception import UnityEnvironmentException, UnityActionException
from mlagents_envs.mock_communicator import MockCommunicator

env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents) - 1)
_empty_act = spec.action_spec.empty_action(n_agents)
next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
env.set_actions("RealFakeBrain", next_action)
env.step()
env.close()

27
ml-agents-envs/mlagents_envs/tests/test_steps.py


assert specs.discrete_branches == ()
assert specs.discrete_size == 0
assert specs.continuous_size == 3
assert specs.empty_action(5).shape == (5, 3)
assert specs.empty_action(5).dtype == np.float32
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).shape == (5, 1)
assert specs.empty_action(5).dtype == np.int32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
specs = ActionSpec(3, (3,))
assert specs.continuous_size == 3
assert specs.discrete_branches == (3,)
assert specs.discrete_size == 1
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
def test_action_generator():

zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).continuous
random_action = specs.random_action(4)
print(specs.random_action(4))
random_action = specs.random_action(4).continuous
print(random_action)
assert random_action.dtype == np.float32
assert random_action.shape == (4, action_len)
assert np.min(random_action) >= -1

action_shape = (10, 20, 30)
specs = ActionSpec.create_discrete(action_shape)
zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).discrete
random_action = specs.random_action(4)
random_action = specs.random_action(4).discrete
assert random_action.dtype == np.int32
assert random_action.shape == (4, len(action_shape))
assert np.min(random_action) >= 0

12
ml-agents/mlagents/trainers/agent_processor.py


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
from mlagents_envs.base_env import (
DecisionSteps,

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
action_dict = stored_take_action_outputs["action"]
action: Dict[str, np.ndarray] = {}
for act_type, act_array in action_dict.items():
action[act_type] = act_array[idx]
action_probs = stored_take_action_outputs["log_probs"][idx]
action_probs_dict = stored_take_action_outputs["log_probs"]
action_probs: Dict[str, np.ndarray] = {}
for prob_type, prob_array in action_probs_dict.items():
action_probs[prob_type] = prob_array[idx]
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

2
ml-agents/mlagents/trainers/buffer.py


class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
"""

9
ml-agents/mlagents/trainers/demo_loader.py


for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
if behavior_spec.action_spec.is_continuous():
demo_raw_buffer["continuous_action"].append(
current_pair_info.action_info.vector_actions
)
else:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(

22
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
import numpy as np
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

from mlagents_envs.logging_util import get_logger
from mlagents_envs.exception import UnityActionException
AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]]
AllGroupSpec = Dict[BehaviorName, BehaviorSpec]

step_info.environment_stats, step_info.worker_id
)
return len(step_infos)
@staticmethod
def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
if "continuous_action" in action_dict:
continuous = action_dict["continuous_action"]
if "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple(continuous, discrete)
else:
action_tuple = ActionTuple.create_continuous(continuous)
elif "discrete_action" in action_dict:
discrete = action_dict["discrete_action"]
action_tuple = ActionTuple.create_discrete(discrete)
else:
raise UnityActionException(
"The action dict must contain entries for either continuous_action or discrete_action."
)
return action_tuple
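A small sketch of the new EnvManager helper that converts the trainers' flat numpy-dict actions into an ActionTuple; the arrays below are placeholders:

import numpy as np
from mlagents.trainers.env_manager import EnvManager

action_dict = {
    "continuous_action": np.zeros((2, 3), dtype=np.float32),
    "discrete_action": np.zeros((2, 1), dtype=np.int32),
}
action_tuple = EnvManager.action_tuple_from_numpy_dict(action_dict)
assert action_tuple.continuous.shape == (2, 3)
assert action_tuple.discrete.shape == (2, 1)
# A dict with only one of the keys falls back to ActionTuple.create_continuous /
# create_discrete; an empty dict raises UnityActionException (see the diff above).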

4
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


[self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
)
prev_action = (
batch["actions"][-1] if not self.policy.use_continuous_act else None
batch["discrete_action"][-1]
if not self.policy.use_continuous_act
else None
)
else:
value_estimates = self.sess.run(self.value_heads, feed_dict)

23
ml-agents/mlagents/trainers/policy/policy.py


1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
# This line will be removed in the ActionBuffer change
self.num_branches = (
self.behavior_spec.action_spec.continuous_size
+ self.behavior_spec.action_spec.discrete_size
)
self.previous_action_dict: Dict[str, np.array] = {}
self.previous_action_dict: Dict[str, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
return np.zeros((num_agents, self.num_branches), dtype=np.int)
return np.zeros(
(num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32
)
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
self, agent_ids: List[str], action_dict: Dict[str, np.ndarray]
if action_matrix is None:
if action_dict is None or "discrete_action" not in action_dict:
self.previous_action_dict[agent_id] = action_matrix[index, :]
self.previous_action_dict[agent_id] = action_dict["discrete_action"][
index, :
]
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
action_matrix = self.make_empty_previous_action(len(agent_ids))
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
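A hedged shape check for the updated previous-action helpers in Policy; policy is assumed to be an existing instance whose ActionSpec has discrete_size == 2 (only discrete actions are tracked as previous actions now):

import numpy as np

empty = policy.make_empty_previous_action(3)
assert empty.shape == (3, 2) and empty.dtype == np.int32

# Agents without a stored entry fall back to the all-zeros previous action.
prev = policy.retrieve_previous_action(["unseen-agent"])
assert prev.shape == (1, 2) and prev.dtype == np.int32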

9
ml-agents/mlagents/trainers/policy/tf_policy.py


feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)

)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
if "action" in run_out:
if self.behavior_spec.action_spec.is_continuous():
run_out["action"] = {"continuous_action": run_out["action"]}
else:
run_out["action"] = {"discrete_action": run_out["action"]}
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),

48
ml-agents/mlagents/trainers/policy/torch_policy.py


SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
all_log_probs: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param vec_obs: List of vector observations.
:param vis_obs: List of visual observations.

:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
:return: Tuple of actions, log probabilities (dependent on all_log_probs), entropies, and
output memories, all as Torch Tensors.
"""

vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
actions = torch.stack(action_list, dim=-1)
if self.use_continuous_act:
actions = actions[:, :, 0]
else:
actions = actions[:, 0, :]
actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec, all_logs_list
)
return (
actions,
all_logs if all_log_probs else log_probs,
entropy_sum,
memories,
)
return (actions, log_probs, entropy_sum, memories)
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
action_list = actions.to_tensor_list()
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
action_list, dists
)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec
)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads

action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = ModelUtils.to_numpy(action)
run_out["pre_action"] = ModelUtils.to_numpy(action)
# Todo - make pre_action difference
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
action_dict = action.to_numpy_dict()
run_out["action"] = action_dict
run_out["pre_action"] = (
action_dict["continuous_action"] if self.use_continuous_act else None
)
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:
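For orientation, a hedged sketch of the run_out dictionary TorchPolicy.evaluate now produces for a purely continuous behavior; values and shapes are illustrative, and discrete behaviors use "discrete_action" / "discrete_log_probs" keys with pre_action set to None:

import numpy as np

n_agents, act_size = 4, 2
run_out = {
    "action": {"continuous_action": np.zeros((n_agents, act_size), dtype=np.float32)},
    "pre_action": np.zeros((n_agents, act_size), dtype=np.float32),
    "log_probs": {"continuous_log_probs": np.zeros((n_agents, act_size), dtype=np.float32)},
    "entropy": np.zeros((n_agents,), dtype=np.float32),
    "learning_rate": 0.0,
}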

9
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
feed_dict[self.policy.output] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.policy.output] = mini_batch["continuous_action"]
else:
feed_dict[self.policy.output] = mini_batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
if "vector_obs" in mini_batch:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]

12
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class TorchPPOOptimizer(TorchOptimizer):

advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks

log_probs,
ModelUtils.list_to_tensor(batch["action_probs"]),
old_log_probs,
loss_masks,
)
loss = (

6
ml-agents/mlagents/trainers/sac/optimizer_tf.py


feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
if self.policy.use_continuous_act:
feed_dict[self.policy_network.external_action_in] = batch["actions"]
feed_dict[self.policy_network.external_action_in] = batch[
"continuous_action"
]
feed_dict[policy.output] = batch["actions"]
feed_dict[policy.output] = batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[policy.prev_action] = batch["prev_action"]
feed_dict[policy.action_masks] = batch["action_mask"]

44
ml-agents/mlagents/trainers/sac/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.networks import ValueNetwork
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.timers import timed
from mlagents.trainers.exception import UnityTrainerException

def sac_value_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
values: Dict[str, torch.Tensor],
q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],

if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)

for name in values.keys():
with torch.no_grad():
v_backup = min_policy_qs[name] - torch.sum(
_ent_coef * log_probs, dim=1
_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

branched_per_action_ent = ModelUtils.break_into_branches(
log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
# We have to do entropy bonus per action branch
branched_ent_bonus = torch.stack(

def sac_policy_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,

if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
log_probs * action_probs, self.act_size
log_probs.all_discrete_tensor * action_probs, self.act_size
)
branched_q_term = ModelUtils.break_into_branches(
mean_q1 * action_probs, self.act_size

return policy_loss
def sac_entropy_loss(
self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1)
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)

log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
target_current_diff_branched = torch.stack(
[

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,
all_log_probs=not self.policy.use_continuous_act,
squeezed_actions = actions.squeeze(-1)
squeezed_actions = actions.continuous_tensor
sampled_actions,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,

memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions)
q2_stream = self._condense_q_streams(q2_out, actions)
q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(

3
ml-agents/mlagents/trainers/simple_env_manager.py


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
self.env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
self.env.set_actions(brain_name, _action)
self.env.step()
all_step_result = self._generate_all_results()

5
ml-agents/mlagents/trainers/subprocess_env_manager.py


all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)
_action = EnvManager.action_tuple_from_numpy_dict(
action_info.action
)
env.set_actions(brain_name, _action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from all the processes and the "main" process

21
ml-agents/mlagents/trainers/tests/mock_brain.py


steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
action_probs = {
"action_probs": np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
}
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

action = np.zeros(action_size, dtype=np.float32)
if action_spec.is_continuous():
action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
else:
action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

if action_spec.is_discrete()
else None
)
prev_action = np.ones(action_size, dtype=np.float32)
if action_spec.is_discrete():
prev_action = np.ones(action_size, dtype=np.int32)
else:
prev_action = np.ones(action_size, dtype=np.float32)
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"

32
ml-agents/mlagents/trainers/tests/simple_test_envs.py


from mlagents_envs.base_env import (
ActionSpec,
ActionTuple,
BaseEnv,
BehaviorSpec,
DecisionSteps,

else:
action_spec = ActionSpec.create_continuous(action_size)
self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
self.action_spec = action_spec
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}

def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
_act = self.action[name]
if self.action_spec.discrete_size > 0:
for _disc in _act.discrete[0]:
deltas.append(1 if _disc else -1)
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

def step(self) -> None:
super().step()
for name in self.names:
if self.discrete:
action = self.action[name].discrete
else:
action = self.action[name].continuous
self.step_result[name][0], self.step_result[name][1], self.action[name]
self.step_result[name][0], self.step_result[name][1], action
)
self.demonstration_protos[name] = self.demonstration_protos[name][
-self.n_demos :

for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
self.action[name] = ActionTuple(
np.array([], dtype=np.float32),
np.array(
[[1]] if self.goal[name] > 0 else [[0]], dtype=np.int32
),
)
self.action[name] = [[float(self.goal[name])]]
self.action[name] = ActionTuple(
np.array([[float(self.goal[name])]], dtype=np.float32),
np.array([], dtype=np.int32),
)
self.step()

2
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py


behavior_spec = basic_behavior_spec()
policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"action": {"continuous_action": np.array([1.0], dtype=np.float32)},
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

26
ml-agents/mlagents/trainers/tests/test_agent_processor.py


mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
return mock_policy

)
fake_action_outputs = {
"action": [0.1, 0.1],
"action": {"continuous_action": [0.1, 0.1]},
"log_probs": [0.1, 0.1],
"log_probs": {"continuous_log_probs": [0.1, 0.1]},
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

fake_action_info = ActionInfo(
action=[0.1, 0.1],
action={"continuous_action": [0.1, 0.1]},
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,

done=True,
)
fake_action_info = ActionInfo(
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], {"continuous_action": [0.1]})
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
)

)
fake_action_outputs = {
"action": [0.1],
"action": {"continuous_action": [0.1]},
"log_probs": [0.1],
"log_probs": {"continuous_log_probs": [0.1]},
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,

fake_action_info = ActionInfo(
action=[0.1],
action={"continuous_action": [0.1]},
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

10
ml-agents/mlagents/trainers/tests/test_demo_loader.py


assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_load_demo_dir():

assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_demo_mismatch():

2
ml-agents/mlagents/trainers/tests/test_trajectory.py


"masks",
"done",
"actions_pre",
"actions",
"continuous_action",
"action_probs",
"action_mask",
"prev_action",

9
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py


with torch.no_grad():
_, log_probs1, _, _ = policy1.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
vec_obs, vis_obs, masks=masks, memories=memories
np.testing.assert_array_equal(log_probs1, log_probs2)
np.testing.assert_array_equal(
log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

29
ml-agents/mlagents/trainers/tests/torch/test_policy.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
run_out["action"]["discrete_action"].shape == (
NUM_AGENTS,
len(DISCRETE_ACTION_SPACE),
)
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
assert run_out["action"]["continuous_action"].shape == (
NUM_AGENTS,
VECTOR_ACTION_SPACE,
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
if policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long)
agent_action = AgentAction.from_dict(buffer)
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])

vec_obs,
vis_obs,
masks=act_masks,
actions=actions,
actions=agent_action,
memories=memories,
seq_len=policy.sequence_length,
)

_size = policy.behavior_spec.action_spec.continuous_size
assert log_probs.shape == (64, _size)
assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)

masks=act_masks,
memories=memories,
seq_len=policy.sequence_length,
all_log_probs=not policy.use_continuous_act,
assert log_probs.shape == (
assert log_probs.all_discrete_tensor.shape == (
assert log_probs.shape == (64, policy.behavior_spec.action_spec.continuous_size)
assert log_probs.continuous_tensor.shape == (
64,
policy.behavior_spec.action_spec.continuous_size,
)
assert entropies.shape == (64,)
if rnn:

25
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
update_buffer["discrete_log_probs"] = np.ones_like(
update_buffer["discrete_action"]
)
else:
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

2
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py


for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["actions"][0])
target = torch.tensor(buffer["continuous_action"][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

11
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
]
action = behavior_spec.action_spec.random_action(1)[0, :]
action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if behavior_spec.action_spec.continuous_size > 0:
action["continuous_action"] = action_buffer.continuous
if behavior_spec.action_spec.discrete_size > 0:
action["discrete_action"] = action_buffer.discrete
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act[0, :])
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)

15
ml-agents/mlagents/trainers/tests/torch/test_utils.py


log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert log_probs.shape == (1, 2, 2)
for lp in log_probs:
assert lp.shape == (1, 2)
assert all_probs is None
assert all_probs == []
for log_prob in log_probs.flatten():
for log_prob in log_probs:
assert log_prob == pytest.approx(-0.919, abs=0.01)
for lp in log_prob.flatten():
assert lp == pytest.approx(-0.919, abs=0.01)
for ent in entropies.flatten():
# entropy of standard normal at 0

log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert all_probs.shape == (1, len(dist_list * act_size))
for all_prob in all_probs:
assert all_prob.shape == (1, act_size)
assert log_probs.flatten()[0] > log_probs.flatten()[1]
assert log_probs[0] > log_probs[1]
def test_masked_mean():

6
ml-agents/mlagents/trainers/tf/components/bc/module.py


self.policy.batch_size_ph: n_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
feed_dict[self.model.action_in_expert] = mini_batch_demo["discrete_action"]
feed_dict[self.policy.action_masks] = np.ones(
(
self.n_sequences * self.policy.sequence_length,

)
else:
feed_dict[self.model.action_in_expert] = mini_batch_demo[
"continuous_action"
]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
for i, _ in enumerate(self.policy.visual_in):

10
ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.policy.use_vec_obs:

feed_dict[self.model.next_visual_in[i]] = _next_obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

policy.mask_input: mini_batch["masks"],
}
if self.policy.use_continuous_act:
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]

17
ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.model.use_vail:

feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.done_policy_holder] = np.array(
mini_batch["done"]
).flatten()

if self.model.use_vail:
feed_dict[self.model.use_noise] = [1]
feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["continuous_action"]
)
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["discrete_action"]
)
if self.policy.use_vis_obs > 0:
for i in range(len(policy.visual_in)):

27
ml-agents/mlagents/trainers/torch/components/bc/module.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class BCModule:

update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
return update_stats
def _behavioral_cloning_loss(self, selected_actions, log_probs, expert_actions):
def _behavioral_cloning_loss(
self,
selected_actions: AgentAction,
log_probs: ActionLogProbs,
expert_actions: torch.Tensor,
) -> torch.Tensor:
bc_loss = torch.nn.functional.mse_loss(selected_actions, expert_actions)
bc_loss = torch.nn.functional.mse_loss(
selected_actions.continuous_tensor, expert_actions
)
log_probs, self.policy.act_size
log_probs.all_discrete_tensor,
self.policy.behavior_spec.action_spec.discrete_branches,
)
bc_loss = torch.mean(
torch.stack(

vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
expert_actions = ModelUtils.list_to_tensor(
mini_batch_demo["continuous_action"]
)
mini_batch_demo["actions"], dtype=torch.long
mini_batch_demo["discrete_action"], dtype=torch.long
)
expert_actions = ModelUtils.actions_to_onehot(
raw_expert_actions, self.policy.act_size

else:
vis_obs = []
selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
selected_actions, log_probs, _, _ = self.policy.sample_actions(
all_log_probs=True,
selected_actions, all_log_probs, expert_actions
selected_actions, log_probs, expert_actions
)
self.optimizer.zero_grad()
bc_loss.backward()

17
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import LinearEncoder, linear_layer
from mlagents.trainers.settings import NetworkSettings, EncoderType

Uses the current state embedding and the action of the mini_batch to predict
the next state embedding.
"""
actions = AgentAction.from_dict(mini_batch)
action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
action = actions.continuous_tensor
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._action_spec.discrete_branches,
actions.discrete_tensor, self._action_spec.discrete_branches
),
dim=1,
)

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
actions = AgentAction.from_dict(mini_batch)
sq_difference = (
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
- predicted_action
) ** 2
sq_difference = (actions.continuous_tensor - predicted_action) ** 2
sq_difference = torch.sum(sq_difference, dim=1)
return torch.mean(
ModelUtils.dynamic_partition(

else:
true_action = torch.cat(
ModelUtils.actions_to_onehot(
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._action_spec.discrete_branches,
actions.discrete_tensor, self._action_spec.discrete_branches
),
dim=1,
)

6
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


)
from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType

Creates the action Tensor. In continuous case, corresponds to the action. In
the discrete case, corresponds to the concatenation of one hot action Tensors.
"""
return self._action_flattener.forward(
torch.as_tensor(mini_batch["actions"], dtype=torch.float)
)
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
def get_state_inputs(
self, mini_batch: AgentBuffer

212
ml-agents/mlagents/trainers/torch/utils.py


from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, NamedTuple, Dict
from mlagents.torch_utils import torch, nn
import numpy as np

from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance
class AgentAction(NamedTuple):
"""
A NamedTuple containing the tensor for continuous actions and list of tensors for
discrete actions. Utility functions provide numpy <=> tensor conversions to be
sent as actions to the environment manager as well as used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to continuous actions
:param discrete_list: List of Torch tensors each corresponding to discrete actions
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
@property
def discrete_tensor(self):
"""
Returns the discrete action list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous action
and an entry corresponding to the discrete action. "continuous_action" and
"discrete_action" are added to the agent's buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_action"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_action"] = ModelUtils.to_numpy(
self.discrete_tensor[:, 0, :]
)
return array_dict
def to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the AgentAction as a flat List of torch Tensors. This will be removed
when the ActionModel is merged.
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list += (
self.discrete_list
) # Note this is different for ActionLogProbs
return tensor_list
@staticmethod
def create(
tensor_list: List[torch.Tensor], action_spec: ActionSpec
) -> "AgentAction":
"""
A static method that converts a list of torch Tensors into an AgentAction using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = tensor_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = tensor_list[_offset:]
return AgentAction(continuous, discrete)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff["discrete_action"], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return AgentAction(continuous, discrete)
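A hedged sketch of reconstructing an AgentAction from the flat buffer keys introduced in this change (torch and the ml-agents trainers package are assumed to be installed):

import numpy as np
from mlagents.trainers.torch.utils import AgentAction

buff = {
    "continuous_action": np.zeros((5, 2), dtype=np.float32),
    "discrete_action": np.zeros((5, 1), dtype=np.int32),
}
agent_action = AgentAction.from_dict(buff)
assert agent_action.continuous_tensor.shape == (5, 2)
assert agent_action.discrete_tensor.shape == (5, 1)   # discrete_list stacked on the last dim
assert len(agent_action.to_tensor_list()) == 2        # continuous tensor plus one discrete branch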
class ActionLogProbs(NamedTuple):
"""
A NamedTuple containing the tensor for continuous log probs and list of tensors for
discrete log probs of individual actions as well as all the log probs for an entire branch.
Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
:param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
:param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were
sampled.
:param all_discrete_list: List of Torch tensors, one per discrete branch, each holding
the log probs of every action in that branch, including actions that were not sampled.
"""
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
all_discrete_list: Optional[List[torch.Tensor]]
@property
def discrete_tensor(self):
"""
Returns the discrete log probs list as a stacked tensor
"""
return torch.stack(self.discrete_list, dim=-1)
@property
def all_discrete_tensor(self):
"""
Returns the discrete log probs of each branch as a tensor
"""
return torch.cat(self.all_discrete_list, dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
"""
Returns a Dict of np arrays with an entry corresponding to the continuous log probs
and an entry corresponding to the discrete log probs. "continuous_log_probs" and
"discrete_log_probs" are added to the agent's buffer individually to maintain a flat buffer.
"""
array_dict: Dict[str, np.ndarray] = {}
if self.continuous_tensor is not None:
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
return array_dict
def _to_tensor_list(self) -> List[torch.Tensor]:
"""
Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This
is private and serves as a utility for self.flatten()
"""
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(
self.discrete_tensor
) # Note this is different for AgentActions
return tensor_list
def flatten(self) -> torch.Tensor:
"""
A utility method that returns all log probs in ActionLogProbs as a flattened tensor.
This is useful for algorithms like PPO which can treat all log probs in the same way.
"""
return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def create(
log_prob_list: List[torch.Tensor],
action_spec: ActionSpec,
all_log_prob_list: List[torch.Tensor] = None,
) -> "ActionLogProbs":
"""
A static method that converts a list of torch Tensors into an ActionLogProbs using the ActionSpec.
This will change (and may be removed) in the ActionModel.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
_offset = 0
if action_spec.continuous_size > 0:
continuous = log_prob_list[0]
_offset = 1
if action_spec.discrete_size > 0:
discrete = log_prob_list[_offset:]
return ActionLogProbs(continuous, discrete, all_log_prob_list)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.
"""
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None # type: ignore
if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
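And similarly for ActionLogProbs, which PPO flattens so continuous and discrete log probs can be handled uniformly; a hedged sketch with placeholder arrays:

import numpy as np
from mlagents.trainers.torch.utils import ActionLogProbs

buff = {
    "continuous_log_probs": np.zeros((5, 2), dtype=np.float32),
    "discrete_log_probs": np.zeros((5, 1), dtype=np.float32),
}
log_probs = ActionLogProbs.from_dict(buff)
assert log_probs.continuous_tensor.shape == (5, 2)
assert log_probs.discrete_tensor.shape == (5, 1)
assert log_probs.flatten().shape == (5, 3)   # concatenated along dim 1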
class ModelUtils:
# Minimum supported side for each encoder type. If refactoring an encoder, please
# adjust these also.

else:
return sum(self._specs.discrete_branches)
def forward(self, action: torch.Tensor) -> torch.Tensor:
def forward(self, action: AgentAction) -> torch.Tensor:
return action
return action.continuous_tensor
torch.as_tensor(action, dtype=torch.long),
torch.as_tensor(action.discrete_tensor, dtype=torch.long),
self._specs.discrete_branches,
),
dim=1,

@staticmethod
def get_probs_and_entropy(
action_list: List[torch.Tensor], dists: List[DistInstance]
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> Tuple[List[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]:
log_probs_list = []
all_probs_list = []
entropies_list = []

entropies_list.append(action_dist.entropy())
if isinstance(action_dist, DiscreteDistInstance):
all_probs_list.append(action_dist.all_log_prob())
log_probs = torch.stack(log_probs_list, dim=-1)
log_probs = log_probs.squeeze(-1)
all_probs = None
else:
all_probs = torch.cat(all_probs_list, dim=-1)
return log_probs, entropies, all_probs
return log_probs_list, entropies, all_probs_list
@staticmethod
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:

25
ml-agents/mlagents/trainers/trajectory.py


from typing import List, NamedTuple
from typing import List, NamedTuple, Dict
import numpy as np
from mlagents.trainers.buffer import AgentBuffer

obs: List[np.ndarray]
reward: float
done: bool
action: np.ndarray
action_probs: np.ndarray
action: Dict[str, np.ndarray]
action_probs: Dict[str, np.ndarray]
action_pre: np.ndarray # TODO: Remove this
action_mask: np.ndarray
prev_action: np.ndarray

agent_buffer_trajectory["done"].append(exp.done)
# Add the outputs of the last eval
if exp.action_pre is not None:
actions_pre = exp.action_pre
agent_buffer_trajectory["actions_pre"].append(actions_pre)
agent_buffer_trajectory["actions_pre"].append(exp.action_pre)
# value is a dictionary from name of reward to value estimate of the value head
agent_buffer_trajectory["actions"].append(exp.action)
agent_buffer_trajectory["action_probs"].append(exp.action_probs)
# Adds the log prob and action of continuous/discrete separately
for act_type, act_array in exp.action.items():
agent_buffer_trajectory[act_type].append(act_array)
for log_type, log_array in exp.action_probs.items():
agent_buffer_trajectory[log_type].append(log_array)
# Store action masks if necessary. Note that 1 means active, while
# in AgentExperience False means active.

else:
# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
if "discrete_action" in exp.action:
action_shape = exp.action["discrete_action"].shape
else:
action_shape = exp.action["continuous_action"].shape
np.ones(exp.action_probs.shape, dtype=np.float32), padding_value=1
np.ones(action_shape, dtype=np.float32), padding_value=1
agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
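For context, a hedged sketch of the per-experience dictionaries a hybrid trajectory now carries and the flat buffer keys they land in; key names are as introduced in this diff, values are placeholders:

import numpy as np

exp_action = {
    "continuous_action": np.zeros(2, dtype=np.float32),
    "discrete_action": np.zeros(1, dtype=np.int32),
}
exp_action_probs = {
    "continuous_log_probs": np.zeros(2, dtype=np.float32),
    "discrete_log_probs": np.zeros(1, dtype=np.float32),
}
# Each entry is appended under its own AgentBuffer key, keeping the buffer flat:
#   "continuous_action", "discrete_action", "continuous_log_probs", "discrete_log_probs"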
