
Merge pull request #4734 from Unity-Technologies/develop-obs-as-list

Refactor trainers to use list of obs rather than vec and vis obs
/MLA-1734-demo-provider
GitHub, 4 years ago
Current commit
29d94c7c
24 files changed: 316 insertions and 462 deletions. In the excerpts below, lines prefixed with - were removed and lines prefixed with + were added.
  1. com.unity.ml-agents/CHANGELOG.md (1 changed line)
  2. ml-agents/mlagents/trainers/demo_loader.py (9 changed lines)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (31 changed lines)
  4. ml-agents/mlagents/trainers/policy/policy.py (3 changed lines)
  5. ml-agents/mlagents/trainers/policy/torch_policy.py (41 changed lines)
  6. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (20 changed lines)
  7. ml-agents/mlagents/trainers/ppo/trainer.py (2 changed lines)
  8. ml-agents/mlagents/trainers/sac/optimizer_torch.py (51 changed lines)
  9. ml-agents/mlagents/trainers/sac/trainer.py (14 changed lines)
  10. ml-agents/mlagents/trainers/tests/test_trajectory.py (31 changed lines)
  11. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (11 changed lines)
  12. ml-agents/mlagents/trainers/tests/torch/test_encoders.py (2 changed lines)
  13. ml-agents/mlagents/trainers/tests/torch/test_networks.py (23 changed lines)
  14. ml-agents/mlagents/trainers/tests/torch/test_policy.py (24 changed lines)
  15. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (20 changed lines)
  16. ml-agents/mlagents/trainers/tests/torch/test_utils.py (18 changed lines)
  17. ml-agents/mlagents/trainers/torch/components/bc/module.py (26 changed lines)
  18. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (39 changed lines)
  19. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (72 changed lines)
  20. ml-agents/mlagents/trainers/torch/components/reward_providers/rnd_reward_provider.py (21 changed lines)
  21. ml-agents/mlagents/trainers/torch/encoders.py (9 changed lines)
  22. ml-agents/mlagents/trainers/torch/networks.py (144 changed lines)
  23. ml-agents/mlagents/trainers/torch/utils.py (70 changed lines)
  24. ml-agents/mlagents/trainers/trajectory.py (96 changed lines)

com.unity.ml-agents/CHANGELOG.md (1 changed line)


#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- PyTorch trainers now support training agents with both continuous and discrete action spaces. (#4702)
The `.onnx` models generated by the trainers of this release are incompatible with versions of Barracuda before `1.2.1-preview`. If you upgrade the trainers, you must upgrade the version of the Barracuda package as well (which can be done by upgrading the `com.unity.ml-agents` package).
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
- Agents with both continuous and discrete actions are now supported. You can specify

ml-agents/mlagents/trainers/demo_loader.py (9 changed lines)


from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
- from mlagents.trainers.trajectory import SplitObservations
+ from mlagents.trainers.trajectory import ObsUtil
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
- split_obs = SplitObservations.from_observations(current_obs)
- for i, obs in enumerate(split_obs.visual_observations):
-     demo_raw_buffer["visual_obs%d" % i].append(obs)
- demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
+ for i, obs in enumerate(current_obs):
+     demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
if (
len(current_pair_info.action_info.continuous_actions) == 0
and len(current_pair_info.action_info.discrete_actions) == 0

demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.discrete_actions
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(
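Note: throughout this PR the SplitObservations vec/vis split gives way to flat, index-based buffer keys provided by ObsUtil. A minimal sketch of that helper, inferred from its call sites in this diff (the real implementation lives in ml-agents/mlagents/trainers/trajectory.py, which this PR also changes):

    from typing import List

    import numpy as np


    class ObsUtil:
        # Hypothetical sketch inferred from this diff; see trajectory.py
        # for the actual implementation.

        @staticmethod
        def get_name_at(index: int) -> str:
            # Buffer key of the index-th observation for the current step.
            return f"obs_{index}"

        @staticmethod
        def get_name_at_next(index: int) -> str:
            # Buffer key of the index-th observation for the next step.
            return f"next_obs_{index}"

        @staticmethod
        def from_buffer(batch, num_obs: int) -> List[np.ndarray]:
            # Gather every current-step observation into one flat list,
            # one entry per observation, vector or visual alike.
            return [batch[ObsUtil.get_name_at(i)] for i in range(num_obs)]

        @staticmethod
        def from_buffer_next(batch, num_obs: int) -> List[np.ndarray]:
            return [batch[ObsUtil.get_name_at_next(i)] for i in range(num_obs)]

The obs_0/next_obs_0 key names match the buffer keys asserted in the updated test_trajectory.py below.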

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (31 changed lines)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
- from mlagents.trainers.trajectory import SplitObservations
+ from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
current_obs, memory, sequence_length=batch.num_experiences
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():
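A condensed, standalone view of the new flow above; the helper name is illustrative, and unsqueeze(0) gives the single bootstrap observation a batch dimension of 1:

    from typing import List, Tuple

    import numpy as np
    from mlagents.torch_utils import torch
    from mlagents.trainers.buffer import AgentBuffer
    from mlagents.trainers.torch.utils import ModelUtils
    from mlagents.trainers.trajectory import ObsUtil


    def obs_for_value_estimates(
        batch: AgentBuffer, next_obs: List[np.ndarray], n_obs: int
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        # Batched current observations come straight out of the buffer.
        current = [ModelUtils.list_to_tensor(o) for o in ObsUtil.from_buffer(batch, n_obs)]
        # The bootstrap observation is a single step, so give it batch size 1.
        nxt = [ModelUtils.list_to_tensor(o).unsqueeze(0) for o in next_obs]
        return current, nxt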

ml-agents/mlagents/trainers/policy/policy.py (3 changed lines)


from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
+ from mlagents.trainers.buffer import AgentBuffer
class UnityPolicyException(UnityException):

raise RuntimeError("Continuous NaN action detected.")
@abstractmethod
- def update_normalization(self, vector_obs: np.ndarray) -> None:
+ def update_normalization(self, buffer: AgentBuffer) -> None:
pass
@abstractmethod

ml-agents/mlagents/trainers/policy/torch_policy.py (41 changed lines)


from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
- from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,

from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

"""
return self._export_m_size
- def _split_decision_step(
-     self, decision_requests: DecisionSteps
- ) -> Tuple[SplitObservations, np.ndarray]:
-     vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
+ def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray:
mask = None
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

)
- return vec_vis_obs, mask
+ return mask
- def update_normalization(self, vector_obs: np.ndarray) -> None:
+ def update_normalization(self, buffer: AgentBuffer) -> None:
- :param vector_obs: The vector observations to add to the running estimate of the distribution.
+ :param buffer: The buffer with the observations to add to the running estimate
+ of the distribution.
- vector_obs = [torch.as_tensor(vector_obs)]
- self.actor_critic.update_normalization(vector_obs)
+ self.actor_critic.update_normalization(buffer)
- vec_obs: List[torch.Tensor],
- vis_obs: List[torch.Tensor],
+ obs: List[torch.Tensor],
- :param vec_obs: List of vector observations.
- :param vis_obs: List of visual observations.
+ :param obs: List of observations.
:param masks: Loss masks for RNN, else None.
:param memories: Input memories when using RNN, else None.
:param seq_len: Sequence length when using RNN.

- vec_obs, vis_obs, masks, memories, seq_len
+ obs, masks, memories, seq_len
- vec_obs: List[torch.Tensor],
- vis_obs: List[torch.Tensor],
+ obs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

- vec_obs, vis_obs, actions, masks, memories, seq_len
+ obs, actions, masks, memories, seq_len
)
return log_probs, entropies, value_heads

:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
- vec_vis_obs, masks = self._split_decision_step(decision_requests)
- vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
- vis_obs = [
-     torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
- ]
+ obs = decision_requests.obs
+ masks = self._extract_masks(decision_requests)
+ tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs]
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)

action, log_probs, entropy, memories = self.sample_actions(
- vec_obs, vis_obs, masks=masks, memories=memories
+ tensor_obs, masks=masks, memories=memories
)
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (20 changed lines)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
- if self.policy.use_vis_obs:
-     vis_obs = []
-     for idx, _ in enumerate(
-         self.policy.actor_critic.network_body.visual_processors
-     ):
-         vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
-         vis_obs.append(vis_ob)
- else:
-     vis_obs = []
- vec_obs,
- vis_obs,
+ current_obs,
masks=act_masks,
actions=actions,
memories=memories,

ml-agents/mlagents/trainers/ppo/trainer.py (2 changed lines)


agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

ml-agents/mlagents/trainers/sac/optimizer_torch.py (51 changed lines)


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack
+ from mlagents.trainers.trajectory import ObsUtil
EPSILON = 1e-6 # Small value to avoid divide by zero

def forward(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
- :param vec_inputs: List of vector observation tensors.
- :param vis_input: List of visual observation tensors.
+ :param inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.

if not q1_grad:
stack.enter_context(torch.no_grad())
q1_out, _ = self.q1_network(
- vec_inputs,
- vis_inputs,
+ inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

stack.enter_context(torch.no_grad())
q2_out, _ = self.q2_network(
- vec_inputs,
- vis_inputs,
+ inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
next_obs = ObsUtil.from_buffer_next(batch, n_obs)
# Convert to tensors
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)

torch.zeros_like(next_memories) if next_memories is not None else None
)
- vis_obs: List[torch.Tensor] = []
- next_vis_obs: List[torch.Tensor] = []
- if self.policy.use_vis_obs:
-     vis_obs = []
-     for idx, _ in enumerate(
-         self.policy.actor_critic.network_body.visual_processors
-     ):
-         vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
-         vis_obs.append(vis_ob)
-         next_vis_ob = ModelUtils.list_to_tensor(
-             batch["next_visual_obs%d" % idx]
-         )
-         next_vis_obs.append(next_vis_ob)
# Copy normalizers from policy
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body

value_estimates,
_,
) = self.policy.actor_critic.get_action_stats_and_value(
- vec_obs,
- vis_obs,
+ current_obs,
masks=act_masks,
memories=memories,
sequence_length=self.policy.sequence_length,

- vec_obs,
- vis_obs,
+ current_obs,
cont_sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

- vec_obs,
- vis_obs,
+ current_obs,
cont_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

with torch.no_grad():
target_values, _ = self.target_network(
- next_vec_obs,
- next_vis_obs,
+ next_obs,
memories=next_memories,
sequence_length=self.policy.sequence_length,
)
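Aside from the observation-list change, the forward pass above keeps its q1_grad/q2_grad switches, implemented with contextlib.ExitStack (imported at the top of this file). A standalone illustration of that pattern:

    from contextlib import ExitStack

    import torch


    def forward_maybe_no_grad(net, x, grad: bool):
        # Enter torch.no_grad() only when gradients are unwanted; ExitStack
        # makes the context optional without duplicating the forward call.
        with ExitStack() as stack:
            if not grad:
                stack.enter_context(torch.no_grad())
            return net(x)


    net = torch.nn.Linear(4, 1)
    out = forward_maybe_no_grad(net, torch.ones(1, 4), grad=False)
    assert not out.requires_grad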

ml-agents/mlagents/trainers/sac/trainer.py (14 changed lines)


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
- from mlagents.trainers.trajectory import Trajectory, SplitObservations
+ from mlagents.trainers.trajectory import Trajectory, ObsUtil
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, SACSettings

# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(agent_buffer_trajectory)
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.interrupted:
- vec_vis_obs = SplitObservations.from_observations(last_step.obs)
- for i, obs in enumerate(vec_vis_obs.visual_observations):
-     agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
- if vec_vis_obs.vector_observations.size > 1:
-     agent_buffer_trajectory["next_vector_in"][
-         -1
-     ] = vec_vis_obs.vector_observations
+ last_step_obs = last_step.obs
+ for i, obs in enumerate(last_step_obs):
+     agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer
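Why the hunk above clears done: an episode cut off at max step is not a true terminal state, so the trainer should still bootstrap a value from the final observation. Overwriting the last next-obs entries with a duplicate of that observation and dropping the done flag achieves this; condensed, with names as in the diff:

    from mlagents.trainers.trajectory import ObsUtil


    def patch_interrupted_episode(agent_buffer_trajectory, last_step_obs):
        # Duplicate the final observation into every next-obs slot of the
        # last experience, then mark it as not done so the value estimate
        # bootstraps from it instead of treating it as terminal.
        for i, obs in enumerate(last_step_obs):
            agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
        agent_buffer_trajectory["done"][-1] = False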

ml-agents/mlagents/trainers/tests/test_trajectory.py (31 changed lines)


import numpy as np
import pytest
- from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents_envs.base_env import ActionSpec

@pytest.mark.parametrize("num_visual_obs", [0, 1, 2])
@pytest.mark.parametrize("num_vec_obs", [0, 1])
def test_split_obs(num_visual_obs, num_vec_obs):
obs = []
for _ in range(num_visual_obs):
obs.append(np.ones((84, 84, 3), dtype=np.float32))
for _ in range(num_vec_obs):
obs.append(np.ones(VEC_OBS_SIZE, dtype=np.float32))
split_observations = SplitObservations.from_observations(obs)
if num_vec_obs == 1:
assert len(split_observations.vector_observations) == VEC_OBS_SIZE
else:
assert len(split_observations.vector_observations) == 0
# Assert the number of vector observations.
assert len(split_observations.visual_observations) == num_visual_obs
"next_visual_obs0",
"visual_obs0",
"vector_obs",
"next_vector_in",
"next_obs_0",
"next_obs_1",
"obs_0",
"obs_1",
"memory",
"masks",
"done",

ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (11 changed lines)


from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.torch.test_policy import create_policy_mock
+ from mlagents.trainers.torch.utils import ModelUtils
def test_register(tmp_path):

decision_step, _ = mb.create_steps_from_behavior_spec(
policy1.behavior_spec, num_agents=1
)
- vec_vis_obs, masks = policy1._split_decision_step(decision_step)
- vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
- vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations]
+ np_obs = decision_step.obs
+ masks = policy1._extract_masks(decision_step)
+ tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
- vec_obs, vis_obs, masks=masks, memories=memories
+ tensor_obs, masks=masks, memories=memories
- vec_obs, vis_obs, masks=masks, memories=memories
+ tensor_obs, masks=masks, memories=memories
)
np.testing.assert_array_equal(
log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor

ml-agents/mlagents/trainers/tests/torch/test_encoders.py (2 changed lines)


num_outputs = 128
enc = vis_class(image_size[0], image_size[1], image_size[2], num_outputs)
- # Note: NCHW not NHWC
- sample_input = torch.ones((1, image_size[2], image_size[0], image_size[1]))
+ sample_input = torch.ones((1, image_size[0], image_size[1], image_size[2]))
encoding = enc(sample_input)
assert encoding.shape == (1, num_outputs)

ml-agents/mlagents/trainers/tests/torch/test_networks.py (23 changed lines)


)
from mlagents.trainers.settings import NetworkSettings
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.tests.torch.test_encoders import compare_models
def test_networkbody_vector():

sample_act = 0.1 * torch.ones((1, 2))
for _ in range(300):
- encoded, _ = networkbody([sample_obs], [], sample_act)
+ encoded, _ = networkbody([sample_obs], sample_act)
assert encoded.shape == (1, network_settings.hidden_units)
# Try to force output to 1
loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))

sample_obs = torch.ones((1, seq_len, obs_size))
for _ in range(200):
- encoded, _ = networkbody([sample_obs], [], memories=torch.ones(1, seq_len, 12))
+ encoded, _ = networkbody([sample_obs], memories=torch.ones(1, seq_len, 12))
# Try to force output to 1
loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
optimizer.zero_grad()

optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
sample_obs = 0.1 * torch.ones((1, 84, 84, 3))
sample_vec_obs = torch.ones((1, vec_obs_size))
+ obs = [sample_vec_obs] + [sample_obs]
- encoded, _ = networkbody([sample_vec_obs], [sample_obs])
+ encoded, _ = networkbody(obs)
assert encoded.shape == (1, network_settings.hidden_units)
# Try to force output to 1
loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))

for _ in range(50):
sample_obs = torch.ones((1, obs_size))
- values, _ = value_net([sample_obs], [])
+ values, _ = value_net([sample_obs])
loss = 0
for s_name in stream_names:
assert values[s_name].shape == (1, num_outputs)

# memories isn't always set to None, the network should be able to
# deal with that.
# Test critic pass
- value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
+ value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)

# Test get action stats and_value
action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
- [sample_obs], [], memories=memories, masks=mask
+ [sample_obs], memories=memories, masks=mask
)
if lstm:
assert action.continuous_tensor.shape == (64, 2)

assert value_out[stream].shape == (network_settings.memory.sequence_length,)
else:
assert value_out[stream].shape == (1,)
# Test normalization
actor.update_normalization(sample_obs)
if isinstance(actor, SeparateActorCritic):
for act_proc, crit_proc in zip(
actor.network_body.vector_processors,
actor.critic.network_body.vector_processors,
):
assert compare_models(act_proc, crit_proc)

ml-agents/mlagents/trainers/tests/torch/test_policy.py (24 changed lines)


from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.agent_action import AgentAction
VECTOR_ACTION_SPACE = 2

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
vis_obs.append(vis_ob)
np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_shapes))
tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
memories = [
ModelUtils.list_to_tensor(buffer["memory"][i])

memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = policy.evaluate_actions(
- vec_obs,
- vis_obs,
+ tensor_obs,
masks=act_masks,
actions=agent_action,
memories=memories,

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
vis_obs.append(vis_ob)
np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_shapes))
tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
memories = [
ModelUtils.list_to_tensor(buffer["memory"][i])

memories = torch.stack(memories).unsqueeze(0)
(sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
- vec_obs,
- vis_obs,
- masks=act_masks,
- memories=memories,
- seq_len=policy.sequence_length,
+ tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
)
if discrete:
assert log_probs.all_discrete_tensor.shape == (

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (20 changed lines)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.base_env import BehaviorSpec
- from mlagents.trainers.trajectory import SplitObservations
+ from mlagents.trainers.trajectory import ObsUtil
def create_agent_buffer(

- curr_observations = [
+ curr_obs = [
- next_observations = [
+ next_obs = [
np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
]

action["discrete_action"] = action_buffer.discrete
for _ in range(number):
- curr_split_obs = SplitObservations.from_observations(curr_observations)
- next_split_obs = SplitObservations.from_observations(next_observations)
- for i, _ in enumerate(curr_split_obs.visual_observations):
-     buffer["visual_obs%d" % i].append(curr_split_obs.visual_observations[i])
-     buffer["next_visual_obs%d" % i].append(
-         next_split_obs.visual_observations[i]
-     )
- buffer["vector_obs"].append(curr_split_obs.vector_observations)
- buffer["next_vector_in"].append(next_split_obs.vector_observations)
+ for i, obs in enumerate(curr_obs):
+     buffer[ObsUtil.get_name_at(i)].append(obs)
+ for i, obs in enumerate(next_obs):
+     buffer[ObsUtil.get_name_at_next(i)].append(obs)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act[0, :])
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)

ml-agents/mlagents/trainers/tests/torch/test_utils.py (18 changed lines)


for encoder_type in EncoderType:
good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
- vis_input = torch.ones((1, 3, good_size, good_size))
+ vis_input = torch.ones((1, good_size, good_size, 3))
ModelUtils._check_resolution_for_encoder(good_size, good_size, encoder_type)
enc_func = ModelUtils.get_encoder_for_type(encoder_type)
enc = enc_func(good_size, good_size, 3, 1)

with pytest.raises(Exception):
bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
- vis_input = torch.ones((1, 3, bad_size, bad_size))
+ vis_input = torch.ones((1, bad_size, bad_size, 3))
with pytest.raises(UnityTrainerException):
# Make sure we'd hit a friendly error during model setup time.

for _ in range(num_visual):
obs_shapes.append(vis_obs_shape)
h_size = 128
- vis_enc, vec_enc, total_output = ModelUtils.create_input_processors(
+ encoders, embedding_sizes = ModelUtils.create_input_processors(
- vec_enc = list(vec_enc)
- vis_enc = list(vis_enc)
- assert len(vec_enc) == (1 if num_vector >= 1 else 0)
+ total_output = sum(embedding_sizes)
+ vec_enc = []
+ vis_enc = []
+ for i, enc in enumerate(encoders):
+     if len(obs_shapes[i]) == 1:
+         vec_enc.append(enc)
+     else:
+         vis_enc.append(enc)
+ assert len(vec_enc) == num_vector
assert len(vis_enc) == num_visual
assert total_output == int(num_visual * h_size + vec_obs_shape[0] * num_vector)
if num_vector > 0:
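Per the rewritten test, ModelUtils.create_input_processors now returns a flat (encoders, embedding_sizes) pair with one entry per observation, in observation order, instead of separate visual/vector tuples. A hedged usage sketch (the positional arguments are assumptions; the diff only shows the return shape):

    from mlagents.trainers.settings import EncoderType
    from mlagents.trainers.torch.encoders import VectorInput
    from mlagents.trainers.torch.utils import ModelUtils

    obs_shapes = [(8,), (84, 84, 3)]  # sensor order is preserved
    encoders, embedding_sizes = ModelUtils.create_input_processors(
        obs_shapes, 128, EncoderType.SIMPLE, normalize=False
    )
    # Vector inputs keep their own width; visual encoders embed to h_size,
    # so the LinearEncoder's input is simply the sum of embedding sizes.
    total_output = sum(embedding_sizes)
    vec_enc = [enc for enc in encoders if isinstance(enc, VectorInput)]
    assert len(vec_enc) == 1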

ml-agents/mlagents/trainers/torch/components/bc/module.py (26 changed lines)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.trajectory import ObsUtil
+ from mlagents.trainers.buffer import AgentBuffer
class BCModule:

_, self.demonstration_buffer = demo_to_buffer(
settings.demo_path, policy.sequence_length, policy.behavior_spec
)
self.batch_size = (
settings.batch_size if settings.batch_size else default_batch_size
)

return bc_loss
def _update_batch(
- self, mini_batch_demo: Dict[str, np.ndarray], n_sequences: int
+ self, mini_batch_demo: AgentBuffer, n_sequences: int
- vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
+ np_obs = ObsUtil.from_buffer(
+     mini_batch_demo, len(self.policy.behavior_spec.observation_shapes)
+ )
+ # Convert to tensors
+ tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
act_masks = None
expert_actions = AgentAction.from_dict(mini_batch_demo)
if self.policy.behavior_spec.action_spec.discrete_size > 0:

if self.policy.use_recurrent:
memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
- if self.policy.use_vis_obs:
-     vis_obs = []
-     for idx, _ in enumerate(
-         self.policy.actor_critic.network_body.visual_processors
-     ):
-         vis_ob = ModelUtils.list_to_tensor(
-             mini_batch_demo["visual_obs%d" % idx]
-         )
-         vis_obs.append(vis_ob)
- else:
-     vis_obs = []
- vec_obs,
- vis_obs,
+ tensor_obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

39
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import LinearEncoder, linear_layer
from mlagents.trainers.settings import NetworkSettings, EncoderType
+ from mlagents.trainers.trajectory import ObsUtil
class ActionPredictionTuple(NamedTuple):

"""
Extracts the current state embedding from a mini_batch.
"""
- n_vis = len(self._state_encoder.visual_processors)
- hidden, _ = self._state_encoder.forward(
-     vec_inputs=[
-         ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)
-     ],
-     vis_inputs=[
-         ModelUtils.list_to_tensor(
-             mini_batch["visual_obs%d" % i], dtype=torch.float
-         )
-         for i in range(n_vis)
-     ],
- )
+ n_obs = len(self._state_encoder.processors)
+ np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
+ # Convert to tensors
+ tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
+ hidden, _ = self._state_encoder.forward(tensor_obs)
return hidden
def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:

- n_vis = len(self._state_encoder.visual_processors)
- hidden, _ = self._state_encoder.forward(
-     vec_inputs=[
-         ModelUtils.list_to_tensor(
-             mini_batch["next_vector_in"], dtype=torch.float
-         )
-     ],
-     vis_inputs=[
-         ModelUtils.list_to_tensor(
-             mini_batch["next_visual_obs%d" % i], dtype=torch.float
-         )
-         for i in range(n_vis)
-     ],
- )
+ n_obs = len(self._state_encoder.processors)
+ np_obs = ObsUtil.from_buffer_next(mini_batch, n_obs)
+ # Convert to tensors
+ tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
+ hidden, _ = self._state_encoder.forward(tensor_obs)
return hidden
def predict_action(self, mini_batch: AgentBuffer) -> ActionPredictionTuple:

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (72 changed lines)


- from typing import Optional, Dict, List, Tuple
+ from typing import Optional, Dict, List
import numpy as np
from mlagents.torch_utils import torch, default_device

from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType
from mlagents.trainers.demo_loader import demo_to_buffer
+ from mlagents.trainers.trajectory import ObsUtil
class GAILRewardProvider(BaseRewardProvider):

"""
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
- def get_state_inputs(
-     self, mini_batch: AgentBuffer
- ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
-     n_vis = len(self.encoder.visual_processors)
-     n_vec = len(self.encoder.vector_processors)
-     vec_inputs = (
-         [ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)]
-         if n_vec > 0
-         else []
-     )
-     vis_inputs = [
-         ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
-         for i in range(n_vis)
-     ]
-     return vec_inputs, vis_inputs
+ def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
+     n_obs = len(self.encoder.processors)
+     np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
+     # Convert to tensors
+     tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
+     return tensor_obs
def compute_estimate(
self, mini_batch: AgentBuffer, use_vail_noise: bool = False

:param use_vail_noise: Only when using VAIL : If true, will sample the code, if
false, will return the mean of the code.
"""
- vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
+ inputs = self.get_state_inputs(mini_batch)
- hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
+ hidden, _ = self.encoder(inputs, action_inputs)
- hidden, _ = self.encoder(vec_inputs, vis_inputs)
+ hidden, _ = self.encoder(inputs)
z_mu: Optional[torch.Tensor] = None
if self._settings.use_vail:
z_mu = self._z_mu_layer(hidden)

Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp.
for off-policy. Compute gradients w.r.t randomly interpolated input.
"""
- policy_vec_inputs, policy_vis_inputs = self.get_state_inputs(policy_batch)
- expert_vec_inputs, expert_vis_inputs = self.get_state_inputs(expert_batch)
- interp_vec_inputs = []
- for policy_vec_input, expert_vec_input in zip(
-     policy_vec_inputs, expert_vec_inputs
- ):
-     obs_epsilon = torch.rand(policy_vec_input.shape)
-     interp_vec_input = (
-         obs_epsilon * policy_vec_input + (1 - obs_epsilon) * expert_vec_input
-     )
-     interp_vec_input.requires_grad = True  # For gradient calculation
-     interp_vec_inputs.append(interp_vec_input)
- interp_vis_inputs = []
- for policy_vis_input, expert_vis_input in zip(
-     policy_vis_inputs, expert_vis_inputs
- ):
-     obs_epsilon = torch.rand(policy_vis_input.shape)
-     interp_vis_input = (
-         obs_epsilon * policy_vis_input + (1 - obs_epsilon) * expert_vis_input
-     )
-     interp_vis_input.requires_grad = True  # For gradient calculation
-     interp_vis_inputs.append(interp_vis_input)
+ policy_inputs = self.get_state_inputs(policy_batch)
+ expert_inputs = self.get_state_inputs(expert_batch)
+ interp_inputs = []
+ for policy_input, expert_input in zip(policy_inputs, expert_inputs):
+     obs_epsilon = torch.rand(policy_input.shape)
+     interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
+     interp_input.requires_grad = True  # For gradient calculation
+     interp_inputs.append(interp_input)
if self._settings.use_actions:
policy_action = self.get_action_input(policy_batch)
expert_action = self.get_action_input(expert_batch)

dim=1,
)
action_inputs.requires_grad = True
- hidden, _ = self.encoder(
-     interp_vec_inputs, interp_vis_inputs, action_inputs
- )
- encoder_input = tuple(
-     interp_vec_inputs + interp_vis_inputs + [action_inputs]
- )
+ hidden, _ = self.encoder(interp_inputs, action_inputs)
+ encoder_input = tuple(interp_inputs + [action_inputs])
- hidden, _ = self.encoder(interp_vec_inputs, interp_vis_inputs)
- encoder_input = tuple(interp_vec_inputs + interp_vis_inputs)
+ hidden, _ = self.encoder(interp_inputs)
+ encoder_input = tuple(interp_inputs)
if self._settings.use_vail:
use_vail_noise = True
z_mu = self._z_mu_layer(hidden)
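The hunk stops where the discriminator output would feed the penalty; for context, a generic WGAN-GP style completion over the interpolated encoder_input built above (standard form, not the verbatim ML-Agents code):

    import torch


    def gradient_penalty(estimate: torch.Tensor, encoder_input: tuple) -> torch.Tensor:
        # Penalize the deviation of the input-gradient norm from 1,
        # measured at the interpolated policy/expert inputs.
        grads = torch.autograd.grad(estimate.sum(), encoder_input, create_graph=True)
        grad_norm = torch.cat([g.flatten(start_dim=1) for g in grads], dim=1).norm(2, dim=1)
        return ((grad_norm - 1.0) ** 2).mean()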

ml-agents/mlagents/trainers/torch/components/reward_providers/rnd_reward_provider.py (21 changed lines)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.settings import NetworkSettings, EncoderType
+ from mlagents.trainers.trajectory import ObsUtil
class RNDRewardProvider(BaseRewardProvider):

self._encoder = NetworkBody(specs.observation_shapes, state_encoder_settings)
def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
- n_vis = len(self._encoder.visual_processors)
- hidden, _ = self._encoder.forward(
-     vec_inputs=[
-         ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)
-     ],
-     vis_inputs=[
-         ModelUtils.list_to_tensor(
-             mini_batch["visual_obs%d" % i], dtype=torch.float
-         )
-         for i in range(n_vis)
-     ],
- )
- self._encoder.update_normalization(torch.tensor(mini_batch["vector_obs"]))
+ n_obs = len(self._encoder.processors)
+ np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
+ # Convert to tensors
+ tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
+ hidden, _ = self._encoder.forward(tensor_obs)
+ self._encoder.update_normalization(mini_batch)
return hidden

ml-agents/mlagents/trainers/torch/encoders.py (9 changed lines)


from mlagents.trainers.torch.layers import linear_layer, Initialization, Swish
from mlagents.torch_utils import torch, nn
+ from mlagents.trainers.torch.model_serialization import exporting_to_onnx
class Normalizer(nn.Module):

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
+ if not exporting_to_onnx.is_exporting():
+     visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = torch.reshape(hidden, (-1, self.final_flat))
return self.dense(hidden)

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
+ if not exporting_to_onnx.is_exporting():
+     visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = torch.reshape(hidden, (-1, self.final_flat))
return self.dense(hidden)

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
+ if not exporting_to_onnx.is_exporting():
+     visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = hidden.view([-1, self.final_flat])
return self.dense(hidden)

self.sequential = nn.Sequential(*layers)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
+ if not exporting_to_onnx.is_exporting():
+     visual_obs = visual_obs.permute([0, 3, 1, 2])
batch_size = visual_obs.shape[0]
hidden = self.sequential(visual_obs)
before_out = hidden.view(batch_size, -1)
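Context for the repeated guard above: buffers and Unity sensors deliver visual observations channel-last (NHWC), while torch convolutions want channel-first (NCHW). Each visual encoder now permutes internally at train/inference time and skips the permute during ONNX export, where the serialization path supplies the layout the exported graph should keep. A standalone illustration of the layout difference:

    import torch

    nhwc = torch.ones((1, 84, 84, 3))  # batch, height, width, channels: as buffered
    nchw = nhwc.permute([0, 3, 1, 2])  # batch, channels, height, width: for conv2d
    assert nchw.shape == (1, 3, 84, 84)

    conv = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=8, stride=4)
    out = conv(nchw)  # conv2d would reject the NHWC tensor (84 != 3 channels)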

ml-agents/mlagents/trainers/torch/networks.py (144 changed lines)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.layers import LSTM, LinearEncoder
from mlagents.trainers.torch.model_serialization import exporting_to_onnx
+ from mlagents.trainers.torch.encoders import VectorInput
+ from mlagents.trainers.buffer import AgentBuffer
+ from mlagents.trainers.trajectory import ObsUtil
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
EncoderFunction = Callable[

else 0
)
- (
-     self.visual_processors,
-     self.vector_processors,
-     encoder_input_size,
- ) = ModelUtils.create_input_processors(
+ self.processors, self.embedding_sizes = ModelUtils.create_input_processors(
- total_enc_size = encoder_input_size + encoded_act_size
+ total_enc_size = sum(self.embedding_sizes) + encoded_act_size
self.linear_encoder = LinearEncoder(
total_enc_size, network_settings.num_layers, self.h_size
)

else:
self.lstm = None # type: ignore
- def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None:
-     for vec_input, vec_enc in zip(vec_inputs, self.vector_processors):
-         vec_enc.update_normalization(vec_input)
+ def update_normalization(self, buffer: AgentBuffer) -> None:
+     obs = ObsUtil.from_buffer(buffer, len(self.processors))
+     for vec_input, enc in zip(obs, self.processors):
+         if isinstance(enc, VectorInput):
+             enc.update_normalization(torch.as_tensor(vec_input))
- for n1, n2 in zip(self.vector_processors, other_network.vector_processors):
-     n1.copy_normalization(n2)
+ for n1, n2 in zip(self.processors, other_network.processors):
+     if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
+         n1.copy_normalization(n2)
@property
def memory_size(self) -> int:

self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
- for idx, processor in enumerate(self.vector_processors):
-     vec_input = vec_inputs[idx]
-     processed_vec = processor(vec_input)
-     encodes.append(processed_vec)
- for idx, processor in enumerate(self.visual_processors):
-     vis_input = vis_inputs[idx]
-     if not exporting_to_onnx.is_exporting():
-         vis_input = vis_input.permute([0, 3, 1, 2])
-     processed_vis = processor(vis_input)
-     encodes.append(processed_vis)
+ for idx, processor in enumerate(self.processors):
+     obs_input = inputs[idx]
+     processed_obs = processor(obs_input)
+     encodes.append(processed_obs)
if len(encodes) == 0:
raise Exception("No valid inputs to network.")

def forward(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
- vec_inputs, vis_inputs, actions, memories, sequence_length
+ inputs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories

@abc.abstractmethod
- def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
+ def update_normalization(self, buffer: AgentBuffer) -> None:
"""
Updates normalization of Actor based on the provided List of vector obs.
:param vector_obs: A List of vector obs as tensors.

def get_action_stats(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

@abc.abstractmethod
def critic_pass(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
- :param vec_inputs: List of vector inputs as tensors.
- :param vis_inputs: List of visual inputs as tensors.
+ :param inputs: List of inputs as tensors.
:param memories: Tensor of memories, if using memory. Otherwise, None.
:returns: Dict of reward stream to output tensor for values.
"""

def get_action_stats_and_value(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Returns sampled actions and value estimates.
If memory is enabled, return the memories as well.
- :param vec_inputs: A List of vector inputs as tensors.
- :param vis_inputs: A List of visual inputs as tensors.
+ :param inputs: A List of vector inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.

def memory_size(self) -> int:
return self.network_body.memory_size
- def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
-     self.network_body.update_normalization(vector_obs)
+ def update_normalization(self, buffer: AgentBuffer) -> None:
+     self.network_body.update_normalization(buffer)
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

- vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+ inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
return action, log_probs, entropies, memories

At this moment, torch.onnx.export() doesn't accept None as tensor to be exported,
so the size of return tuple varies with action spec.
"""
+ # This code will convert the vec and vis obs into a list of inputs for the network
+ concatenated_vec_obs = vec_inputs[0]
+ inputs = []
+ start = 0
+ end = 0
+ vis_index = 0
+ for i, enc in enumerate(self.network_body.processors):
+     if isinstance(enc, VectorInput):
+         # This is a vec_obs
+         vec_size = self.network_body.embedding_sizes[i]
+         end = start + vec_size
+         inputs.append(concatenated_vec_obs[:, start:end])
+         start = end
+     else:
+         inputs.append(vis_inputs[vis_index])
+         vis_index += 1
+ # End of code to convert the vec and vis obs into a list of inputs for the network
- vec_inputs, vis_inputs, memories=memories, sequence_length=1
+ inputs, memories=memories, sequence_length=1
)
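The export-only block above exists because torch.onnx.export hands the model all vector observations concatenated into a single tensor; the loop slices that tensor back into the per-sensor list NetworkBody now expects, using each entry of embedding_sizes (which, for a VectorInput, equals the raw observation width). A toy illustration with made-up sizes:

    import torch

    vector_sizes = [3, 2]  # hypothetical per-sensor vector widths
    concatenated = torch.arange(5.0).unsqueeze(0)  # shape (1, 5): [0 1 2 | 3 4]

    inputs, start = [], 0
    for size in vector_sizes:
        inputs.append(concatenated[:, start : start + size])
        start += size

    assert [list(t.shape) for t in inputs] == [[1, 3], [1, 2]]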
(

def critic_pass(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
- vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+ inputs, memories=memories, sequence_length=sequence_length
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

- vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+ inputs, memories=memories, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)

self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

encoding, memories = self.network_body(
- vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+ inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs = self.value_heads(encoding)

def critic_pass(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
- vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
+ inputs, memories=critic_mem, sequence_length=sequence_length
)
if actor_mem is not None:
# Make memories with the actor mem unchanged

def get_stats_and_value(
self,
- vec_inputs: List[torch.Tensor],
- vis_inputs: List[torch.Tensor],
+ inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

encoding, actor_mem_outs = self.network_body(
- vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
+ inputs, memories=actor_mem, sequence_length=sequence_length
- vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length