
Back out trainer changes

/develop/coma2/samenet
Ervin Teng, 4 years ago
Current commit 0bde7598
9 files changed, with 88 additions and 807 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (109 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (66 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (143 changes)
  5. ml-agents/mlagents/trainers/torch/attention.py (89 changes)
  6. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)
  7. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)
  8. ml-agents/mlagents/trainers/torch/encoders.py (25 changes)
  9. ml-agents/mlagents/trainers/torch/networks.py (448 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (109 changes)


from typing import Dict, Optional, Tuple, List
from mlagents.torch_utils import torch
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

)
def get_trajectory_value_estimates(
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_critic_obs: List[List[np.ndarray]],
done: bool,
all_dones: bool,
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
team_obs = TeamObsUtil.from_buffer(batch, n_obs)
# next_obs = ObsUtil.from_buffer_next(batch, n_obs)
# next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
# next_team_obs = [
# [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
# for _teammate_obs in next_team_obs
# ]
actions = AgentAction.from_dict(batch)
team_actions = AgentAction.from_team_dict(batch)
# next_actions = AgentAction.from_dict_next(batch)
# next_team_actions = AgentAction.from_team_dict_next(batch)
next_obs = [obs.unsqueeze(0) for obs in next_obs]
# critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
# critic_obs = [
# [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
# for _teammate_obs in critic_obs
# ]
next_critic_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
]
# Expand dimensions of next critic obs
next_critic_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
]
baseline_estimates, _ = self.policy.actor_critic.critic_pass(
current_obs,
actions,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
team_act=team_actions,
)
next_obs = [obs.unsqueeze(0) for obs in next_obs]
value_estimates, mem = self.policy.actor_critic.target_critic_value(
current_obs,
memory,
sequence_length=batch.num_experiences,
team_obs=team_obs,
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
current_obs, memory, sequence_length=batch.num_experiences
boot_value_estimates, mem = self.policy.actor_critic.target_critic_value(
next_obs,
memory,
sequence_length=batch.num_experiences,
team_obs=next_critic_obs,
next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_obs, next_memory, sequence_length=1
# next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.target_critic_pass(
# next_obs,
# next_actions,
# memory,
# sequence_length=batch.num_experiences,
# team_obs=next_team_obs,
# team_act=next_team_actions,
# )
# # Actions is a hack here, we need the next actions
# next_value_estimate, next_marg_val_estimate, _ = self.policy.actor_critic.critic_pass(
# next_obs, actions, next_memory, sequence_length=1, critic_obs=next_critic_obs
# )
# These aren't used in COMAttention
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)
next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])
# the baseline and V should not be on the same done flag
for name, estimate in boot_value_estimates.items():
boot_value_estimates[name] = ModelUtils.to_numpy(estimate)
if all_dones:
for k in boot_value_estimates:
if done:
for k in next_value_estimate:
boot_value_estimates[k][-1] = 0.0
# else:
# print(len(next_critic_obs))
# print(baseline_estimates)
# print(value_estimates)
# print(boot_value_baseline[k][-1])
# if done and not all_dones:
# print("agent finished but team going")
# elif all_dones:
# print("alldone")
# else:
# print("neither")
# print("final", boot_value_estimates)
# print("value", value_estimates)
# print("base", baseline_estimates)
next_value_estimate[k] = 0.0
return (value_estimates, baseline_estimates, boot_value_estimates)
return value_estimates, next_value_estimate
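
The interleaved hunks above differ mainly in how the bootstrap value is zeroed at the end of a trajectory: the multi-agent variant zeroes the last bootstrap estimate once all teammates are done, while the reverted variant zeroes the next-step value when this agent alone is done. A minimal sketch of that idea, with hypothetical names, assuming the estimates are stored per reward stream as numpy arrays:

    import numpy as np
    from typing import Dict

    def zero_terminal_bootstrap(
        boot_values: Dict[str, np.ndarray], episode_done: bool
    ) -> Dict[str, np.ndarray]:
        # A true terminal state has zero future value, so the last bootstrap
        # entry is zeroed; interrupted (max-step) trajectories keep their estimate.
        if episode_done:
            for name in boot_values:
                boot_values[name][-1] = 0.0
        return boot_values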

ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)


masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param obs: List of observations.

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[AgentAction]] = None,
log_probs, entropies, baseline, values = self.actor_critic.get_stats_and_value(
obs, actions, masks, memories, seq_len, team_obs, team_act
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
obs, actions, masks, memories, seq_len
return log_probs, entropies, baseline, values
return log_probs, entropies, value_heads
@timed
def evaluate(

memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
run_out = {}
with torch.no_grad():
action, log_probs, entropy, memories = self.sample_actions(

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (66 changes)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

)
self.optimizer = torch.optim.Adam(
params,
lr=self.trainer_settings.hyperparameters.learning_rate,
weight_decay=1e-6,
params, lr=self.trainer_settings.hyperparameters.learning_rate
)
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",

self.stream_names = list(self.reward_signals.keys())
# ModelUtils.soft_update(
# self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
# )
def ppo_value_loss(
self,
values: Dict[str, torch.Tensor],

value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def coma_regularizer_loss(
self, values: Dict[str, torch.Tensor], baseline_values: Dict[str, torch.Tensor]
):
reg_losses = []
for name, head in values.items():
reg_loss = torch.nn.functional.mse_loss(head, baseline_values[name])
reg_losses.append(reg_loss)
value_loss = torch.mean(torch.stack(reg_losses))
return value_loss
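
coma_regularizer_loss averages a per-stream mean-squared error between the value heads and the baseline heads; written out, with S the set of reward streams:

\[
\mathcal{L}_{\text{reg}} = \frac{1}{|S|} \sum_{s \in S} \operatorname{MSE}\big(V_s, B_s\big)
\]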
def ppo_policy_loss(
self,
advantages: torch.Tensor,

decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
# returns_b = {}
returns_v = {}
returns = {}
old_marg_values = {}
old_marg_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_baseline_estimates"]
)
returns_v[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_v"])
# returns_b[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns_b"])
#
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

team_obs = TeamObsUtil.from_buffer(batch, n_obs)
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
team_actions = AgentAction.from_team_dict(batch)
# next_team_actions = AgentAction.from_team_dict_next(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, baseline_vals, values = self.policy.evaluate_actions(
log_probs, entropy, values = self.policy.evaluate_actions(
team_obs=team_obs,
team_act=team_actions,
# q_loss = self.ppo_value_loss(qs, old_values, returns_q, decay_eps, loss_masks)
# Use trust region from value, not baseline
baseline_loss = self.ppo_value_loss(
baseline_vals, old_marg_values, returns_v, decay_eps, loss_masks
)
values, old_values, returns_v, decay_eps, loss_masks
values, old_values, returns, decay_eps, loss_masks
# Regularizer loss reduces bias between the baseline and values. Other
# regularizers are possible here.
# regularizer_loss = self.coma_regularizer_loss(values, baseline_vals)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch["advantages"]),
log_probs,

loss = (
policy_loss
+ 0.5 * (value_loss + 0.5 * baseline_loss)
# + 0.25 * regularizer_loss
+ 0.5 * value_loss
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
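
Reading the interleaved lines as the two sides of this revert, the combined objectives are (with the decayed entropy coefficient written as beta and the regularizer term commented out in both):

\[
\mathcal{L}_{\text{COMA-side}} = \mathcal{L}_{\text{policy}} + 0.5\,\big(\mathcal{L}_{\text{value}} + 0.5\,\mathcal{L}_{\text{baseline}}\big) - \beta\,\bar{\mathcal{H}}[\pi]
\]
\[
\mathcal{L}_{\text{PPO-side}} = \mathcal{L}_{\text{policy}} + 0.5\,\mathcal{L}_{\text{value}} - \beta\,\bar{\mathcal{H}}[\pi]
\]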

loss.backward()
self.optimizer.step()
# ModelUtils.soft_update(
# self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
# )
# "Losses/Q Loss": q_loss.item(),
"Losses/Baseline Value Loss": baseline_loss.item(),
# "Losses/Regularization Loss": regularizer_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/ppo/trainer.py (143 changes)


self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
(
value_estimates,
baseline_estimates,
value_next,
) = self.optimizer.get_trajectory_value_estimates(
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
trajectory.next_collab_obs,
trajectory.teammate_dones_reached
and trajectory.done_reached
and not trajectory.interrupted,
agent_buffer_trajectory[f"{name}_baseline_estimates"].extend(
baseline_estimates[name]
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),
)
np.mean(value_estimates[name]),
np.mean(v),
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)

tmp_advantages = []
tmp_returns = []
for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
baseline_estimates = agent_buffer_trajectory[
f"{name}_baseline_estimates"
local_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
v_estimates = agent_buffer_trajectory[f"{name}_value_estimates"].get_batch()
# next_value_estimates = agent_buffer_trajectory[
# f"{name}_value_estimates_next"
# ].get_batch()
# next_m_value_estimates = agent_buffer_trajectory[
# f"{name}_marginalized_value_estimates_next"
# ].get_batch()
returns_v, returns_b = get_team_returns(
local_advantage = get_gae(
baseline_estimates=baseline_estimates,
v_estimates=v_estimates,
value_next=value_next[name],
value_estimates=local_value_estimates,
value_next=bootstrap_value,
# print("loc", local_rewards[-1])
# print("tdlam", returns_v)
# local_advantage = get_team_gae(
# rewards=local_rewards,
# value_estimates=v_estimates,
# baseline=baseline_estimates,
# value_next=value_next[name],
# gamma=self.optimizer.reward_signals[name].gamma,
# lambd=self.hyperparameters.lambd,
# )
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
np.mean(returns_v),
)
# local_advantage = np.array(returns_v) - baseline_estimates
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
# self._stats_reporter.add_stat(
# f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} GAE Advantage Estimate",
# np.mean(gae_advantage),
# )
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
np.mean(local_advantage),
)
local_return = local_advantage + baseline_estimates
# local_return = local_advantage + q_estimates
local_return = local_advantage + local_value_estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
# Normalize advantages
advantages = np.array(self.update_buffer["advantages"].get_batch())
advantages = self.update_buffer["advantages"].get_batch()
list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
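
Both variants apply the same advantage normalization over the update buffer before the PPO update:

\[
\hat{A}_i = \frac{A_i - \mu_A}{\sigma_A + 10^{-10}}
\]

where \(\mu_A\) and \(\sigma_A\) are the mean and standard deviation of the buffered advantages.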
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)

return discounted_r
def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):
returns[t] = (
gamma * lambd * returns[t + 1]
+ r[t]
+ (1 - lambd) * gamma * value_estimates[t + 1]
)
return returns
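
lambd_return implements the backward lambda-return recursion; written out, matching the code above:

\[
G_T = r_T + \gamma V_{\text{next}}, \qquad
G_t = r_t + \gamma\big[(1-\lambda)\,V(s_{t+1}) + \lambda\,G_{t+1}\big] \quad \text{for } t < T
\]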
def get_team_gae(
rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
delta_t = rewards + gamma * value_estimates[1:] - baseline
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.

delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
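
get_gae computes standard GAE, and get_team_gae is the same computation with the per-agent baseline substituted for \(V(s_t)\) in the TD error:

\[
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad
A_t^{\text{GAE}} = \sum_{l \ge 0} (\gamma\lambda)^{l}\,\delta_{t+l}
\]
\[
\delta_t^{\text{team}} = r_t + \gamma V(s_{t+1}) - B_t
\]

where \(B_t\) is the baseline estimate at step \(t\).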
def get_team_returns(
rewards,
baseline_estimates,
v_estimates,
value_next=0.0,
died=False,
gamma=0.99,
lambd=0.8,
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_b = lambd_return(
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
returns_v = lambd_return(
rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
return returns_v, returns_b
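
A minimal usage sketch of the helpers above, with made-up rewards and estimates, to show the shapes involved:

    import numpy as np

    rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
    baseline_estimates = np.array([0.4, 0.5, 0.6], dtype=np.float32)
    v_estimates = np.array([0.5, 0.6, 0.7], dtype=np.float32)

    # Both outputs have the same length as rewards; returns_v bootstraps off the
    # centralized value estimates, returns_b off the counterfactual baseline.
    returns_v, returns_b = get_team_returns(
        rewards, baseline_estimates, v_estimates, value_next=0.0, gamma=0.99, lambd=0.8
    )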

ml-agents/mlagents/trainers/torch/attention.py (89 changes)


class MultiHeadAttention(torch.nn.Module):
"""
Multi Head Attention module. We do not use the regular Torch implementation since
Barracuda does not support some operators it uses.
Takes as input to the forward method 3 tensors:
- query: of dimensions (batch_size, number_of_queries, embedding_size)
- key: of dimensions (batch_size, number_of_keys, embedding_size)
- value: of dimensions (batch_size, number_of_keys, embedding_size)
The forward method will return 2 tensors:
- The output: (batch_size, number_of_queries, embedding_size)
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
"""
"""
Multi Head Attention module. We do not use the regular Torch implementation since
Barracuda does not support some operators it uses.
Takes as input to the forward method 3 tensors:
- query: of dimensions (batch_size, number_of_queries, embedding_size)
- key: of dimensions (batch_size, number_of_keys, embedding_size)
- value: of dimensions (batch_size, number_of_keys, embedding_size)
The forward method will return 2 tensors:
- The output: (batch_size, number_of_queries, embedding_size)
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
:param embedding_size: The size of the embeddings that will be generated (should be
divisible by num_heads)
:param total_max_elements: The maximum total number of entities that can be passed to
the module
:param num_heads: The number of heads of the attention module
"""
super().__init__()
self.n_heads = num_heads
self.head_size: int = embedding_size // self.n_heads

return value_attention, att
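
Aside from the key masking (and any Barracuda-specific workarounds), the forward pass computes standard scaled dot-product attention per head, with the per-head outputs concatenated back to embedding_size:

\[
\operatorname{Attention}(Q, K, V) = \operatorname{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_{\text{head}}}}\right) V,
\qquad d_{\text{head}} = \frac{\text{embedding\_size}}{\text{num\_heads}}
\]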
class EntityEmbedding(torch.nn.Module):
class EntityEmbeddings(torch.nn.Module):
"""
A module used to embed entities before passing them to a self-attention block.
Used in conjunction with ResidualSelfAttention to encode information about a self
and additional entities. Can also concatenate self to entities for ego-centric self-
attention. Inspired by architecture used in https://arxiv.org/pdf/1909.07528.pdf.
"""
entity_size: int,
entity_num_max_elements: Optional[int],
entity_sizes: List[int],
entity_num_max_elements: Optional[List[int]] = None,
:param entity_size: Size of other entity.
:param entity_num_max_elements: Maximum elements for a given entity, None for unrestricted.
:param entity_sizes: List of sizes for other entities. Should be of length
equivalent to the number of entities.
:param embedding_size: Embedding size for entity encoders.
:param entity_num_max_elements: Maximum elements in an entity, None for unrestricted.
:param embedding_size: Embedding size for the entity encoder.
:param concat_self: Whether to concatenate x_self to entities. Set True for ego-centric
self.entity_size: int = entity_size
self.entity_num_max_elements: int = -1
self.entity_sizes: List[int] = entity_sizes
self.entity_num_max_elements: List[int] = [-1] * len(entity_sizes)
if entity_num_max_elements is not None:
self.entity_num_max_elements = entity_num_max_elements

)
self.embedding_norm = LayerNorm()
def forward(self, x_self: torch.Tensor, entities: torch.Tensor) -> torch.Tensor:
def forward(
self, x_self: torch.Tensor, entities: List[torch.Tensor]
) -> Tuple[torch.Tensor, int]:
num_entities = self.entity_num_max_elements
if num_entities < 0:
if exporting_to_onnx.is_exporting():
raise UnityTrainerException(
"Trying to export an attention mechanism that doesn't have a set max \
number of elements."
)
num_entities = entities.shape[1]
expanded_self = x_self.reshape(-1, 1, self.self_size)
expanded_self = torch.cat([expanded_self] * num_entities, dim=1)
# Concatenate all observations with self
self_and_ent: List[torch.Tensor] = []
for num_entities, ent in zip(self.entity_num_max_elements, entities):

def __init__(
self,
embedding_size: int,
entity_num_max_elements: Optional[int] = None,
entity_num_max_elements: Optional[List[int]] = None,
num_heads: int = 4,
):
"""

super().__init__()
self.max_num_ent: Optional[int] = None
if entity_num_max_elements is not None:
self.max_num_ent = entity_num_max_elements
_entity_num_max_elements = entity_num_max_elements
self.max_num_ent = sum(_entity_num_max_elements)
self.attention = MultiHeadAttention(
num_heads=num_heads, embedding_size=embedding_size

def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor:
# Gather the maximum number of entities information
mask = torch.cat(key_masks, dim=1)
inp = self.embedding_norm(inp)
# Feed to self attention
query = self.fc_q(inp) # (b, n_q, emb)
key = self.fc_k(inp) # (b, n_k, emb)

denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPSILON
output = numerator / denominator
return output
@staticmethod
def get_masks(observations: List[torch.Tensor]) -> List[torch.Tensor]:
"""
Takes a List of Tensors and returns a List of mask Tensor with 1 if the input was
all zeros (on dimension 2) and 0 otherwise. This is used in the Attention
layer to mask the padding observations.
"""
with torch.no_grad():
# Generate the masking tensors for each entity tensor (mask only if all zeros)
key_masks: List[torch.Tensor] = [
(torch.sum(ent ** 2, axis=2) < 0.01).type(torch.FloatTensor)
for ent in observations
]
return key_masks
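
A minimal, self-contained illustration of the convention get_masks encodes, using a hypothetical padded entity tensor:

    import torch

    # One batch, three entity slots of size 2; the last slot is zero padding.
    entities = torch.tensor([[[1.0, 2.0], [0.5, 0.0], [0.0, 0.0]]])
    # Same test as get_masks above: an entity is masked (1.0) iff it is all zeros.
    key_mask = (torch.sum(entities ** 2, dim=2) < 0.01).type(torch.FloatTensor)
    # key_mask == tensor([[0., 0., 1.]])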

ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
class BCModule:

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (3 changes)


class ExtrinsicRewardProvider(BaseRewardProvider):
def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
return np.array(mini_batch["average_team_reward"], dtype=np.float32)
# return np.array(mini_batch["environment_rewards"], dtype=np.float32)
return np.array(mini_batch["environment_rewards"], dtype=np.float32)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}
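
The two return statements show the reward source switching between an averaged team reward and the plain per-agent environment reward. A sketch of the averaging this implies (the exact semantics of "average_team_reward" are assumed from its name; the values are hypothetical):

    import numpy as np

    # Hypothetical per-step rewards for two teammates over three steps.
    agent_rewards = np.array([[1.0, 0.0, 0.5],
                              [0.0, 0.0, 0.5]], dtype=np.float32)
    average_team_reward = agent_rewards.mean(axis=0)  # -> [0.5, 0.0, 0.5]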

ml-agents/mlagents/trainers/torch/encoders.py (25 changes)


return height, width
class InputProcessor:
def copy_normalization(self, other_input: "InputProcessor") -> None:
pass
def update_normalization(self, inputs: torch.Tensor) -> None:
pass
class VectorInput(nn.Module, InputProcessor):
class VectorInput(nn.Module):
def __init__(self, input_size: int, normalize: bool = False):
super().__init__()
self.normalizer: Optional[Normalizer] = None

inputs = self.normalizer(inputs)
return inputs
def copy_normalization(self, other_input: "InputProcessor") -> None:
if isinstance(other_input, VectorInput):
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def copy_normalization(self, other_input: "VectorInput") -> None:
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def update_normalization(self, inputs: torch.Tensor) -> None:
if self.normalizer is not None:

class SmallVisualEncoder(nn.Module, InputProcessor):
class SmallVisualEncoder(nn.Module):
"""
CNN architecture used by King in their Candy Crush predictor
https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning

return self.dense(hidden)
class SimpleVisualEncoder(nn.Module, InputProcessor):
class SimpleVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return self.dense(hidden)
class NatureVisualEncoder(nn.Module, InputProcessor):
class NatureVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return input_tensor + self.layers(input_tensor)
class ResNetVisualEncoder(nn.Module, InputProcessor):
class ResNetVisualEncoder(nn.Module):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

ml-agents/mlagents/trainers/torch/networks.py (448 changes)


from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.attention import ResidualSelfAttention, EntityEmbedding
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]

inputs = torch.cat(encodes + [actions], dim=-1)
else:
inputs = torch.cat(encodes, dim=-1)
encoding = self.linear_encoder(inputs)
if self.use_lstm:
# Resize to (batch, sequence length, encoding size)
encoding = encoding.reshape([-1, sequence_length, self.h_size])
encoding, memories = self.lstm(encoding, memories)
encoding = encoding.reshape([-1, self.m_size // 2])
return encoding, memories
# NOTE: this class will be replaced with a multi-head attention when the time comes
class MultiInputNetworkBody(nn.Module):
def __init__(
self,
sensor_specs: List[SensorSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
):
super().__init__()
self.normalize = network_settings.normalize
self.use_lstm = network_settings.memory is not None
# Scale network depending on num agents
self.h_size = network_settings.hidden_units
self.m_size = (
network_settings.memory.memory_size
if network_settings.memory is not None
else 0
)
self.processors, _input_size = ModelUtils.create_input_processors(
sensor_specs,
self.h_size,
network_settings.vis_encode_type,
normalize=self.normalize,
)
self.action_spec = action_spec
# Modules for self-attention
obs_only_ent_size = sum(_input_size)
q_ent_size = (
sum(_input_size)
+ sum(self.action_spec.discrete_branches)
+ self.action_spec.continuous_size
)
self.obs_encoder = EntityEmbedding(
0, obs_only_ent_size, None, self.h_size, concat_self=False
)
self.obs_action_encoder = EntityEmbedding(
0, q_ent_size, None, self.h_size, concat_self=False
)
self.self_attn = ResidualSelfAttention(self.h_size)
encoder_input_size = self.h_size
self.linear_encoder = LinearEncoder(
encoder_input_size, network_settings.num_layers, self.h_size
)
if self.use_lstm:
self.lstm = LSTM(self.h_size, self.m_size)
else:
self.lstm = None # type: ignore
@property
def memory_size(self) -> int:
return self.lstm.memory_size if self.use_lstm else 0
def update_normalization(self, buffer: AgentBuffer) -> None:
obs = ObsUtil.from_buffer(buffer, len(self.processors))
for vec_input, enc in zip(obs, self.processors):
if isinstance(enc, VectorInput):
enc.update_normalization(torch.as_tensor(vec_input))
def copy_normalization(self, other_network: "NetworkBody") -> None:
if self.normalize:
for n1, n2 in zip(self.processors, other_network.processors):
if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
n1.copy_normalization(n2)
def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor:
"""
Get attention masks by grabbing an arbitrary obs across all the agents
Since these are raw obs, the padded values are still NaN
"""
only_first_obs = [_all_obs[0] for _all_obs in obs_tensors]
obs_for_mask = torch.stack(only_first_obs, dim=1)
# Get the mask from nans
attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
return attn_mask
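
_get_masks_from_nans builds the attention mask from NaN padding in the raw teammate observations, rather than the all-zeros test used in attention.py. A self-contained sketch of the same computation on hypothetical tensors:

    import torch

    # Two agents; the second agent's observation slot is NaN padding (absent agent).
    obs_agent_0 = torch.tensor([[1.0, 2.0, 3.0]])
    obs_agent_1 = torch.full((1, 3), float("nan"))
    obs_for_mask = torch.stack([obs_agent_0, obs_agent_1], dim=1)  # (batch, agents, obs)
    attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
    # attn_mask == tensor([[0., 1.]]): 1 marks padded agents.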
def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
encoding, memories = self.forward(
f_inp,
None,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
f_inp = None
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
if concat_f_inp:
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
concat_encoded_obs = []
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = self_obs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
# Get the mask from nans
self_attn_masks.append(self._get_masks_from_nans([self_obs]))
encoding, memories = self.forward(
f_inp,
g_inp,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def value(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
concat_encoded_obs = []
for inputs in obs:
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
# Get the mask from nans
self_attn_masks.append(self._get_masks_from_nans(obs))
encoding, memories = self.forward(
None,
g_inp,
self_attn_masks,
memories=memories,
sequence_length=sequence_length,
)
return encoding, memories
def forward(
self,
f_enc: torch.Tensor,
g_enc: torch.Tensor,
self_attn_masks: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_inputs = []
if f_enc is not None:
self_attn_inputs.append(self.obs_action_encoder(None, f_enc))
if g_enc is not None:
self_attn_inputs.append(self.obs_encoder(None, g_enc))
encoded_entity = torch.cat(self_attn_inputs, dim=1)
encoded_state = self.self_attn(encoded_entity, self_attn_masks)
inputs = encoded_state
encoding = self.linear_encoder(inputs)
if self.use_lstm:

return output, memories
class CentralizedValueNetwork(ValueNetwork):
def __init__(
self,
stream_names: List[str],
observation_shapes: List[SensorSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
outputs_per_stream: int = 1,
):
# This is not a typo, we want to call __init__ of nn.Module
nn.Module.__init__(self)
self.network_body = MultiInputNetworkBody(
observation_shapes, network_settings, action_spec=action_spec
)
if network_settings.memory is not None:
encoding_size = network_settings.memory.memory_size // 2
else:
encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream)
def q_net(
self,
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.q_net(
obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def value(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.value(obs, memories, sequence_length)
output = self.value_heads(encoding)
return output, memories
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body.baseline(
self_obs, obs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
def forward(
self,
value_inputs: List[List[torch.Tensor]],
q_inputs: List[List[torch.Tensor]],
q_actions: List[AgentAction] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
encoding, memories = self.network_body(
value_inputs, q_inputs, q_actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories
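
CentralizedValueNetwork routes each head through the shared MultiInputNetworkBody and the per-stream value heads. A hypothetical call pattern, assuming `critic` is an already-constructed instance and the observation/action lists are batched as in the optimizer code above:

    # values, _   = critic.value(all_agent_obs)                         # V over joint obs
    # qs, _       = critic.q_net(all_agent_obs, all_agent_actions)      # Q over joint obs + actions
    # baseline, _ = critic.baseline(self_obs, teammate_obs, teammate_acts)  # counterfactual baseline
    # Each call returns a dict keyed by reward-stream name plus the updated critic memories.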
class Actor(abc.ABC):
@abc.abstractmethod
def update_normalization(self, buffer: AgentBuffer) -> None:

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length

actor_mem = None
return actor_mem, critic_mem
def target_critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, critic_mem_out = self.critic.value(
all_obs, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def critic_value(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
value_outputs, critic_mem_out = self.critic.value(
all_obs, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def target_critic_pass(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: List[List[torch.Tensor]] = None,
team_act: List[AgentAction] = None,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, _ = self.critic.baseline(
inputs,
team_obs,
team_act,
memories=critic_mem,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.critic.q_net(
all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
)
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, baseline_outputs, memories_out
actions: AgentAction,
team_obs: List[List[torch.Tensor]] = None,
team_act: List[AgentAction] = None,
all_obs = [inputs]
if team_obs is not None and team_obs:
all_obs.extend(team_obs)
all_acts = [actions]
if team_act is not None and team_act:
all_acts.extend(team_act)
baseline_outputs, critic_mem_out = self.critic.baseline(
inputs,
team_obs,
team_act,
memories=critic_mem,
sequence_length=sequence_length,
value_outputs, critic_mem_out = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
# q_out, critic_mem_out = self.critic.q_net(
# all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
# )
# if mar_value_outputs is None:
# mar_value_outputs = value_outputs
return baseline_outputs, memories_out
return value_outputs, memories_out
def get_stats_and_value(
self,

memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
team_obs: Optional[List[List[torch.Tensor]]] = None,
team_act: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
encoding, actor_mem_outs = self.network_body(

baseline_outputs, _ = self.critic_pass(
inputs,
actions,
memories=critic_mem,
sequence_length=sequence_length,
team_obs=team_obs,
team_act=team_act,
)
value_outputs, _ = self.target_critic_value(
inputs,
memories=critic_mem,
sequence_length=sequence_length,
team_obs=team_obs,
value_outputs, critic_mem_outs = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
return log_probs, entropies, baseline_outputs, value_outputs
return log_probs, entropies, value_outputs
def get_action_stats(
self,

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
critic_obs: Optional[List[List[torch.Tensor]]] = None,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:

)
action, log_probs, entropies = self.action_model(encoding, masks)
all_net_inputs = [inputs]
if critic_obs is not None:
all_net_inputs.extend(critic_obs)
all_net_inputs, memories=critic_mem, sequence_length=sequence_length
inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)
