
formatting

/layernorm
Andrew Cohen, 4 years ago
Current commit 21365c04
3 changed files, with 89 additions and 82 deletions
  1. ml-agents/mlagents/trainers/torch/attention.py (116 changed lines)
  2. ml-agents/mlagents/trainers/torch/layers.py (24 changed lines)
  3. ml-agents/mlagents/trainers/torch/networks.py (31 changed lines)

ml-agents/mlagents/trainers/torch/attention.py (116 changed lines)


from mlagents.torch_utils import torch
from typing import Tuple, Optional, List
- from mlagents.trainers.torch.layers import LinearEncoder, linear_layer, Initialization, LayerNorm
+ from mlagents.trainers.torch.layers import (
+ LinearEncoder,
+ linear_layer,
+ Initialization,
+ LayerNorm,
+ )
- print("-" * 10 + ' Incoming Gradients ' + '-' * 10)
+ print("-" * 10 + " Incoming Gradients " + "-" * 10)
- print('Incoming Grad value: {}'.format(inp[0].data))
+ print("Incoming Grad value: {}".format(inp[0].data))
- print('Upstream Grad value: {}'.format(out[0].data))
+ print("Upstream Grad value: {}".format(out[0].data))
class MultiHeadAttention(torch.nn.Module):
"""

NEG_INF = -1e6
- def __init__(
- self,
- num_heads: int,
- embedding_size: int,
- ):
+ def __init__(self, num_heads: int, embedding_size: int):
- # self.fc_q = torch.nn.linear(query_size, self.n_heads * self.embedding_size)
- # self.fc_k = torch.nn.linear(key_size, self.n_heads * self.embedding_size)
- # self.fc_v = torch.nn.linear(value_size, self.n_heads * self.embedding_size)
+ # self.fc_q = torch.nn.linear(query_size, self.n_heads * self.embedding_size)
+ # self.fc_k = torch.nn.linear(key_size, self.n_heads * self.embedding_size)
+ # self.fc_v = torch.nn.linear(value_size, self.n_heads * self.embedding_size)
- #self.fc_q = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
- #self.fc_k = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
- #self.fc_v = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
+ # self.fc_q = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
+ # self.fc_k = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
+ # self.fc_v = torch.nn.Linear(self.embedding_size // self.n_heads, self.embedding_size // self.n_heads)
- #self.fc_qk = torch.nn.Linear(self.embedding_size, 2 * self.embedding_size)
+ # self.fc_qk = torch.nn.Linear(self.embedding_size, 2 * self.embedding_size)
- self.fc_out = torch.nn.Linear(
- self.embedding_size, self.embedding_size
- )
+ self.fc_out = torch.nn.Linear(self.embedding_size, self.embedding_size)
- #for layer in [self.fc_qk, self.fc_v, self.fc_out]:
- torch.torch.nn.init.normal_(layer.weight, std = (0.125 / embedding_size ) ** 0.5)
+ # for layer in [self.fc_qk, self.fc_v, self.fc_out]:
+ torch.torch.nn.init.normal_(
+ layer.weight, std=(0.125 / embedding_size) ** 0.5
+ )
self.embedding_norm = torch.nn.LayerNorm(embedding_size)
def forward(

n_q = number_of_queries if number_of_queries != -1 else query.size(1)
n_k = number_of_keys if number_of_keys != -1 else key.size(1)
- inp = self.embedding_norm(inp)
+ inp = self.embedding_norm(inp)
- #query = self.fc_q(inp)
- #qk = self.fc_qk(inp)
- #qk = qk.reshape(b, n_q, self.n_heads, self.embedding_size // self.n_heads, 2)
- #query, key = torch.split(qk, 1, dim=-1)
- #query = torch.squeeze(query, dim=-1)
- #key = torch.squeeze(key, dim=-1)
+ # query = self.fc_q(inp)
+ # qk = self.fc_qk(inp)
+ # qk = qk.reshape(b, n_q, self.n_heads, self.embedding_size // self.n_heads, 2)
+ # query, key = torch.split(qk, 1, dim=-1)
+ # query = torch.squeeze(query, dim=-1)
+ # key = torch.squeeze(key, dim=-1)
- #query = query.reshape(b, n_q, self.n_heads, self.embedding_size // self.n_heads)
- #key = key.reshape(b, n_k, self.n_heads, self.embedding_size // self.n_heads)
- #value = value.reshape(b, n_k, self.n_heads, self.embedding_size // self.n_heads)
+ # query = query.reshape(b, n_q, self.n_heads, self.embedding_size // self.n_heads)
+ # key = key.reshape(b, n_k, self.n_heads, self.embedding_size // self.n_heads)
+ # value = value.reshape(b, n_k, self.n_heads, self.embedding_size // self.n_heads)
- #query = self.fc_q(query) # (b, n_q, h*d)
- #key = self.fc_k(key) # (b, n_k, h*d)
- #value = self.fc_v(value) # (b, n_k, h*d)
+ # query = self.fc_q(query) # (b, n_q, h*d)
+ # key = self.fc_k(key) # (b, n_k, h*d)
+ # value = self.fc_v(value) # (b, n_k, h*d)
query = query.permute([0, 2, 1, 3]) # (b, h, n_q, emb)
# The next few lines are equivalent to : key.permute([0, 2, 3, 1])

b, n_q, self.embedding_size
) # (b, n_q, h*emb)
out = self.fc_out(value_attention) # (b, n_q, emb)
- #if out.requires_grad:
+ # if out.requires_grad:
- #out = self.out_norm(out)
+ # out = self.out_norm(out)
return out, att

EPISLON = 1e-7
def __init__(
- self,
- x_self_size: int,
- entities_sizes: List[int],
- embedding_size: int,
+ self, x_self_size: int, entities_sizes: List[int], embedding_size: int
):
super().__init__()
self.self_size = x_self_size

# LinearEncoder(self.self_size + ent_size, 2, embedding_size)
# from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
# linear_layer(self.self_size + ent_size, embedding_size, Initialization.Normal, kernel_gain=(.125 / (self.self_size + ent_size)) ** 0.5)
- #linear_layer(ent_size, embedding_size, Initialization.Normal, kernel_gain=(.125 / (self.self_size + ent_size)) ** 0.5)
- LinearEncoder(ent_size, 1, embedding_size, kernel_init=Initialization.Normal, kernel_gain=(.125 / embedding_size) ** 0.5)
- #LinearEncoder(self.self_size + ent_size, 1, embedding_size, layer_norm=False)
+ # linear_layer(ent_size, embedding_size, Initialization.Normal, kernel_gain=(.125 / (self.self_size + ent_size)) ** 0.5)
+ LinearEncoder(
+ ent_size,
+ 1,
+ embedding_size,
+ kernel_init=Initialization.Normal,
+ kernel_gain=(0.125 / embedding_size) ** 0.5,
+ )
+ # LinearEncoder(self.self_size + ent_size, 1, embedding_size, layer_norm=False)
- self.attention = MultiHeadAttention(
- num_heads=4,
- embedding_size=embedding_size,
- )
- #self.residual_layer = torch.nn.Linear(
+ self.attention = MultiHeadAttention(num_heads=4, embedding_size=embedding_size)
+ # self.residual_layer = torch.nn.Linear(
- #)
- #torch.torch.nn.init.normal_(self.residual_layer.weight, std = 0.125 * embedding_size ** -0.5)
+ # )
+ # torch.torch.nn.init.normal_(self.residual_layer.weight, std = 0.125 * embedding_size ** -0.5)
self.res_norm = torch.nn.LayerNorm(embedding_size)

for ent in entities:
self.entities_num_max_elements.append(ent.shape[1])
# Concatenate all observations with self
- #self_and_ent: List[torch.Tensor] = []
- #for num_entities, ent in zip(self.entities_num_max_elements, entities):
+ # self_and_ent: List[torch.Tensor] = []
+ # for num_entities, ent in zip(self.entities_num_max_elements, entities):
# expanded_self = x_self.reshape(-1, 1, self.self_size)
# # .repeat(
# # 1, num_entities, 1

# Feed to self attention
max_num_ent = sum(self.entities_num_max_elements)
output, _ = self.attention(qkv, mask, max_num_ent, max_num_ent)
- #residual 1
+ # residual 1
- #residual 2
- #output = self.residual_layer(output) + output #qkv
- # average pooling
+ # residual 2
+ # output = self.residual_layer(output) + output #qkv
+ # average pooling
numerator = torch.sum(output * (1 - mask).reshape(-1, max_num_ent, 1), dim=1)
denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPISLON
output = numerator / denominator
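For context on the hunk above: after self-attention, the per-entity outputs are reduced with a masked average pool so padded entity slots do not contribute. A minimal standalone sketch of that pooling, assuming (as the 1 - mask usage above implies) that mask is 1 for padded slots and 0 for real entities; the function name is illustrative, not part of this commit:

import torch

EPSILON = 1e-7  # same role as the EPISLON constant in this file

def masked_average_pool(output: torch.Tensor, mask: torch.Tensor, max_num_ent: int) -> torch.Tensor:
    # output: (batch, max_num_ent, embedding_size); mask: (batch, max_num_ent), 1 = padding
    numerator = torch.sum(output * (1 - mask).reshape(-1, max_num_ent, 1), dim=1)
    denominator = torch.sum(1 - mask, dim=1, keepdim=True) + EPSILON
    return numerator / denominator  # (batch, embedding_size)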

ml-agents/mlagents/trainers/torch/layers.py (24 changed lines)


Initialization.XavierGlorotUniform: torch.nn.init.xavier_uniform_,
Initialization.KaimingHeNormal: torch.nn.init.kaiming_normal_,
Initialization.KaimingHeUniform: torch.nn.init.kaiming_uniform_,
- Initialization.Normal: torch.nn.init.normal_
+ Initialization.Normal: torch.nn.init.normal_,
}

)
return lstm
- def __init__(self, input_size: int, elementwise_affine: bool=False):
+ def __init__(self, input_size: int, elementwise_affine: bool = False):
- torch.ones(input_size, requires_grad=elementwise_affine)
+ torch.ones(input_size, requires_grad=elementwise_affine)
- torch.zeros(input_size, requires_grad=elementwise_affine)
+ torch.zeros(input_size, requires_grad=elementwise_affine)
- var = torch.mean(centered_activations**2, dim=-1, keepdim=True)
- return centered_activations / (torch.sqrt(var + 1E-5)) * self.gamma + self.beta
+ var = torch.mean(centered_activations ** 2, dim=-1, keepdim=True)
+ return centered_activations / (torch.sqrt(var + 1e-5)) * self.gamma + self.beta
class MemoryModule(torch.nn.Module):
@abc.abstractproperty

Linear layers.
"""
- def __init__(self, input_size: int, num_layers: int, hidden_size: int, kernel_init=Initialization.KaimingHeNormal, kernel_gain=1.0):
+ def __init__(
+ self,
+ input_size: int,
+ num_layers: int,
+ hidden_size: int,
+ kernel_init=Initialization.KaimingHeNormal,
+ kernel_gain=1.0,
+ ):
super().__init__()
self.layers = [
linear_layer(
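As a reference for the LayerNorm lines in this file, here is a self-contained sketch of that normalization; the mean/centering lines are a reconstruction of surrounding code the diff only shows in part. Gamma and beta stay fixed at ones/zeros unless elementwise_affine is True, so by default this is a parameter-free layer norm over the last dimension:

import torch

class LayerNorm(torch.nn.Module):
    # Sketch: normalize over the last dim with fixed (non-trainable by default) gamma/beta.
    def __init__(self, input_size: int, elementwise_affine: bool = False):
        super().__init__()
        self.gamma = torch.ones(input_size, requires_grad=elementwise_affine)
        self.beta = torch.zeros(input_size, requires_grad=elementwise_affine)

    def forward(self, layer_activations: torch.Tensor) -> torch.Tensor:
        mean = torch.mean(layer_activations, dim=-1, keepdim=True)
        centered_activations = layer_activations - mean
        var = torch.mean(centered_activations ** 2, dim=-1, keepdim=True)
        return centered_activations / (torch.sqrt(var + 1e-5)) * self.gamma + self.beta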

ml-agents/mlagents/trainers/torch/networks.py (31 changed lines)


if len(var_len_indices) > 0:
# there are some variable length observations
x_self_len = sum(self.embedding_sizes)
- entities_sizes = [] # TODO : More robust
+ entities_sizes = []  # TODO : More robust
for idx in var_len_indices:
entities_sizes.append(sensor_specs[idx].shape[1])

# self.h_size
# )
self.transformer = SimpleTransformer(
- x_self_len,
- entities_sizes,
- self.n_embd,
- )
- #self.transformer = SmallestAttention(x_self_len, entities_sizes, self.h_size, self.h_size)
+ x_self_len, entities_sizes, self.n_embd
+ )
+ # self.transformer = SmallestAttention(x_self_len, entities_sizes, self.h_size, self.h_size)
# self.transformer = SmallestAttention(64, [64], self.h_size, self.h_size)
# self.use_fc = True

if self.use_fc:
self.transformer = None
- total_enc_size = 80 + sum(self.embedding_sizes)
+ total_enc_size = 80 + sum(self.embedding_sizes)
n_layers = max(1, network_settings.num_layers + 1)
else:
self.transformer = None

- for _,tens in list(self.transformer.named_parameters()):
+ for _, tens in list(self.transformer.named_parameters()):
- self.linear_encoder = LinearEncoder(
- total_enc_size, n_layers, self.h_size
- )
- for _,tens in list(self.linear_encoder.named_parameters()):
+ self.linear_encoder = LinearEncoder(total_enc_size, n_layers, self.h_size)
+ for _, tens in list(self.linear_encoder.named_parameters()):
- for _,tens in list(processor.named_parameters()):
+ for _, tens in list(processor.named_parameters()):
tens.retain_grad()
if self.use_lstm:

encoded_state = self.transformer(
x_self_encoded,
var_len_inputs,
- SimpleTransformer.get_masks(var_len_inputs)
+ SimpleTransformer.get_masks(var_len_inputs),
)
# print("\n\n\nUsing transformer ", self.transformer, "use fc = ", self.use_fc, " x_self.shape=",x_self_encoded.shape," var_len_inputs[0].shape=",var_len_inputs[0].shape," len(var_len_inputs)=",len(var_len_inputs))
else:

x_self = torch.cat(encodes, dim=1)
- encoded_state = torch.cat([x_self, inputs[0].reshape(x_self.shape[0], 80)], dim=1)
+ encoded_state = torch.cat(
+ [x_self, inputs[0].reshape(x_self.shape[0], 80)], dim=1
+ )
if actions is not None:
encoded_state = torch.cat([encoded_state, actions], dim=1)

vis_index += 1
else:
inputs.append(var_len_inputs[var_len_index])
- var_len_index+=1
+ var_len_index += 1
# End of code to convert the vec and vis obs into a list of inputs for the network
encoding, memories_out = self.network_body(
inputs, memories=memories, sequence_length=1
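For the non-transformer fallback shown above (use_fc), the per-sensor encodings are simply concatenated with the flattened variable-length observation (hard-coded to 80 features in this diff) and, when provided, the actions. An illustrative sketch with made-up batch size and encoder widths:

import torch

batch = 4
encodes = [torch.randn(batch, 32), torch.randn(batch, 16)]  # per-sensor encodings (sizes are illustrative)
var_len_obs = torch.randn(batch, 20, 4)                      # 20 entities x 4 features = 80, as hard-coded above
actions = torch.randn(batch, 2)

x_self = torch.cat(encodes, dim=1)
encoded_state = torch.cat([x_self, var_len_obs.reshape(x_self.shape[0], 80)], dim=1)
encoded_state = torch.cat([encoded_state, actions], dim=1)   # only done when actions is not None
print(encoded_state.shape)  # torch.Size([4, 130])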
