
layer norm

/layernorm
Andrew Cohen, 4 years ago
Current commit
f57875e0
5 files changed, 91 insertions and 15 deletions
  1. config/ppo/Bullet.yaml (2 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (13 changes)
  3. ml-agents/mlagents/trainers/torch/attention.py (44 changes)
  4. ml-agents/mlagents/trainers/torch/layers.py (23 changes)
  5. ml-agents/mlagents/trainers/torch/networks.py (24 changes)

config/ppo/Bullet.yaml (2 changes)


    keep_checkpoints: 5
    max_steps: 50000000
    time_horizon: 64
-   summary_freq: 100000
+   summary_freq: 10000
    threaded: true

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (13 changes)


        loss.backward()
        self.optimizer.step()
        update_stats = {
            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
            # TODO: After PyTorch is default, change to something more correct.

            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }
        for name, params in list(self.policy.actor_critic.network_body.transformer.named_parameters()):
            update_stats["Policy/" + name + '_mean'] = torch.mean(params).item()
            update_stats["Policy/" + name + '_std'] = torch.std(params).item()
            update_stats["Policy/" + name + '_grad_mag'] = torch.norm(params.grad).item()
            # update_stats["Policy/" + name + '_grad_mean'] = torch.mean(params.grad).item()
            # update_stats["Policy/" + name + '_grad_std'] = torch.std(params.grad).item()
        for name, params in list(self.policy.actor_critic.network_body.linear_encoder.named_parameters()):
            update_stats["Policy/" + name + '_grad_mag'] = torch.norm(params.grad).item()
            # update_stats["Policy/" + name + '_grad_mean'] = torch.mean(params.grad).item()
            # update_stats["Policy/" + name + '_grad_std'] = torch.std(params.grad).item()
        for reward_provider in self.reward_signals.values():
            update_stats.update(reward_provider.update(batch))
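Note on the stat-logging loops above: they walk named_parameters() and record each tensor's mean, std, and gradient norm under a Policy/ prefix. A minimal self-contained sketch of the same pattern, with an illustrative torch.nn.Linear in place of the trainer's network_body:

import torch

def param_stats(module: torch.nn.Module, prefix: str = "Policy/") -> dict:
    # Collect per-parameter diagnostics the way the loops above do.
    stats = {}
    for name, params in module.named_parameters():
        stats[prefix + name + "_mean"] = torch.mean(params).item()
        stats[prefix + name + "_std"] = torch.std(params).item()
        if params.grad is not None:  # .grad is None until the first backward()
            stats[prefix + name + "_grad_mag"] = torch.norm(params.grad).item()
    return stats

# Illustrative usage: run a backward pass first so .grad is populated.
net = torch.nn.Linear(4, 2)
net(torch.randn(3, 4)).sum().backward()
print(param_stats(net))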

ml-agents/mlagents/trainers/torch/attention.py (44 changes)


from mlagents.torch_utils import torch
from typing import Tuple, Optional, List
-from mlagents.trainers.torch.layers import LinearEncoder, linear_layer, Initialization
+from mlagents.trainers.torch.layers import LinearEncoder, linear_layer, Initialization, LayerNorm

def grad_hook(mod, inp, out):
    print("")
    print(mod)
    print("-" * 10 + ' Incoming Gradients ' + '-' * 10)
    print("")
    print('Incoming Grad value: {}'.format(inp[0].data))
    print("")
    print('Upstream Grad value: {}'.format(out[0].data))
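grad_hook above has the (module, grad_input, grad_output) signature of a PyTorch backward hook, so it is presumably intended to be registered on a module for gradient debugging; a hedged sketch of that usage with an illustrative layer (not from this diff):

import torch

def debug_grad_hook(mod, inp, out):
    # Same hook signature as grad_hook above, trimmed to two prints.
    print(mod)
    print("Incoming Grad value: {}".format(inp[0]))
    print("Upstream Grad value: {}".format(out[0]))

layer = torch.nn.Linear(8, 8)
layer.register_full_backward_hook(debug_grad_hook)  # register_backward_hook on older PyTorch
x = torch.randn(2, 8, requires_grad=True)
layer(x).sum().backward()  # the hook fires during backward()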
class MultiHeadAttention(torch.nn.Module):
    """

        super().__init__()
        self.n_heads, self.embedding_size = num_heads, embedding_size
        self.output_size = output_size
        # self.fc_q = linear_layer(
        #     query_size,
        #     self.n_heads * self.embedding_size,
        #     kernel_init=Initialization.KaimingHeNormal,
        #     kernel_gain=1.0,
        # )
        # self.fc_k = linear_layer(
        #     key_size,
        #     self.n_heads * self.embedding_size,
        #     kernel_init=Initialization.KaimingHeNormal,
        #     kernel_gain=1.0,
        # )
        # self.fc_v = linear_layer(
        #     value_size,
        #     self.n_heads * self.embedding_size,
        #     kernel_init=Initialization.KaimingHeNormal,
        #     kernel_gain=1.0,
        # )
        # self.fc_q = LinearEncoder(query_size, 2, self.n_heads * self.embedding_size)
        # self.fc_k = LinearEncoder(key_size, 2, self.n_heads * self.embedding_size)
        # self.fc_v = LinearEncoder(value_size, 2, self.n_heads * self.embedding_size)

        )  # (b, n_q, h*emb)
        out = self.fc_out(value_attention)  # (b, n_q, emb)
        # if out.requires_grad:
        #     out.register_hook(lambda x: print(x))
        # out = self.out_norm(out)
        return out, att
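For orientation, the shape comments above track the heads being concatenated back to (b, n_q, h*emb) and projected to (b, n_q, emb) by fc_out; a generic scaled dot-product sketch of that flow (dimensions and the stand-in fc_out below are illustrative, not the class's internals):

import torch

b, h, n_q, n_k, emb = 2, 4, 3, 5, 8          # batch, heads, queries, keys, head width
q = torch.randn(b, h, n_q, emb)
k = torch.randn(b, h, n_k, emb)
v = torch.randn(b, h, n_k, emb)

att = torch.softmax(q @ k.transpose(-1, -2) / emb ** 0.5, dim=-1)         # (b, h, n_q, n_k)
value_attention = (att @ v).permute(0, 2, 1, 3).reshape(b, n_q, h * emb)  # (b, n_q, h*emb)
fc_out = torch.nn.Linear(h * emb, emb)                                    # stand-in projection
out = fc_out(value_attention)                                             # (b, n_q, emb)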

                # LinearEncoder(self.self_size + ent_size, 2, embedding_size)
                # from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
                # linear_layer(self.self_size + ent_size, embedding_size, Initialization.Normal, kernel_gain=1 / (self.self_size + ent_size) ** 0.5)
-               LinearEncoder(self.self_size + ent_size, 1, embedding_size)
+               LinearEncoder(self.self_size + ent_size, 1, embedding_size, layer_norm=True)
                for ent_size in self.entities_sizes
            ]
        )

            num_heads=4,
            embedding_size=embedding_size,
        )
-       self.residual_layer = LinearEncoder(embedding_size, 1, embedding_size)
+       # self.residual_layer = LinearEncoder(embedding_size, 1, embedding_size)
        self.res_norm = torch.nn.LayerNorm(embedding_size, elementwise_affine=True)
        if output_size is None:
            output_size = embedding_size

        max_num_ent = sum(self.entities_num_max_elements)
        output, _ = self.attention(qkv, qkv, qkv, mask, max_num_ent, max_num_ent)
        # Residual
-       output = self.residual_layer(output) + qkv
+       # output = self.residual_layer(output) + qkv
        # output += qkv
        output = self.res_norm(output + qkv)
        # output = self.res_norm(output)
-       output = torch.cat([output, x_self], dim=1)
+       # output = torch.cat([output, x_self], dim=1)
        return output

    @staticmethod
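The edit above replaces the learned residual_layer with a plain additive residual followed by LayerNorm, i.e. the standard post-norm pattern out = LayerNorm(sublayer(x) + x); a generic sketch, with a Linear standing in for the attention sublayer:

import torch

embedding_size = 16
sublayer = torch.nn.Linear(embedding_size, embedding_size)   # stand-in for self.attention
res_norm = torch.nn.LayerNorm(embedding_size, elementwise_affine=True)

qkv = torch.randn(2, 5, embedding_size)   # (batch, entities, embedding)
output = sublayer(qkv)
output = res_norm(output + qkv)           # residual add, then layer norm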

ml-agents/mlagents/trainers/torch/layers.py (23 changes)


    )
    return lstm

class LayerNorm(torch.nn.Module):
    def __init__(self, input_size: int, elementwise_affine: bool = False):
        super().__init__()
        self.gamma = torch.nn.Parameter(
            torch.ones(input_size, requires_grad=elementwise_affine)
        )
        self.beta = torch.nn.Parameter(
            torch.zeros(input_size, requires_grad=elementwise_affine)
        )

    def forward(self, layer_activations: torch.Tensor):
        mean = torch.mean(layer_activations, dim=-1, keepdim=True)
        centered_activations = layer_activations - mean
        var = torch.mean(centered_activations ** 2, dim=-1, keepdim=True)
        return centered_activations / (torch.sqrt(var + 1e-5)) * self.gamma + self.beta
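This hand-rolled LayerNorm normalizes over the last dimension with eps 1e-5, which should match torch.nn.LayerNorm with unit gamma and zero beta; a quick sanity-check sketch, assuming the class above is in scope. One caveat worth checking: torch.nn.Parameter defaults to requires_grad=True regardless of the flag on the tensor passed in, so elementwise_affine=False may not actually freeze gamma and beta as written.

import torch

x = torch.randn(4, 10)
custom = LayerNorm(10)                                     # class defined above
builtin = torch.nn.LayerNorm(10, elementwise_affine=False)
print(torch.allclose(custom(x), builtin(x), atol=1e-5))    # expected: True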
class MemoryModule(torch.nn.Module):
    @abc.abstractproperty

    Linear layers.
    """
-   def __init__(self, input_size: int, num_layers: int, hidden_size: int):
+   def __init__(self, input_size: int, num_layers: int, hidden_size: int, layer_norm=False):
        super().__init__()
        self.layers = [
            linear_layer(

            )
        ]
        self.layers.append(Swish())
        if layer_norm:
            self.layers.append(torch.nn.LayerNorm(hidden_size, elementwise_affine=True))
        for _ in range(num_layers - 1):
            self.layers.append(
                linear_layer(

                    kernel_gain=1.0,
                )
            )
            if layer_norm:
                self.layers.append(torch.nn.LayerNorm(hidden_size, elementwise_affine=True))
            self.layers.append(Swish())
        self.seq_layers = torch.nn.Sequential(*self.layers)
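With layer_norm=True, LinearEncoder now interleaves LayerNorm with the linear layers and Swish activations. A standalone sketch of the resulting stack, using stock torch modules (torch.nn.SiLU stands in for Swish, torch.nn.Linear for linear_layer):

import torch

def make_encoder(input_size: int, num_layers: int, hidden_size: int, layer_norm: bool = False):
    # Mirrors the ordering above: Linear, Swish, [LayerNorm], then for each
    # extra layer: Linear, [LayerNorm], Swish.
    layers = [torch.nn.Linear(input_size, hidden_size), torch.nn.SiLU()]
    if layer_norm:
        layers.append(torch.nn.LayerNorm(hidden_size, elementwise_affine=True))
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(hidden_size, hidden_size))
        if layer_norm:
            layers.append(torch.nn.LayerNorm(hidden_size, elementwise_affine=True))
        layers.append(torch.nn.SiLU())
    return torch.nn.Sequential(*layers)

print(make_encoder(8, 2, 32, layer_norm=True))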

ml-agents/mlagents/trainers/torch/networks.py (24 changes)


        #     self.h_size,
        #     self.h_size
        # )
        # self.transformer = SimpleTransformer(
        #     x_self_len,
        #     entities_sizes,
        #     self.h_size,
        #     self.h_size
        # )
-       self.transformer = SmallestAttention(x_self_len, entities_sizes, self.h_size, self.h_size)
+       self.transformer = SimpleTransformer(
+           x_self_len,
+           entities_sizes,
+           self.h_size,
+           self.h_size,
+       )
+       # self.transformer = SmallestAttention(x_self_len, entities_sizes, self.h_size, self.h_size)
-       total_enc_size = self.h_size + sum(self.embedding_sizes)
+       total_enc_size = self.h_size  # + sum(self.embedding_sizes)
        # total_enc_size = 128  # self.h_size + sum(self.embedding_sizes)
        n_layers = 1
        if self.use_fc:

        if total_enc_size == 0:
            raise Exception("No valid inputs to network.")
        for _, tens in list(self.transformer.named_parameters()):
            tens.retain_grad()
        for _, tens in list(self.linear_encoder.named_parameters()):
            tens.retain_grad()
        for processor in self.processors:
            if processor is not None:
                for _, tens in list(processor.named_parameters()):
                    tens.retain_grad()
        if self.use_lstm:
            self.lstm = LSTM(self.h_size, self.m_size)
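Two of the changes above appear to move together: attention.py stops concatenating x_self onto the attention output, so total_enc_size drops the sum(self.embedding_sizes) term and the downstream encoder only needs h_size inputs. A hedged sketch of that sizing (names and sizes below are illustrative). The retain_grad() calls keep .grad available for the stat logging added in optimizer_torch.py; for leaf parameters the gradient is retained by default, so this mainly matters if any of these tensors end up non-leaf.

import torch

h_size, embedding_sizes = 128, [64, 64]
total_enc_size = h_size  # previously h_size + sum(embedding_sizes)
linear_encoder = torch.nn.Linear(total_enc_size, h_size)   # stand-in for LinearEncoder

attention_out = torch.randn(2, h_size)      # transformer output, h_size wide
encoding = linear_encoder(attention_out)    # sizes line up without the embedding term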
