from mlagents.torch_utils import torch
from typing import Tuple, Optional, List
from mlagents.trainers.torch.layers import (
    LinearEncoder,
    Initialization,
    linear_layer,
    LayerNorm,
)
from mlagents.trainers.torch.model_serialization import exporting_to_onnx
from mlagents.trainers.exception import UnityTrainerException


# If not concatenating self, the input to each entity encoder is just the entity size
if not concat_self:
    self.self_size = 0
# Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
self.ent_encoders = torch.nn.ModuleList(
    [
        LinearEncoder(
            self.self_size + ent_size,
            1,
            embedding_size,
            kernel_init=Initialization.Normal,
            kernel_gain=(0.125 / embedding_size) ** 0.5,
        )
        for ent_size in self.entity_sizes
    ]
)
self.embedding_norm = LayerNorm()
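
# A minimal sketch (not part of this file) of what the scaled-normal initialization
# above is assumed to amount to for a single linear layer; `input_size` is a
# hypothetical stand-in for self.self_size + ent_size:
#
#     layer = torch.nn.Linear(input_size, embedding_size)
#     torch.nn.init.normal_(layer.weight)                    # unit-variance normal init
#     layer.weight.data *= (0.125 / embedding_size) ** 0.5   # T-Fixup-style gain
#     torch.nn.init.zeros_(layer.bias)
#
# Shrinking the initial weights this way keeps the residual stream well scaled
# early in training, which is the motivation in the T-Fixup paper linked above.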


def forward(
    self, x_self: torch.Tensor, entities: List[torch.Tensor]
) -> torch.Tensor:
    # self_and_ent holds one tensor per entity group, optionally concatenated
    # with x_self when concat_self is set; its construction is omitted in this excerpt.
    ...
    # Encode each entity group and concatenate along the entity dimension
    encoded_entities = torch.cat(
        [ent_encoder(x) for ent_encoder, x in zip(self.ent_encoders, self_and_ent)],
        dim=1,
    )
    encoded_entities = self.embedding_norm(encoded_entities)
    return encoded_entities
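
# Rough shape walk-through (an assumption, for illustration only): if x_self is
# [batch, self_size] and each tensor in `entities` is [batch, n_i, ent_size_i],
# then each ent_encoder maps its group to [batch, n_i, embedding_size], the
# torch.cat along dim=1 yields [batch, sum(n_i), embedding_size], and LayerNorm
# normalizes each embedded entity before the result is returned.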


@staticmethod
# (the decorated method is omitted in this excerpt)


self.attention = MultiHeadAttention(
    num_heads=num_heads, embedding_size=embedding_size
)
# Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
self.fc_q = linear_layer(
    embedding_size,
    embedding_size,
    kernel_init=Initialization.Normal,
    kernel_gain=(0.125 / embedding_size) ** 0.5,
)

self.residual_norm = LayerNorm()
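
# In the forward pass below, query/key/value are presumably the projections of
# the embedded entities, e.g. roughly:
#
#     query = self.fc_q(inp)   # [batch, num_entities, embedding_size]
#     key = self.fc_k(inp)
#     value = self.fc_v(inp)
#
# fc_k and fc_v are not shown in this excerpt; their names here are assumptions.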


def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor:
    # Gather the maximum number of entities and build the attention mask
    ...
    output, _ = self.attention(query, key, value, num_ent, num_ent, mask)
    # Residual
    output = self.fc_out(output) + inp
    output = self.residual_norm(output)
    ...
    # Residual between x_self and the output of the module
    return output
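
# For orientation, a minimal generic sketch of the same residual-attention pattern
# using torch.nn.MultiheadAttention (an illustration with assumed shapes, not the
# MultiHeadAttention class used above; embedding_size and num_heads assumed in scope):
#
#     attn = torch.nn.MultiheadAttention(embedding_size, num_heads, batch_first=True)
#     norm = torch.nn.LayerNorm(embedding_size)
#
#     def residual_block(x, key_padding_mask):
#         # x: [batch, num_entities, embedding_size]; True in the mask marks padding
#         attended, _ = attn(x, x, x, key_padding_mask=key_padding_mask)
#         return norm(attended + x)  # residual add, then normalize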