from typing import List, Tuple

import torch


class MultiHeadAttention(torch.nn.Module):
|
|
|
""" |
|
|
|
Multi Head Attention module. We do not use the regular Torch implementation since |
|
|
|
Barracuda does not support some operators it uses. |
|
|
|
Takes as input to the forward method 3 tensors: |
|
|
|
- query: of dimensions (batch_size, number_of_queries, embedding_size) |
|
|
|
- key: of dimensions (batch_size, number_of_keys, embedding_size) |
|
|
|
- value: of dimensions (batch_size, number_of_keys, embedding_size) |
|
|
|
The forward method will return 2 tensors: |
|
|
|
- The output: (batch_size, number_of_queries, embedding_size) |
|
|
|
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys) |
|
|
|
""" |
|
|
|
""" |
|
|
|
Multi Head Attention module. We do not use the regular Torch implementation since |
|
|
|
Barracuda does not support some operators it uses. |
|
|
|
Takes as input to the forward method 3 tensors: |
|
|
|
- query: of dimensions (batch_size, number_of_queries, embedding_size) |
|
|
|
- key: of dimensions (batch_size, number_of_keys, embedding_size) |
|
|
|
- value: of dimensions (batch_size, number_of_keys, embedding_size) |
|
|
|
The forward method will return 2 tensors: |
|
|
|
- The output: (batch_size, number_of_queries, embedding_size) |
|
|
|
- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys) |
|
|
|
:param embedding_size: The size of the embeddings that will be generated (should be |
|
|
|
dividable by the num_heads) |
|
|
|
:param total_max_elements: The maximum total number of entities that can be passed to |
|
|
|
the module |
|
|
|
:param num_heads: The number of heads of the attention module |
|
|
|
""" |
|
|
|
        super().__init__()
        self.n_heads = num_heads
        self.head_size: int = embedding_size // self.n_heads
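
    # The forward pass below is an illustrative sketch only: a plain scaled dot-product
    # multi-head attention that matches the shapes documented in the constructor
    # docstring. The original Barracuda-compatible implementation is not part of this
    # excerpt and may differ (e.g. learned projections or masking).
    def forward(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        b, n_q, _ = query.shape
        n_k = key.shape[1]
        # Split the embedding dimension into heads: (batch, num_heads, n, head_size)
        qh = query.reshape(b, n_q, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        kh = key.reshape(b, n_k, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        vh = value.reshape(b, n_k, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        # Attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
        scores = torch.matmul(qh, kh.permute(0, 1, 3, 2)) / (self.head_size ** 0.5)
        attention = torch.softmax(scores, dim=-1)
        # Output: (batch_size, number_of_queries, embedding_size)
        output = torch.matmul(attention, vh)
        output = output.permute(0, 2, 1, 3).reshape(b, n_q, self.n_heads * self.head_size)
        return output, attention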
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EntityEmbeddings(torch.nn.Module): |
|
|
|
""" |
|
|
|
""" |
|
|
|
|
|
|
|
    def __init__(
        self,
        x_self_size: int,
        entity_sizes: List[int],
        entity_num_max_elements: List[int],
        embedding_size: int,
        concat_self: bool = True,
    ):
|
|
|
""" |
|
|
|
A module that generates embeddings for a variable number of entities given "self" |
|
|
|
encoding as well as a representation of each entity of each type. The expected |
|
|
|
input of the forward method will be a list of tensors: each element of the |
|
|
|
list corresponds to a type of entity, the dimension of each tensor is : |
|
|
|
[batch_size, max_num_entities, entity_size] |
|
|
|
:param x_self_size: The size of the self embedding that will be concatenated |
|
|
|
with the entities |
|
|
|
:param entity_sizes: The size of each entity type |
|
|
|
:param entity_num_max_elements: A list of maximum number of entities, must be |
|
|
|
the same length as the number of entity tensors that will be passed to the |
|
|
|
forward method. |
|
|
|
:param embedding_size: The size of the output embeddings |
|
|
|
:param concat_self: If true, the x_self will be concatenated with the entities |
|
|
|
before embedding |
|
|
|
""" |
|
|
|
        super().__init__()
        self.self_size: int = x_self_size
        self.entity_sizes: List[int] = entity_sizes
        self.entity_num_max_elements: List[int] = entity_num_max_elements
        self.concat_self: bool = concat_self
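
        # This excerpt does not show how the per-entity-type encoders are built. The
        # construction below is a minimal, assumed sketch: one plain Linear encoder per
        # entity type, mapping the (optionally self-concatenated) entity vector to
        # embedding_size. The original implementation may use a different encoder.
        self.ent_encoders = torch.nn.ModuleList(
            [
                torch.nn.Linear(
                    (self.self_size if concat_self else 0) + ent_size, embedding_size
                )
                for ent_size in self.entity_sizes
            ]
        )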
|
|
|
|
|
|
|
|
|
|
    def forward(
        self, x_self: torch.Tensor, entities: List[torch.Tensor]
    ) -> torch.Tensor:
        if self.concat_self:
            # Concatenate all observations with self
            self_and_ent: List[torch.Tensor] = []
            for num_entities, ent in zip(self.entity_num_max_elements, entities):
                # Expand x_self so it can be concatenated to every entity of this type
                expanded_self = x_self.reshape(-1, 1, self.self_size)
                expanded_self = torch.cat([expanded_self] * num_entities, dim=1)
                self_and_ent.append(torch.cat([expanded_self, ent], dim=2))
        else:
            self_and_ent = entities
|
|
|
        # Encode and concatenate entities
        encoded_entities = torch.cat(
            [ent_encoder(x) for ent_encoder, x in zip(self.ent_encoders, self_and_ent)],
            dim=1,
        )
        return encoded_entities
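
    # Illustrative usage with hypothetical sizes (not from the original source):
    #   embed = EntityEmbeddings(
    #       x_self_size=16,
    #       entity_sizes=[8, 12],
    #       entity_num_max_elements=[3, 5],
    #       embedding_size=32,
    #   )
    #   x_self = torch.zeros(4, 16)                                # (batch, x_self_size)
    #   entities = [torch.zeros(4, 3, 8), torch.zeros(4, 5, 12)]   # per entity type
    #   encoded = embed(x_self, entities)                          # (4, 3 + 5, 32)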
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ResidualSelfAttention(torch.nn.Module): |
|
|
|
""" |
|
|
|
A simple architecture inspired from https://arxiv.org/pdf/1909.07528.pdf that uses |
|
|
|
multi head self attention to encode information about a "Self" and a list of |
|
|
|
relevant "Entities". |
|
|
|
""" |
|
|
|
|
|
|
|
    def __init__(
        self, embedding_size: int, total_max_elements: int, num_heads: int = 4
    ):
|
|
|
""" |
|
|
|
A simple architecture inspired from https://arxiv.org/pdf/1909.07528.pdf that uses |
|
|
|
multi head self attention to encode information about a "Self" and a list of |
|
|
|
relevant "Entities". |
|
|
|
:param embedding_size: The size of the embeddings that will be generated (should be |
|
|
|
dividable by the num_heads) |
|
|
|
:param total_max_elements: The maximum total number of entities that can be passed to |
|
|
|
the module |
|
|
|
:param num_heads: The number of heads of the attention module |
|
|
|
""" |
|
|
|
        super().__init__()
        self.max_num_ent = total_max_elements
        self.attention = MultiHeadAttention(
            num_heads=num_heads, embedding_size=embedding_size
        )
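
    # Illustrative usage with hypothetical sizes (not from the original source); the
    # rest of the module (e.g. its forward method) is not included in this excerpt:
    #   rsa = ResidualSelfAttention(embedding_size=32, total_max_elements=8)
    #   # rsa.attention applies multi head self attention over at most 8 encoded entities,
    #   # e.g. the output of EntityEmbeddings with sum(entity_num_max_elements) == 8.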
|
|
|