
Buffer key enums (#4907)

/bullet-hell-barracuda-test-1.3.1
GitHub · 3 years ago
Current commit
64fc7f43
25 changed files with 570 additions and 285 deletions
  1. .github/workflows/pytest.yml (2 changes)
  2. ml-agents/mlagents/trainers/buffer.py (392 changes)
  3. ml-agents/mlagents/trainers/demo_loader.py (16 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (22 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (33 changes)
  6. ml-agents/mlagents/trainers/sac/optimizer_torch.py (24 changes)
  7. ml-agents/mlagents/trainers/sac/trainer.py (7 changes)
  8. ml-agents/mlagents/trainers/tests/__init__.py (6 changes)
  9. ml-agents/mlagents/trainers/tests/mock_brain.py (9 changes)
  10. ml-agents/mlagents/trainers/tests/test_buffer.py (60 changes)
  11. ml-agents/mlagents/trainers/tests/test_demo_loader.py (9 changes)
  12. ml-agents/mlagents/trainers/tests/test_trajectory.py (29 changes)
  13. ml-agents/mlagents/trainers/tests/torch/test_ghost.py (23 changes)
  14. ml-agents/mlagents/trainers/tests/torch/test_policy.py (15 changes)
  15. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (68 changes)
  16. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (3 changes)
  17. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (18 changes)
  18. ml-agents/mlagents/trainers/tests/torch/test_sac.py (13 changes)
  19. ml-agents/mlagents/trainers/torch/action_log_probs.py (15 changes)
  20. ml-agents/mlagents/trainers/torch/agent_action.py (14 changes)
  21. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)
  22. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (16 changes)
  23. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (4 changes)
  24. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (12 changes)
  25. ml-agents/mlagents/trainers/trajectory.py (43 changes)
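The whole commit is a mechanical migration from string buffer keys to typed keys. As an orientation aid (not part of the PR itself), here is a hypothetical lookup table summarizing how the old string keys used in the removed lines map to the new AgentBufferKey values used in the added lines below:

from mlagents.trainers.buffer import BufferKey, ObservationKeyPrefix, RewardSignalUtil

# Hypothetical summary (not in the diff): old string key -> new typed key.
OLD_TO_NEW = {
    "done": BufferKey.DONE,
    "masks": BufferKey.MASKS,
    "memory": BufferKey.MEMORY,
    "action_mask": BufferKey.ACTION_MASK,
    "prev_action": BufferKey.PREV_ACTION,
    "environment_rewards": BufferKey.ENVIRONMENT_REWARDS,
    "continuous_action": BufferKey.CONTINUOUS_ACTION,
    "discrete_action": BufferKey.DISCRETE_ACTION,
    "continuous_log_probs": BufferKey.CONTINUOUS_LOG_PROBS,
    "discrete_log_probs": BufferKey.DISCRETE_LOG_PROBS,
    "advantages": BufferKey.ADVANTAGES,
    "discounted_returns": BufferKey.DISCOUNTED_RETURNS,
    "obs_0": (ObservationKeyPrefix.OBSERVATION, 0),
    "next_obs_0": (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
    "extrinsic_rewards": RewardSignalUtil.rewards_key("extrinsic"),
    "extrinsic_returns": RewardSignalUtil.returns_key("extrinsic"),
    "extrinsic_value_estimates": RewardSignalUtil.value_estimates_key("extrinsic"),
    "extrinsic_advantage": RewardSignalUtil.advantage_key("extrinsic"),
}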

.github/workflows/pytest.yml (2 changes)


jobs:
pytest:
runs-on: ubuntu-latest
env:
TEST_ENFORCE_BUFFER_KEY_TYPES: 1
strategy:
matrix:
python-version: [3.6.x, 3.7.x, 3.8.x]
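The workflow change only sets an environment variable; the trainers test package (see the tests/__init__.py hunk further down) turns it into strict key-type checking. A minimal sketch of that wiring:

import os
from mlagents.trainers.buffer import AgentBuffer

# Mirrors the tests/__init__.py hunk below: opt in to strict key checking only
# when the CI variable is set, so normal training runs are unaffected.
if os.getenv("TEST_ENFORCE_BUFFER_KEY_TYPES"):
    AgentBuffer.CHECK_KEY_TYPES_AT_RUNTIME = True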

ml-agents/mlagents/trainers/buffer.py (392 changes)


from collections import defaultdict
from collections.abc import MutableMapping
import enum
import itertools
from typing import BinaryIO, DefaultDict, List, Tuple, Union, Optional
from typing import List, BinaryIO
import itertools
from mlagents_envs.exception import UnityException

pass
class AgentBuffer(dict):
class BufferKey(enum.Enum):
ACTION_MASK = "action_mask"
CONTINUOUS_ACTION = "continuous_action"
CONTINUOUS_LOG_PROBS = "continuous_log_probs"
DISCRETE_ACTION = "discrete_action"
DISCRETE_LOG_PROBS = "discrete_log_probs"
DONE = "done"
ENVIRONMENT_REWARDS = "environment_rewards"
MASKS = "masks"
MEMORY = "memory"
PREV_ACTION = "prev_action"
ADVANTAGES = "advantages"
DISCOUNTED_RETURNS = "discounted_returns"
class ObservationKeyPrefix(enum.Enum):
OBSERVATION = "obs"
NEXT_OBSERVATION = "next_obs"
class RewardSignalKeyPrefix(enum.Enum):
# Reward signals
REWARDS = "rewards"
VALUE_ESTIMATES = "value_estimates"
RETURNS = "returns"
ADVANTAGE = "advantage"
AgentBufferKey = Union[
BufferKey, Tuple[ObservationKeyPrefix, int], Tuple[RewardSignalKeyPrefix, str]
]
class RewardSignalUtil:
@staticmethod
def rewards_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.REWARDS, name
@staticmethod
def value_estimates_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.VALUE_ESTIMATES, name
@staticmethod
def returns_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.RETURNS, name
@staticmethod
def advantage_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.ADVANTAGE, name
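AgentBufferKey is a union of three shapes: a plain BufferKey, an (ObservationKeyPrefix, index) pair, and a (RewardSignalKeyPrefix, signal name) pair built via the RewardSignalUtil helpers. A short illustrative sketch of each, assuming the definitions above:

from mlagents.trainers.buffer import (
    BufferKey,
    ObservationKeyPrefix,
    RewardSignalUtil,
)

done_key = BufferKey.DONE                                # plain enum key
obs_key = (ObservationKeyPrefix.OBSERVATION, 0)          # per-observation key
next_obs_key = (ObservationKeyPrefix.NEXT_OBSERVATION, 0)
gail_rewards_key = RewardSignalUtil.rewards_key("gail")  # (RewardSignalKeyPrefix.REWARDS, "gail")
gail_returns_key = RewardSignalUtil.returns_key("gail")  # (RewardSignalKeyPrefix.RETURNS, "gail")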
class AgentBufferField(list):
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has his own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
class AgentBufferField(list):
def __init__(self):
self.padding_value = 0
super().__init__()
def __str__(self):
return str(np.array(self).shape)
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
super().append(element)
self.padding_value = padding_value
def __init__(self):
self.padding_value = 0
super().__init__()
def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data, dtype=np.float32))
def __str__(self):
return str(np.array(self).shape)
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
"""
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
"""
super().append(element)
self.padding_value = padding_value
def get_batch(
self,
batch_size: int = None,
training_length: Optional[int] = 1,
sequential: bool = True,
) -> np.ndarray:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if training_length is None:
training_length = 1
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:], dtype=np.float32
)
else:
return np.array(
self[len(self) - batch_size * training_length :], dtype=np.float32
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data, dtype=np.float32))
def reset_field(self) -> None:
"""
Resets the AgentBufferField
"""
self[:] = []
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
def get_batch(
self,
batch_size: int = None,
training_length: int = 1,
sequential: bool = True,
) -> np.ndarray:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
)
else:
return np.array(
self[len(self) - batch_size * training_length :],
dtype=np.float32,
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch where"
" too large given the current number of data points."
)
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
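A minimal sketch of the two get_batch modes described in the docstring above, assuming the AgentBufferField class as shown:

import numpy as np
from mlagents.trainers.buffer import AgentBufferField

field = AgentBufferField()
for x in (1.0, 2.0, 3.0, 4.0, 5.0):
    field.append(np.array([x], dtype=np.float32))

# sequential=True: non-overlapping length-2 sequences; the first sequence is
# left-padded with padding_value (0 by default). Returned flat as
# [0, 1, 2, 3, 4, 5]; callers regroup it into sequences downstream.
print(field.get_batch(training_length=2, sequential=True).squeeze())

# sequential=False: overlapping windows, returned flat as
# [1, 2, 2, 3, 3, 4, 4, 5].
print(field.get_batch(training_length=2, sequential=False).squeeze())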
class AgentBuffer(MutableMapping):
"""
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has his own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
"""
def reset_field(self) -> None:
"""
Resets the AgentBufferField
"""
self[:] = []
# Whether or not to validate the types of keys at runtime
# This should be off for training, but enabled for testing
CHECK_KEY_TYPES_AT_RUNTIME = False
super().__init__()
self._fields: DefaultDict[AgentBufferKey, AgentBufferField] = defaultdict(
AgentBufferField
)
return ", ".join(["'{}' : {}".format(k, str(self[k])) for k in self.keys()])
return ", ".join(
["'{}' : {}".format(k, str(self[k])) for k in self._fields.keys()]
)
for k in self.keys():
self[k].reset_field()
for f in self._fields.values():
f.reset_field()
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBufferField()
return super().__getitem__(key)
@staticmethod
def _check_key(key):
if isinstance(key, BufferKey):
return
if isinstance(key, tuple):
key0, key1 = key
if isinstance(key0, ObservationKeyPrefix):
if isinstance(key1, int):
return
raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})")
if isinstance(key0, RewardSignalKeyPrefix):
if isinstance(key1, str):
return
raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})")
raise KeyError(f"{key} is a {type(key)}")
def check_length(self, key_list: List[str]) -> bool:
@staticmethod
def _encode_key(key: AgentBufferKey) -> str:
"""
Convert the key to a string representation so that it can be used for serialization.
"""
if isinstance(key, BufferKey):
return key.value
prefix, suffix = key
return f"{prefix.value}:{suffix}"
@staticmethod
def _decode_key(encoded_key: str) -> AgentBufferKey:
"""
Convert the string representation back to a key after serialization.
"""
# Simple case: convert the string directly to a BufferKey
try:
return BufferKey(encoded_key)
except ValueError:
pass
# Not a simple key, so split into two parts
prefix_str, _, suffix_str = encoded_key.partition(":")
# See if it's an ObservationKeyPrefix first
try:
return ObservationKeyPrefix(prefix_str), int(suffix_str)
except ValueError:
pass
# If not, it had better be a RewardSignalKeyPrefix
try:
return RewardSignalKeyPrefix(prefix_str), suffix_str
except ValueError:
raise ValueError(f"Unable to convert {encoded_key} to an AgentBufferKey")
def __getitem__(self, key: AgentBufferKey) -> AgentBufferField:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
return self._fields[key]
def __setitem__(self, key: AgentBufferKey, value: AgentBufferField) -> None:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
self._fields[key] = value
def __delitem__(self, key: AgentBufferKey) -> None:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
self._fields.__delitem__(key)
def __iter__(self):
return self._fields.__iter__()
def __len__(self) -> int:
return self._fields.__len__()
def __contains__(self, key):
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
return self._fields.__contains__(key)
def check_length(self, key_list: List[AgentBufferKey]) -> bool:
"""
Some methods will require that some fields have the same length.
check_length will return true if the fields in key_list

if self.CHECK_KEY_TYPES_AT_RUNTIME:
for k in key_list:
self._check_key(k)
if key not in self.keys():
if key not in self._fields:
return False
if (length is not None) and (length != len(self[key])):
return False

def shuffle(self, sequence_length: int, key_list: List[str] = None) -> None:
def shuffle(
self, sequence_length: int, key_list: List[AgentBufferKey] = None
) -> None:
"""
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.

key_list = list(self.keys())
key_list = list(self._fields.keys())
if not self.check_length(key_list):
raise BufferException(
"Unable to shuffle if the fields are not of same length"

:return: Dict of mini batch.
"""
mini_batch = AgentBuffer()
for key in self:
mini_batch[key] = self[key][start:end]
for key, field in self._fields.items():
# slicing AgentBufferField returns a List[Any]
mini_batch[key] = field[start:end] # type: ignore
return mini_batch
def sample_mini_batch(

"""
with h5py.File(file_object, "w") as write_file:
for key, data in self.items():
write_file.create_dataset(key, data=data, dtype="f", compression="gzip")
write_file.create_dataset(
self._encode_key(key), data=data, dtype="f", compression="gzip"
)
def load_from_file(self, file_object: BinaryIO) -> None:
"""

for key in list(read_file.keys()):
self[key] = AgentBuffer.AgentBufferField()
decoded_key = self._decode_key(key)
self[decoded_key] = AgentBufferField()
self[key].extend(read_file[key][()])
self[decoded_key].extend(read_file[key][()])
def truncate(self, max_length: int, sequence_length: int = 1) -> None:
"""

def resequence_and_append(
self,
target_buffer: "AgentBuffer",
key_list: List[str] = None,
key_list: List[AgentBufferKey] = None,
batch_size: int = None,
training_length: int = None,
) -> None:
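With CHECK_KEY_TYPES_AT_RUNTIME enabled (as the test suite now does), untyped string keys are rejected at the point of use. A small sketch, assuming the AgentBuffer shown above:

from mlagents.trainers.buffer import AgentBuffer, BufferKey

AgentBuffer.CHECK_KEY_TYPES_AT_RUNTIME = True   # tests-only setting; off by default
buf = AgentBuffer()
buf[BufferKey.ENVIRONMENT_REWARDS].append(1.0)  # typed key: accepted
try:
    buf["environment_rewards"]                  # legacy string key: rejected
except KeyError as err:
    print("rejected:", err)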

ml-agents/mlagents/trainers/demo_loader.py (16 changes)


import os
from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)

else:
current_obs = list(current_decision_step.values())[0].obs
demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
demo_raw_buffer[BufferKey.DONE].append(next_done)
demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward)
for i, obs in enumerate(current_obs):
demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
if (

if behavior_spec.action_spec.continuous_size > 0:
demo_raw_buffer["continuous_action"].append(
demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
demo_raw_buffer["discrete_action"].append(
demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
demo_raw_buffer["continuous_action"].append(
demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
demo_raw_buffer["discrete_action"].append(
demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
demo_raw_buffer["prev_action"].append(previous_action)
demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (22 changes)


from typing import Dict, cast
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy

old_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_value_estimates"]
batch[RewardSignalUtil.value_estimates_key(name)]
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
returns[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.returns_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
actions = AgentAction.from_buffer(batch)
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
ModelUtils.list_to_tensor(batch["advantages"]),
ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
log_probs,
old_log_probs,
loss_masks,
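Taken together, the optimizer hunk above replaces f-string keys with typed lookups. A hedged sketch (the helper name read_ppo_batch is hypothetical) of the fields the PPO update now pulls out of a mini-batch:

from mlagents.torch_utils import torch
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents.trainers.torch.utils import ModelUtils

def read_ppo_batch(batch: AgentBuffer, reward_signal_names):
    # Hypothetical helper mirroring the reads in TorchPPOOptimizer.update() above.
    old_values = {
        name: ModelUtils.list_to_tensor(batch[RewardSignalUtil.value_estimates_key(name)])
        for name in reward_signal_names
    }
    returns = {
        name: ModelUtils.list_to_tensor(batch[RewardSignalUtil.returns_key(name)])
        for name in reward_signal_names
    }
    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    advantages = ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES])
    return old_values, returns, act_masks, loss_masks, advantages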

ml-agents/mlagents/trainers/ppo/trainer.py (33 changes)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy

)
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(
v
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(v),

self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
evaluate_result
)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
local_rewards = agent_buffer_trajectory[
RewardSignalUtil.rewards_key(name)
].get_batch()
f"{name}_value_estimates"
RewardSignalUtil.value_estimates_key(name)
].get_batch()
local_advantage = get_gae(

)
local_return = local_advantage + local_value_estimates
# This is later use as target for the different value estimates
agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
local_return
)
agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
local_advantage
)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length

int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
advantages = self.update_buffer[BufferKey.ADVANTAGES].get_batch()
self.update_buffer[BufferKey.ADVANTAGES].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch

ml-agents/mlagents/trainers/sac/optimizer_torch.py (24 changes)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents_envs.timers import timed
from mlagents_envs.base_env import ActionSpec, ObservationSpec
from mlagents.trainers.exception import UnityTrainerException

"""
rewards = {}
for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
rewards[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.rewards_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

# Convert to tensors
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
actions = AgentAction.from_buffer(batch)
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
batch["memory"][i][self.policy.m_size // 2 :]
batch[BufferKey.MEMORY][i][self.policy.m_size // 2 :]
for i in range(offset, len(batch["memory"]), self.policy.sequence_length)
for i in range(
offset, len(batch[BufferKey.MEMORY]), self.policy.sequence_length
)
]
if len(memories_list) > 0:

memories=next_memories,
sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
dones = ModelUtils.list_to_tensor(batch["done"])
masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE])
q1_loss, q2_loss = self.sac_q_loss(
q1_stream, q2_stream, target_values, dones, rewards, masks

ml-agents/mlagents/trainers/sac/trainer.py (7 changes)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.policy import Policy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy

# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = (

last_step_obs = last_step.obs
for i, obs in enumerate(last_step_obs):
agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
agent_buffer_trajectory["done"][-1] = False
agent_buffer_trajectory[BufferKey.DONE][-1] = False
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
sampled_minibatch[f"{name}_rewards"] = (
sampled_minibatch[RewardSignalUtil.rewards_key(name)] = (
signal.evaluate(sampled_minibatch) * signal.strength
)

ml-agents/mlagents/trainers/tests/__init__.py (6 changes)


np.array = np_array_no_float64
np.zeros = np_zeros_no_float64
np.ones = np_ones_no_float64
if os.getenv("TEST_ENFORCE_BUFFER_KEY_TYPES"):
from mlagents.trainers.buffer import AgentBuffer
AgentBuffer.CHECK_KEY_TYPES_AT_RUNTIME = True

ml-agents/mlagents/trainers/tests/mock_brain.py (9 changes)


from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (

return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
)
def copy_buffer_fields(
buffer: AgentBuffer, src_key: AgentBufferKey, dst_keys: List[AgentBufferKey]
) -> None:
for dst_key in dst_keys:
buffer[dst_key] = buffer[src_key]
def simulate_rollout(
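The new copy_buffer_fields helper above replaces the repeated "buffer[x] = buffer[y]" lines in the test hunks below. A small usage sketch, assuming the helper as shown:

from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents.trainers.tests.mock_brain import copy_buffer_fields

buf = AgentBuffer()
buf[BufferKey.ENVIRONMENT_REWARDS].extend([0.1, 0.2, 0.3])
copy_buffer_fields(
    buf,
    src_key=BufferKey.ENVIRONMENT_REWARDS,
    dst_keys=[
        BufferKey.ADVANTAGES,
        RewardSignalUtil.rewards_key("extrinsic"),
        RewardSignalUtil.returns_key("extrinsic"),
    ],
)
# Each destination now refers to the same AgentBufferField object as the source.
assert buf[BufferKey.ADVANTAGES] is buf[BufferKey.ENVIRONMENT_REWARDS]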

ml-agents/mlagents/trainers/tests/test_buffer.py (60 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import (
AgentBuffer,
AgentBufferField,
BufferKey,
ObservationKeyPrefix,
RewardSignalKeyPrefix,
)
from mlagents.trainers.trajectory import ObsUtil
def assert_array(a, b):

def construct_fake_buffer(fake_agent_id):
b = AgentBuffer()
for step in range(9):
b["vector_observation"].append(
b[ObsUtil.get_name_at(0)].append(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,

b["action"].append(
b[BufferKey.CONTINUOUS_ACTION].append(
[100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]
)
return b

agent_1_buffer = construct_fake_buffer(1)
agent_2_buffer = construct_fake_buffer(2)
agent_3_buffer = construct_fake_buffer(3)
a = agent_1_buffer["vector_observation"].get_batch(
a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(
a = agent_2_buffer["vector_observation"].get_batch(
a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
batch_size=2, training_length=3, sequential=True
)
assert_array(

]
),
)
a = agent_2_buffer["vector_observation"].get_batch(
a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
batch_size=2, training_length=3, sequential=False
)
assert_array(

agent_3_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
assert len(update_buffer["action"]) == 20
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
assert np.array(update_buffer["action"]).shape == (20, 2)
assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)
assert np.array(c["action"]).shape == (1, 2)
assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
def fakerandint(values):

# Test non-LSTM
mb = update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
assert mb.keys() == update_buffer.keys()
assert np.array(mb["action"]).shape == (4, 2)
assert np.array(mb[BufferKey.CONTINUOUS_ACTION]).shape == (4, 2)
# Test LSTM
# We need to check if we ever get a breaking start - this will maximize the probability

assert np.array(mb["action"]).shape == (19, 2)
assert np.array(mb[BufferKey.CONTINUOUS_ACTION]).shape == (19, 2)
def test_num_experiences():

assert len(update_buffer["action"]) == 0
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 0
assert update_buffer.num_experiences == 0
agent_1_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2

)
assert len(update_buffer["action"]) == 20
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
assert update_buffer.num_experiences == 20

update_buffer.truncate(4, sequence_length=3)
assert update_buffer.num_experiences == 3
for buffer_field in update_buffer.values():
assert isinstance(buffer_field, AgentBuffer.AgentBufferField)
assert isinstance(buffer_field, AgentBufferField)
def test_key_encode_decode():
keys = (
list(BufferKey)
+ [(k, 42) for k in ObservationKeyPrefix]
+ [(k, "gail") for k in RewardSignalKeyPrefix]
)
for k in keys:
assert k == AgentBuffer._decode_key(AgentBuffer._encode_key(k))
def test_buffer_save_load():
original = construct_fake_buffer(3)
import io
write_buffer = io.BytesIO()
original.save_to_file(write_buffer)
loaded = AgentBuffer()
loaded.load_from_file(write_buffer)
assert len(original) == len(loaded)
for k in original.keys():
assert np.allclose(original[k], loaded[k])

ml-agents/mlagents/trainers/tests/test_demo_loader.py (9 changes)


get_demo_files,
write_delimited,
)
from mlagents.trainers.buffer import BufferKey
BEHAVIOR_SPEC = create_mock_3dball_behavior_specs()

_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
or len(demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1
)

_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
or len(demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1
)

ml-agents/mlagents/trainers/tests/test_trajectory.py (29 changes)


from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, ObservationKeyPrefix
VEC_OBS_SIZE = 6
ACTION_SIZE = 4

length = 15
wanted_keys = [
"next_obs_0",
"next_obs_1",
"obs_0",
"obs_1",
"memory",
"masks",
"done",
"continuous_action",
"discrete_action",
"continuous_log_probs",
"discrete_log_probs",
"action_mask",
"prev_action",
"environment_rewards",
(ObservationKeyPrefix.OBSERVATION, 0),
(ObservationKeyPrefix.OBSERVATION, 1),
(ObservationKeyPrefix.NEXT_OBSERVATION, 0),
(ObservationKeyPrefix.NEXT_OBSERVATION, 1),
BufferKey.MEMORY,
BufferKey.MASKS,
BufferKey.DONE,
BufferKey.CONTINUOUS_ACTION,
BufferKey.DISCRETE_ACTION,
BufferKey.CONTINUOUS_LOG_PROBS,
BufferKey.DISCRETE_LOG_PROBS,
BufferKey.ACTION_MASK,
BufferKey.PREV_ACTION,
BufferKey.ENVIRONMENT_REWARDS,
]
wanted_keys = set(wanted_keys)
trajectory = make_fake_trajectory(

ml-agents/mlagents/trainers/tests/torch/test_ghost.py (23 changes)


from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
# Mock out reward signal eval
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
buffer["extrinsic_returns"] = buffer["environment_rewards"]
buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
buffer["curiosity_rewards"] = buffer["environment_rewards"]
buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
copy_buffer_fields(
buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.rewards_key("extrinsic"),
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.rewards_key("curiosity"),
RewardSignalUtil.returns_key("curiosity"),
RewardSignalUtil.value_estimates_key("curiosity"),
],
)
trainer.trainer.update_buffer = buffer
# when ghost trainer advance and wrapped trainer buffers full

ml-agents/mlagents/trainers/tests/torch/test_policy.py (15 changes)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.buffer import BufferKey
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
agent_action = AgentAction.from_dict(buffer)
act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
agent_action = AgentAction.from_buffer(buffer)
ModelUtils.list_to_tensor(buffer["memory"][i])
for i in range(0, len(buffer["memory"]), policy.sequence_length)
ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
ModelUtils.list_to_tensor(buffer["memory"][i])
for i in range(0, len(buffer["memory"]), policy.sequence_length)
ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (68 changes)


from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ( # noqa: F401

)
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
@pytest.fixture

memory_size=optimizer.policy.m_size,
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
BufferKey.ENVIRONMENT_REWARDS,
[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
],
)
return_stats = optimizer.update(
update_buffer,

memory_size=optimizer.policy.m_size,
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("curiosity"),
RewardSignalUtil.value_estimates_key("curiosity"),
],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
],
)
update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
update_buffer[BufferKey.CONTINUOUS_ACTION]
)
optimizer.update(
update_buffer,

# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (3 changes)


import numpy as np
import pytest
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import BufferKey
from mlagents.trainers.torch.components.reward_providers import (
CuriosityRewardProvider,
create_reward_provider,

for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["continuous_action"][0])
target = torch.tensor(buffer[BufferKey.CONTINUOUS_ACTION][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (18 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.trajectory import ObsUtil

action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if behavior_spec.action_spec.continuous_size > 0:
action["continuous_action"] = action_buffer.continuous
action[BufferKey.CONTINUOUS_ACTION] = action_buffer.continuous
action["discrete_action"] = action_buffer.discrete
action[BufferKey.DISCRETE_ACTION] = action_buffer.discrete
for _ in range(number):
for i, obs in enumerate(curr_obs):

buffer["actions"].append(action)
# TODO
# buffer[AgentBufferKey.ACTIONS].append(action)
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)
# TODO was "rewards"
buffer[BufferKey.ENVIRONMENT_REWARDS].append(
np.ones(1, dtype=np.float32) * reward
)
buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
return buffer

ml-agents/mlagents/trainers/tests/torch/test_sac.py (13 changes)


import pytest
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb

BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=24
)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
update_buffer["curiosity_rewards"] = update_buffer["environment_rewards"]
update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
update_buffer[RewardSignalUtil.rewards_key("curiosity")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
return_stats = optimizer.update_reward_signals(
{"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
)

ml-agents/mlagents/trainers/torch/action_log_probs.py (15 changes)


from typing import List, Optional, NamedTuple, Dict
from typing import List, Optional, NamedTuple
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.base_env import _ActionTupleBase

return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
def from_buffer(buff: AgentBuffer) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.

if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
if BufferKey.CONTINUOUS_LOG_PROBS in buff:
continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_LOG_PROBS])
if BufferKey.DISCRETE_LOG_PROBS in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff[BufferKey.DISCRETE_LOG_PROBS]
)
# This will keep discrete_list = None which enables flatten()
if discrete_tensor.shape[1] > 0:
discrete = [

ml-agents/mlagents/trainers/torch/agent_action.py (14 changes)


from typing import List, Optional, NamedTuple, Dict
from typing import List, Optional, NamedTuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple

return action_tuple
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
def from_buffer(buff: AgentBuffer) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.

if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
if BufferKey.CONTINUOUS_ACTION in buff:
continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_ACTION])
if BufferKey.DISCRETE_ACTION in buff:
buff["discrete_action"], dtype=torch.long
buff[BufferKey.DISCRETE_ACTION], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
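from_dict(...) becomes from_buffer(...) for both AgentAction and ActionLogProbs. A short sketch of the new call pattern, assuming a buffer populated with the typed keys shown earlier:

import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

batch = AgentBuffer()
batch[BufferKey.CONTINUOUS_ACTION].extend(np.zeros((4, 2), dtype=np.float32))
batch[BufferKey.CONTINUOUS_LOG_PROBS].extend(np.zeros((4, 2), dtype=np.float32))

actions = AgentAction.from_buffer(batch)        # was AgentAction.from_dict(batch)
log_probs = ActionLogProbs.from_buffer(batch)   # was ActionLogProbs.from_dict(batch)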

ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)


# Convert to tensors
tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
act_masks = None
expert_actions = AgentAction.from_dict(mini_batch_demo)
expert_actions = AgentAction.from_buffer(mini_batch_demo)
if self.policy.behavior_spec.action_spec.discrete_size > 0:
act_masks = ModelUtils.list_to_tensor(

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (16 changes)


from typing import Dict, NamedTuple
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

Uses the current state embedding and the action of the mini_batch to predict
the next state embedding.
"""
actions = AgentAction.from_dict(mini_batch)
actions = AgentAction.from_buffer(mini_batch)
flattened_action = self._action_flattener.forward(actions)
forward_model_input = torch.cat(
(self.get_current_state(mini_batch), flattened_action), dim=1

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
actions = AgentAction.from_dict(mini_batch)
actions = AgentAction.from_buffer(mini_batch)
_inverse_loss = 0
if self._action_spec.continuous_size > 0:
sq_difference = (

_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
sq_difference,
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
ModelUtils.list_to_tensor(
mini_batch[BufferKey.MASKS], dtype=torch.float
),
2,
)[1]
)

ModelUtils.dynamic_partition(
cross_entropy,
ModelUtils.list_to_tensor(
mini_batch["masks"], dtype=torch.float
mini_batch[BufferKey.MASKS], dtype=torch.float
), # use masks not action_masks
2,
)[1]

return torch.mean(
ModelUtils.dynamic_partition(
self.compute_reward(mini_batch),
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
ModelUtils.list_to_tensor(
mini_batch[BufferKey.MASKS], dtype=torch.float
),
2,
)[1]
)

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (4 changes)


import numpy as np
from typing import Dict
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
return np.array(mini_batch["environment_rewards"], dtype=np.float32)
return np.array(mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (12 changes)


import numpy as np
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

Creates the action Tensor. In continuous case, corresponds to the action. In
the discrete case, corresponds to the concatenation of one hot action Tensors.
"""
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
return self._action_flattener.forward(AgentAction.from_buffer(mini_batch))
def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
"""

inputs = self.get_state_inputs(mini_batch)
if self._settings.use_actions:
actions = self.get_action_input(mini_batch)
dones = torch.as_tensor(mini_batch["done"], dtype=torch.float).unsqueeze(1)
dones = torch.as_tensor(
mini_batch[BufferKey.DONE], dtype=torch.float
).unsqueeze(1)
action_inputs = torch.cat([actions, dones], dim=1)
hidden, _ = self.encoder(inputs, action_inputs)
else:

expert_action = self.get_action_input(expert_batch)
action_epsilon = torch.rand(policy_action.shape)
policy_dones = torch.as_tensor(
policy_batch["done"], dtype=torch.float
policy_batch[BufferKey.DONE], dtype=torch.float
expert_batch["done"], dtype=torch.float
expert_batch[BufferKey.DONE], dtype=torch.float
).unsqueeze(1)
dones_epsilon = torch.rand(policy_dones.shape)
action_inputs = torch.cat(

ml-agents/mlagents/trainers/trajectory.py (43 changes)


from typing import List, NamedTuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import (
AgentBuffer,
ObservationKeyPrefix,
AgentBufferKey,
BufferKey,
)
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

class ObsUtil:
@staticmethod
def get_name_at(index: int) -> str:
def get_name_at(index: int) -> AgentBufferKey:
return f"obs_{index}"
return ObservationKeyPrefix.OBSERVATION, index
def get_name_at_next(index: int) -> str:
def get_name_at_next(index: int) -> AgentBufferKey:
return f"next_obs_{index}"
return ObservationKeyPrefix.NEXT_OBSERVATION, index
@staticmethod
def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]:

agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)
agent_buffer_trajectory[BufferKey.MEMORY].append(exp.memory)
agent_buffer_trajectory["masks"].append(1.0)
agent_buffer_trajectory["done"].append(exp.done)
agent_buffer_trajectory[BufferKey.MASKS].append(1.0)
agent_buffer_trajectory[BufferKey.DONE].append(exp.done)
agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
agent_buffer_trajectory["continuous_log_probs"].append(
agent_buffer_trajectory[BufferKey.CONTINUOUS_ACTION].append(
exp.action.continuous
)
agent_buffer_trajectory[BufferKey.DISCRETE_ACTION].append(
exp.action.discrete
)
agent_buffer_trajectory[BufferKey.CONTINUOUS_LOG_PROBS].append(
agent_buffer_trajectory["discrete_log_probs"].append(
agent_buffer_trajectory[BufferKey.DISCRETE_LOG_PROBS].append(
exp.action_probs.discrete
)

mask = 1 - np.concatenate(exp.action_mask)
agent_buffer_trajectory["action_mask"].append(mask, padding_value=1)
agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
mask, padding_value=1
)
agent_buffer_trajectory["action_mask"].append(
agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
agent_buffer_trajectory[BufferKey.PREV_ACTION].append(exp.prev_action)
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS].append(exp.reward)
# Store the next visual obs as the current
obs = next_obs
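ObsUtil now returns (prefix, index) tuples instead of formatted strings, so observation fields use the same typed-key machinery as everything else. A quick sketch:

from mlagents.trainers.buffer import ObservationKeyPrefix
from mlagents.trainers.trajectory import ObsUtil

# Was "obs_0" / "next_obs_1"; now tuple keys accepted by AgentBuffer.
assert ObsUtil.get_name_at(0) == (ObservationKeyPrefix.OBSERVATION, 0)
assert ObsUtil.get_name_at_next(1) == (ObservationKeyPrefix.NEXT_OBSERVATION, 1)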
