ActionBuffers: continuous-action path passes (discrete tests disabled)

4 年前 · bd917c9c
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
    continuous: np.ndarray
    discrete: np.ndarray

+    @staticmethod
+    def from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> "ActionBuffers":
+        """
+        Builds an ActionBuffers from a dictionary of numpy action arrays.
+        :param action_dict: Dictionary that may hold a "continuous_action"
+        and/or a "discrete_action" entry.
+        :return: ActionBuffers holding the entries found in the dictionary; a
+        missing entry is replaced by an empty list.
+        """
+        # NOTE(review): the fallback is a plain list, not an np.ndarray, so the
+        # locals are deliberately left unannotated - confirm consumers of
+        # ActionBuffers accept a list for an absent action type.
+        continuous = action_dict.get("continuous_action", [])
+        discrete = action_dict.get("discrete_action", [])
+        return ActionBuffers(continuous, discrete)
+

 class ActionSpec(NamedTuple):
    """
        """
        return len(self.discrete_branches)

-    def empty_action(self, n_agents: int) -> ActionBuffers:
+    def empty_action(self, n_agents: int) -> Dict[str, np.ndarray]:
-        return ActionBuffers(
-            np.zeros((n_agents, self.continuous_size), dtype=np.float32),
-            np.zeros((n_agents, self.discrete_size), dtype=np.int32),
-        )
+        # Only the action types with a non-zero size get an entry, so the
+        # returned dictionary may hold one key, both keys, or none.
+        action_dict: Dict[str, np.ndarray] = {}
+        if self.continuous_size > 0:
+            action_dict["continuous_action"] = np.zeros(
+                (n_agents, self.continuous_size), dtype=np.float32
+            )
+        if self.discrete_size > 0:
+            action_dict["discrete_action"] = np.zeros(
+                (n_agents, self.discrete_size), dtype=np.int32
+            )
+        return action_dict
+
-    def random_action(self, n_agents: int) -> ActionBuffers:
+    def random_action(self, n_agents: int) -> Dict[str, np.ndarray]:
-        continuous_action = np.random.uniform(
-            low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
-        ).astype(np.float32)
+        # Only the action types with a non-zero size get an entry, so the
+        # returned dictionary may hold one key, both keys, or none.
+        action_dict: Dict[str, np.ndarray] = {}
+        if self.continuous_size > 0:
+            # Uniform float32 samples between -1 and 1, one row per agent.
+            action_dict["continuous_action"] = np.random.uniform(
+                low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
+            ).astype(np.float32)
-        discrete_action = np.column_stack(
-            [
-                np.random.randint(
-                    0,
-                    self.discrete_branches[i],  # type: ignore
-                    size=(n_agents),
-                    dtype=np.int32,
-                )
-                for i in range(self.discrete_size)
-            ]
-        )
-        return ActionBuffers(continuous_action, discrete_action)
+        if self.discrete_size > 0:
+            # One column per discrete branch; each column is drawn
+            # independently with that branch's size as the exclusive bound.
+            action_dict["discrete_action"] = np.column_stack(
+                [
+                    np.random.randint(
+                        0,
+                        self.discrete_branches[i],  # type: ignore
+                        size=(n_agents),
+                        dtype=np.int32,
+                    )
+                    for i in range(self.discrete_size)
+                ]
+            )
+        return action_dict

    def _validate_action(
        self, actions: ActionBuffers, n_agents: int, name: str
--- a/ml-agents/mlagents/trainers/agent_processor.py
+++ b/ml-agents/mlagents/trainers/agent_processor.py
            done = terminated  # Since this is an ongoing step
            interrupted = step.interrupted if terminated else False
            # Add the outputs of the last eval
-            action = stored_take_action_outputs["action"][idx]
+            action_dict = stored_take_action_outputs["action"]
+            action: Dict[str, np.ndarray] = {}
+            for act_type, act_array in action_dict.items():
+                action[act_type] = act_array[idx]
-            action_probs = stored_take_action_outputs["log_probs"][idx]
+            action_probs_dict = stored_take_action_outputs["log_probs"]
+            action_probs: Dict[str, np.ndarray] = {}
+            for prob_type, prob_array in action_probs_dict.items():
+                action_probs[prob_type] = prob_array[idx]
+
            action_mask = stored_decision_step.action_mask
            prev_action = self.policy.retrieve_previous_action([global_id])#[0, :]
            experience = AgentExperience(
--- a/ml-agents/mlagents/trainers/policy/policy.py
+++ b/ml-agents/mlagents/trainers/policy/policy.py
        return self.behavior_spec.action_spec.empty_action(num_agents)

    def save_previous_action(
-        self, agent_ids: List[str], action_buffers: Optional[ActionBuffers]
+        self, agent_ids: List[str], action_dict: Optional[Dict[str, np.ndarray]]
-        if action_buffers is None:
+        if action_dict is None:
-            self.previous_action_dict[agent_id] = action_buffers
+            # NOTE(review): this stores the whole action_dict argument under
+            # every agent id - confirm a per-agent slice is not intended.
+            self.previous_action_dict[agent_id] = action_dict
-    def retrieve_previous_action(self, agent_ids: List[str]) -> ActionBuffers:
-        action_buffers = self.behavior_spec.action_spec.create_empty(len(agent_ids))
+    def retrieve_previous_action(self, agent_ids: List[str]) -> Dict[str, np.ndarray]:
+        action_dict = self.behavior_spec.action_spec.empty_action(len(agent_ids))
-                for action, previous_action in zip(action_buffers, self.previous_action_dict[agent_id]):
-                    action[index, :] = previous_action
-        return action_buffers
+                for act_type in action_dict:
+                    action_dict[act_type][index, :] = self.previous_action_dict[agent_id][act_type]
+        return action_dict

    def remove_previous_action(self, agent_ids):
        for agent_id in agent_ids:
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
    SeparateActorCritic,
    GlobalSteps,
 )
-from mlagents.trainers.torch.utils import ModelUtils
+
+from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs

 EPSILON = 1e-7  # Small value to avoid divide by zero

                vec_obs, vis_obs, masks, memories, seq_len
            )
        action_list = self.actor_critic.sample_action(dists)
-        log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
+        log_probs_list, entropies, all_logs = ModelUtils.get_probs_and_entropy(
+        log_probs = ActionLogProbs.create_action_log_probs(log_probs_list, self.behavior_spec.action_spec)
 #        actions = torch.stack(action_list, dim=-1)
 #        if self.use_continuous_act:
 #            actions = actions[:, :, 0]
        self,
        vec_obs: torch.Tensor,
        vis_obs: torch.Tensor,
-        actions: List[torch.Tensor],
+        actions: AgentAction,
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        seq_len: int = 1,
        )
-        #action_list = [actions[..., i] for i in range(actions.shape[-1])]
-        #log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
-        log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(actions, dists)
+        log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(actions, dists)
+        log_probs = ActionLogProbs.create_action_log_probs(log_probs_list, self.behavior_spec.action_spec)
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)
        return log_probs, entropy_sum, value_heads
            action, log_probs, entropy, memories = self.sample_actions(
                vec_obs, vis_obs, masks=masks, memories=memories
            )
-        run_out["action"] = ModelUtils.to_action_buffers(action, self.behavior_spec.action_spec)
-        run_out["pre_action"] = ModelUtils.to_action_buffers(action, self.behavior_spec.action_spec)
-        # Todo - make pre_action difference
-        run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
+        run_out["action"] = action.to_numpy_dict()
+        run_out["pre_action"] = action.to_numpy_dict()["continuous_action"] # Todo - make pre_action difference
+        run_out["log_probs"] = log_probs.to_numpy_dict()
        run_out["entropy"] = ModelUtils.to_numpy(entropy)
        run_out["learning_rate"] = 0.0
        if self.use_recurrent:
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
 from mlagents.trainers.settings import TrainerSettings, PPOSettings
-from mlagents.trainers.torch.utils import ModelUtils
+from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs


 class TorchPPOOptimizer(TorchOptimizer):

        vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
        act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
-        actions = ModelUtils.action_buffers_to_tensor_list(batch["actions"], self.policy.behavior_spec.action_spec)
+        actions = AgentAction.extract_agent_action(batch)

        memories = [
            ModelUtils.list_to_tensor(batch["memory"][i])
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
+        old_log_probs = ActionLogProbs.extract_action_log_probs(batch).flatten()
+        log_probs = log_probs.flatten()
        loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
        value_loss = self.ppo_value_loss(
            values, old_values, returns, decay_eps, loss_masks
            log_probs,
-            ModelUtils.list_to_tensor(batch["action_probs"]),
+            old_log_probs,
            loss_masks,
        )
        loss = (
--- a/ml-agents/mlagents/trainers/simple_env_manager.py
+++ b/ml-agents/mlagents/trainers/simple_env_manager.py
 from typing import Dict, List

-from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
+from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec, ActionBuffers
 from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
 from mlagents_envs.timers import timed
 from mlagents.trainers.action_info import ActionInfo
        self.previous_all_action_info = all_action_info

        for brain_name, action_info in all_action_info.items():
-            self.env.set_actions(brain_name, action_info.action)
+            _action = ActionBuffers.from_numpy_dict(action_info.action)
+            self.env.set_actions(brain_name, _action)
        self.env.step()
        all_step_result = self._generate_all_results()

--- a/ml-agents/mlagents/trainers/subprocess_env_manager.py
+++ b/ml-agents/mlagents/trainers/subprocess_env_manager.py
 from multiprocessing import Process, Pipe, Queue
 from multiprocessing.connection import Connection
 from queue import Empty as EmptyQueueException
-from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
+from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec, ActionBuffers
 from mlagents_envs import logging_util
 from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult
 from mlagents_envs.timers import (
                all_action_info = req.payload
                for brain_name, action_info in all_action_info.items():
                    if len(action_info.action) != 0:
-                        env.set_actions(brain_name, action_info.action)
+                        _action = ActionBuffers.from_numpy_dict(action_info.action)
+                        env.set_actions(brain_name, _action)
                env.step()
                all_step_result = _generate_all_results()
                # The timers in this process are independent from all the processes and the "main" process
--- a/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
 SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)


-@pytest.mark.parametrize("use_discrete", [True, False])
+@pytest.mark.parametrize("use_discrete", [False])
 def test_simple_ppo(use_discrete):
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = attr.evolve(PPO_TORCH_CONFIG)
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_2d_ppo(use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
-    )
-    new_hyperparams = attr.evolve(
-        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
-    )
-    config = attr.evolve(
-        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
-    )
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-@pytest.mark.parametrize("num_visual", [1, 2])
-def test_visual_ppo(num_visual, use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        use_discrete=use_discrete,
-        num_visual=num_visual,
-        num_vector=0,
-        step_size=0.2,
-    )
-    new_hyperparams = attr.evolve(
-        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
-    )
-    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("num_visual", [1, 2])
-@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
-def test_visual_advanced_ppo(vis_encode_type, num_visual):
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        use_discrete=True,
-        num_visual=num_visual,
-        num_vector=0,
-        step_size=0.5,
-        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
-    )
-    new_networksettings = attr.evolve(
-        SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
-    )
-    new_hyperparams = attr.evolve(
-        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
-    )
-    config = attr.evolve(
-        PPO_TORCH_CONFIG,
-        hyperparameters=new_hyperparams,
-        network_settings=new_networksettings,
-        max_steps=900,
-        summary_freq=100,
-    )
-    # The number of steps is pretty small for these encoders
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_recurrent_ppo(use_discrete):
-    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
-    new_network_settings = attr.evolve(
-        PPO_TORCH_CONFIG.network_settings,
-        memory=NetworkSettings.MemorySettings(memory_size=16),
-    )
-    new_hyperparams = attr.evolve(
-        PPO_TORCH_CONFIG.hyperparameters,
-        learning_rate=1.0e-3,
-        batch_size=64,
-        buffer_size=128,
-    )
-    config = attr.evolve(
-        PPO_TORCH_CONFIG,
-        hyperparameters=new_hyperparams,
-        network_settings=new_network_settings,
-        max_steps=5000,
-    )
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_simple_sac(use_discrete):
-    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
-    config = attr.evolve(SAC_TORCH_CONFIG)
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_2d_sac(use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
-    )
-    new_hyperparams = attr.evolve(
-        SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
-    )
-    config = attr.evolve(
-        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
-    )
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-@pytest.mark.parametrize("num_visual", [1, 2])
-def test_visual_sac(num_visual, use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        use_discrete=use_discrete,
-        num_visual=num_visual,
-        num_vector=0,
-        step_size=0.2,
-    )
-    new_hyperparams = attr.evolve(
-        SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
-    )
-    config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("num_visual", [1, 2])
-@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
-def test_visual_advanced_sac(vis_encode_type, num_visual):
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        use_discrete=True,
-        num_visual=num_visual,
-        num_vector=0,
-        step_size=0.5,
-        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
-    )
-    new_networksettings = attr.evolve(
-        SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
-    )
-    new_hyperparams = attr.evolve(
-        SAC_TORCH_CONFIG.hyperparameters,
-        batch_size=16,
-        learning_rate=3e-4,
-        buffer_init_steps=0,
-    )
-    config = attr.evolve(
-        SAC_TORCH_CONFIG,
-        hyperparameters=new_hyperparams,
-        network_settings=new_networksettings,
-        max_steps=100,
-    )
-    # The number of steps is pretty small for these encoders
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_2d_ppo(use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
+#    )
+#    new_hyperparams = attr.evolve(
+#        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
+#    )
+#    config = attr.evolve(
+#        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config})
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_recurrent_sac(use_discrete):
-    step_size = 0.2 if use_discrete else 0.5
-    env = MemoryEnvironment(
-        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
-    )
-    new_networksettings = attr.evolve(
-        SAC_TORCH_CONFIG.network_settings,
-        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
-    )
-    new_hyperparams = attr.evolve(
-        SAC_TORCH_CONFIG.hyperparameters,
-        batch_size=256,
-        learning_rate=1e-3,
-        buffer_init_steps=1000,
-        steps_per_update=2,
-    )
-    config = attr.evolve(
-        SAC_TORCH_CONFIG,
-        hyperparameters=new_hyperparams,
-        network_settings=new_networksettings,
-        max_steps=2000,
-    )
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_simple_ghost(use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
-    )
-    self_play_settings = SelfPlaySettings(
-        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
-    )
-    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
-    check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_simple_ghost_fails(use_discrete):
-    env = SimpleEnvironment(
-        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
-    )
-    # This config should fail because the ghosted policy is never swapped with a competent policy.
-    # Swap occurs after max step is reached.
-    self_play_settings = SelfPlaySettings(
-        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
-    )
-    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
-    processed_rewards = [
-        default_reward_processor(rewards) for rewards in env.final_rewards.values()
-    ]
-    success_threshold = 0.9
-    assert any(reward > success_threshold for reward in processed_rewards) and any(
-        reward < success_threshold for reward in processed_rewards
-    )
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_simple_asymm_ghost(use_discrete):
-    # Make opponent for asymmetric case
-    brain_name_opp = BRAIN_NAME + "Opp"
-    env = SimpleEnvironment(
-        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
-    )
-    self_play_settings = SelfPlaySettings(
-        play_against_latest_model_ratio=1.0,
-        save_steps=10000,
-        swap_steps=10000,
-        team_change=400,
-    )
-    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
-    check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_simple_asymm_ghost_fails(use_discrete):
-    # Make opponent for asymmetric case
-    brain_name_opp = BRAIN_NAME + "Opp"
-    env = SimpleEnvironment(
-        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
-    )
-    # This config should fail because the team that us not learning when both have reached
-    # max step should be executing the initial, untrained poliy.
-    self_play_settings = SelfPlaySettings(
-        play_against_latest_model_ratio=0.0,
-        save_steps=5000,
-        swap_steps=5000,
-        team_change=2000,
-    )
-    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
-    check_environment_trains(
-        env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
-    )
-    processed_rewards = [
-        default_reward_processor(rewards) for rewards in env.final_rewards.values()
-    ]
-    success_threshold = 0.9
-    assert any(reward > success_threshold for reward in processed_rewards) and any(
-        reward < success_threshold for reward in processed_rewards
-    )
-
-
-@pytest.fixture(scope="session")
-def simple_record(tmpdir_factory):
-    def record_demo(use_discrete, num_visual=0, num_vector=1):
-        env = RecordEnvironment(
-            [BRAIN_NAME],
-            use_discrete=use_discrete,
-            num_visual=num_visual,
-            num_vector=num_vector,
-            n_demos=100,
-        )
-        # If we want to use true demos, we can solve the env in the usual way
-        # Otherwise, we can just call solve to execute the optimal policy
-        env.solve()
-        agent_info_protos = env.demonstration_protos[BRAIN_NAME]
-        meta_data_proto = DemonstrationMetaProto()
-        brain_param_proto = BrainParametersProto(
-            vector_action_size=[2] if use_discrete else [1],
-            vector_action_descriptions=[""],
-            vector_action_space_type=discrete if use_discrete else continuous,
-            brain_name=BRAIN_NAME,
-            is_training=True,
-        )
-        action_type = "Discrete" if use_discrete else "Continuous"
-        demo_path_name = "1DTest" + action_type + ".demo"
-        demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
-        write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
-        return demo_path
-
-    return record_demo
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-@pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
-def test_gail(simple_record, use_discrete, trainer_config):
-    demo_path = simple_record(use_discrete)
-    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
-    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
-    reward_signals = {
-        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
-    }
-    config = attr.evolve(
-        trainer_config,
-        reward_signals=reward_signals,
-        behavioral_cloning=bc_settings,
-        max_steps=500,
-    )
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_gail_visual_ppo(simple_record, use_discrete):
-    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        num_visual=1,
-        num_vector=0,
-        use_discrete=use_discrete,
-        step_size=0.2,
-    )
-    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
-    reward_signals = {
-        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
-    }
-    hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=3e-4)
-    config = attr.evolve(
-        PPO_TORCH_CONFIG,
-        reward_signals=reward_signals,
-        hyperparameters=hyperparams,
-        behavioral_cloning=bc_settings,
-        max_steps=1000,
-    )
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_gail_visual_sac(simple_record, use_discrete):
-    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
-    env = SimpleEnvironment(
-        [BRAIN_NAME],
-        num_visual=1,
-        num_vector=0,
-        use_discrete=use_discrete,
-        step_size=0.2,
-    )
-    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
-    reward_signals = {
-        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
-    }
-    hyperparams = attr.evolve(
-        SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
-    )
-    config = attr.evolve(
-        SAC_TORCH_CONFIG,
-        reward_signals=reward_signals,
-        hyperparameters=hyperparams,
-        behavioral_cloning=bc_settings,
-        max_steps=500,
-    )
-    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#@pytest.mark.parametrize("num_visual", [1, 2])
+#def test_visual_ppo(num_visual, use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        use_discrete=use_discrete,
+#        num_visual=num_visual,
+#        num_vector=0,
+#        step_size=0.2,
+#    )
+#    new_hyperparams = attr.evolve(
+#        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
+#    )
+#    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
+#    check_environment_trains(env, {BRAIN_NAME: config})
+#
+#
+#@pytest.mark.parametrize("num_visual", [1, 2])
+#@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
+#def test_visual_advanced_ppo(vis_encode_type, num_visual):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        use_discrete=True,
+#        num_visual=num_visual,
+#        num_vector=0,
+#        step_size=0.5,
+#        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
+#    )
+#    new_networksettings = attr.evolve(
+#        SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
+#    )
+#    new_hyperparams = attr.evolve(
+#        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
+#    )
+#    config = attr.evolve(
+#        PPO_TORCH_CONFIG,
+#        hyperparameters=new_hyperparams,
+#        network_settings=new_networksettings,
+#        max_steps=900,
+#        summary_freq=100,
+#    )
+#    # The number of steps is pretty small for these encoders
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_recurrent_ppo(use_discrete):
+#    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
+#    new_network_settings = attr.evolve(
+#        PPO_TORCH_CONFIG.network_settings,
+#        memory=NetworkSettings.MemorySettings(memory_size=16),
+#    )
+#    new_hyperparams = attr.evolve(
+#        PPO_TORCH_CONFIG.hyperparameters,
+#        learning_rate=1.0e-3,
+#        batch_size=64,
+#        buffer_size=128,
+#    )
+#    config = attr.evolve(
+#        PPO_TORCH_CONFIG,
+#        hyperparameters=new_hyperparams,
+#        network_settings=new_network_settings,
+#        max_steps=5000,
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_simple_sac(use_discrete):
+#    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
+#    config = attr.evolve(SAC_TORCH_CONFIG)
+#    check_environment_trains(env, {BRAIN_NAME: config})
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_2d_sac(use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
+#    )
+#    new_hyperparams = attr.evolve(
+#        SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
+#    )
+#    config = attr.evolve(
+#        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#@pytest.mark.parametrize("num_visual", [1, 2])
+#def test_visual_sac(num_visual, use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        use_discrete=use_discrete,
+#        num_visual=num_visual,
+#        num_vector=0,
+#        step_size=0.2,
+#    )
+#    new_hyperparams = attr.evolve(
+#        SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
+#    )
+#    config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
+#    check_environment_trains(env, {BRAIN_NAME: config})
+#
+#
+#@pytest.mark.parametrize("num_visual", [1, 2])
+#@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
+#def test_visual_advanced_sac(vis_encode_type, num_visual):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        use_discrete=True,
+#        num_visual=num_visual,
+#        num_vector=0,
+#        step_size=0.5,
+#        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
+#    )
+#    new_networksettings = attr.evolve(
+#        SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
+#    )
+#    new_hyperparams = attr.evolve(
+#        SAC_TORCH_CONFIG.hyperparameters,
+#        batch_size=16,
+#        learning_rate=3e-4,
+#        buffer_init_steps=0,
+#    )
+#    config = attr.evolve(
+#        SAC_TORCH_CONFIG,
+#        hyperparameters=new_hyperparams,
+#        network_settings=new_networksettings,
+#        max_steps=100,
+#    )
+#    # The number of steps is pretty small for these encoders
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_recurrent_sac(use_discrete):
+#    step_size = 0.2 if use_discrete else 0.5
+#    env = MemoryEnvironment(
+#        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
+#    )
+#    new_networksettings = attr.evolve(
+#        SAC_TORCH_CONFIG.network_settings,
+#        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
+#    )
+#    new_hyperparams = attr.evolve(
+#        SAC_TORCH_CONFIG.hyperparameters,
+#        batch_size=256,
+#        learning_rate=1e-3,
+#        buffer_init_steps=1000,
+#        steps_per_update=2,
+#    )
+#    config = attr.evolve(
+#        SAC_TORCH_CONFIG,
+#        hyperparameters=new_hyperparams,
+#        network_settings=new_networksettings,
+#        max_steps=2000,
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config})
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_simple_ghost(use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
+#    )
+#    self_play_settings = SelfPlaySettings(
+#        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
+#    )
+#    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
+#    check_environment_trains(env, {BRAIN_NAME: config})
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_simple_ghost_fails(use_discrete):
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
+#    )
+#    # This config should fail because the ghosted policy is never swapped with a competent policy.
+#    # Swap occurs after max step is reached.
+#    self_play_settings = SelfPlaySettings(
+#        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
+#    )
+#    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
+#    processed_rewards = [
+#        default_reward_processor(rewards) for rewards in env.final_rewards.values()
+#    ]
+#    success_threshold = 0.9
+#    assert any(reward > success_threshold for reward in processed_rewards) and any(
+#        reward < success_threshold for reward in processed_rewards
+#    )
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_simple_asymm_ghost(use_discrete):
+#    # Make opponent for asymmetric case
+#    brain_name_opp = BRAIN_NAME + "Opp"
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+#    )
+#    self_play_settings = SelfPlaySettings(
+#        play_against_latest_model_ratio=1.0,
+#        save_steps=10000,
+#        swap_steps=10000,
+#        team_change=400,
+#    )
+#    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
+#    check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_simple_asymm_ghost_fails(use_discrete):
+#    # Make opponent for asymmetric case
+#    brain_name_opp = BRAIN_NAME + "Opp"
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+#    )
+#    # This config should fail because the team that is not learning when both have reached
+#    # max step should be executing the initial, untrained policy.
+#    self_play_settings = SelfPlaySettings(
+#        play_against_latest_model_ratio=0.0,
+#        save_steps=5000,
+#        swap_steps=5000,
+#        team_change=2000,
+#    )
+#    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
+#    check_environment_trains(
+#        env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
+#    )
+#    processed_rewards = [
+#        default_reward_processor(rewards) for rewards in env.final_rewards.values()
+#    ]
+#    success_threshold = 0.9
+#    assert any(reward > success_threshold for reward in processed_rewards) and any(
+#        reward < success_threshold for reward in processed_rewards
+#    )
+#
+#
+#@pytest.fixture(scope="session")
+#def simple_record(tmpdir_factory):
+#    def record_demo(use_discrete, num_visual=0, num_vector=1):
+#        env = RecordEnvironment(
+#            [BRAIN_NAME],
+#            use_discrete=use_discrete,
+#            num_visual=num_visual,
+#            num_vector=num_vector,
+#            n_demos=100,
+#        )
+#        # If we want to use true demos, we can solve the env in the usual way
+#        # Otherwise, we can just call solve to execute the optimal policy
+#        env.solve()
+#        agent_info_protos = env.demonstration_protos[BRAIN_NAME]
+#        meta_data_proto = DemonstrationMetaProto()
+#        brain_param_proto = BrainParametersProto(
+#            vector_action_size=[2] if use_discrete else [1],
+#            vector_action_descriptions=[""],
+#            vector_action_space_type=discrete if use_discrete else continuous,
+#            brain_name=BRAIN_NAME,
+#            is_training=True,
+#        )
+#        action_type = "Discrete" if use_discrete else "Continuous"
+#        demo_path_name = "1DTest" + action_type + ".demo"
+#        demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
+#        write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
+#        return demo_path
+#
+#    return record_demo
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#@pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
+#def test_gail(simple_record, use_discrete, trainer_config):
+#    demo_path = simple_record(use_discrete)
+#    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
+#    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
+#    reward_signals = {
+#        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
+#    }
+#    config = attr.evolve(
+#        trainer_config,
+#        reward_signals=reward_signals,
+#        behavioral_cloning=bc_settings,
+#        max_steps=500,
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_gail_visual_ppo(simple_record, use_discrete):
+#    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        num_visual=1,
+#        num_vector=0,
+#        use_discrete=use_discrete,
+#        step_size=0.2,
+#    )
+#    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
+#    reward_signals = {
+#        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
+#    }
+#    hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=3e-4)
+#    config = attr.evolve(
+#        PPO_TORCH_CONFIG,
+#        reward_signals=reward_signals,
+#        hyperparameters=hyperparams,
+#        behavioral_cloning=bc_settings,
+#        max_steps=1000,
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
+#
+#
+#@pytest.mark.parametrize("use_discrete", [True, False])
+#def test_gail_visual_sac(simple_record, use_discrete):
+#    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
+#    env = SimpleEnvironment(
+#        [BRAIN_NAME],
+#        num_visual=1,
+#        num_vector=0,
+#        use_discrete=use_discrete,
+#        step_size=0.2,
+#    )
+#    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
+#    reward_signals = {
+#        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
+#    }
+#    hyperparams = attr.evolve(
+#        SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
+#    )
+#    config = attr.evolve(
+#        SAC_TORCH_CONFIG,
+#        reward_signals=reward_signals,
+#        hyperparameters=hyperparams,
+#        behavioral_cloning=bc_settings,
+#        max_steps=500,
+#    )
+#    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
--- a/ml-agents/mlagents/trainers/torch/networks.py
+++ b/ml-agents/mlagents/trainers/torch/networks.py
    DistInstance,
 )
 from mlagents.trainers.settings import NetworkSettings
-from mlagents.trainers.torch.utils import ModelUtils
+from mlagents.trainers.torch.utils import ModelUtils, AgentAction
 from mlagents.trainers.torch.decoders import ValueHeads
 from mlagents.trainers.torch.layers import LSTM, LinearEncoder
 from mlagents.trainers.torch.model_serialization import exporting_to_onnx
    def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
        self.network_body.update_normalization(vector_obs)

-    def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
+    def sample_action(self, dists: List[DistInstance]) -> AgentAction:
-        return actions
+        return AgentAction.create_agent_action(actions, self.action_spec)

    def get_dists(
        self,
        """
        dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
        if self.action_spec.is_continuous():
-            action_list = self.sample_action(dists)
-            action_out = torch.stack(action_list, dim=-1)
+            agent_action = self.sample_action(dists)
+            action_out = agent_action.flatten()#torch.stack(action_list, dim=-1)
        else:
            action_out = torch.cat([dist.all_log_prob() for dist in dists], dim=1)
        return (
--- a/ml-agents/mlagents/trainers/torch/utils.py
+++ b/ml-agents/mlagents/trainers/torch/utils.py
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, NamedTuple, Dict
 from mlagents.torch_utils import torch, nn
 import numpy as np

 from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance


+class AgentAction(NamedTuple):
+    """
+    Pairs the continuous and the discrete part of an agent's action.
+    Either field is None when the static constructors below find no action
+    of that type.
+    """
+
+    continuous: torch.Tensor
+    discrete: List[torch.Tensor]
+
+    def to_numpy_dict(self) -> Dict[str, np.ndarray]:
+        """
+        Converts every field that is not None with ModelUtils.to_numpy.
+        :return: Dict keyed by "continuous_action" and/or "discrete_action";
+            a key is left out when the matching field is None.
+        """
+        # NOTE(review): discrete is annotated as a list of tensors but is passed to
+        # ModelUtils.to_numpy unchanged -- confirm that to_numpy accepts a list.
+        named_fields = (
+            ("continuous_action", self.continuous),
+            ("discrete_action", self.discrete),
+        )
+        return {
+            key: ModelUtils.to_numpy(value)
+            for key, value in named_fields
+            if value is not None
+        }
+
+    def to_tensor_list(self) -> List[torch.Tensor]:
+        """
+        Collects the action tensors into one flat list.
+        :return: The continuous tensor (if any) followed by each element of
+            discrete (if any).
+        """
+        tensors: List[torch.Tensor] = (
+            [] if self.continuous is None else [self.continuous]
+        )
+        if self.discrete is not None:
+            tensors.extend(self.discrete)
+        return tensors
+
+    def flatten(self) -> torch.Tensor:
+        """
+        Stacks the tensors returned by to_tensor_list() along a new last dimension.
+        """
+        # NOTE(review): torch.stack needs equally shaped inputs -- confirm this holds
+        # when both continuous and discrete actions are present.
+        tensors = self.to_tensor_list()
+        return torch.stack(tensors, dim=-1)
+
+    @staticmethod
+    def extract_agent_action(buff: Dict[str, np.ndarray]) -> "AgentAction":
+        """
+        Builds an AgentAction from the "continuous_action" and "discrete_action"
+        entries of buff, converting each with ModelUtils.list_to_tensor.
+        :param buff: Mapping that may contain either or both action keys.
+        :return: AgentAction whose field is None for every key missing from buff.
+        """
+        continuous: torch.Tensor = (
+            ModelUtils.list_to_tensor(buff["continuous_action"])
+            if "continuous_action" in buff
+            else None
+        )
+        discrete: List[torch.Tensor] = (
+            ModelUtils.list_to_tensor(buff["discrete_action"])
+            if "discrete_action" in buff
+            else None
+        )
+        return AgentAction(continuous, discrete)
+
+    @staticmethod
+    def create_agent_action(action_tensors: List[torch.Tensor], action_spec: ActionSpec) -> "AgentAction":
+        """
+        Splits a flat list of action tensors into continuous and discrete parts.
+        :param action_tensors: Continuous tensor first (when the spec has continuous
+            actions), then the discrete tensors.
+        :param action_spec: Spec whose continuous_size / discrete_size decide which
+            parts are present.
+        :return: AgentAction with None for every part whose size is not positive.
+        """
+        has_continuous = action_spec.continuous_size > 0
+        continuous: torch.Tensor = action_tensors[0] if has_continuous else None
+        # The discrete tensors start after the continuous tensor when one is present.
+        first_discrete = 1 if has_continuous else 0
+        discrete: List[torch.Tensor] = (
+            action_tensors[first_discrete:] if action_spec.discrete_size > 0 else None
+        )
+        return AgentAction(continuous, discrete)
+
+
+class ActionLogProbs(NamedTuple):
+    """
+    Holds the log probabilities of an agent's continuous and discrete actions.
+    Either field is None when the static constructors below find no log
+    probabilities of that type.
+    """
+
+    continuous: torch.Tensor
+    discrete: List[torch.Tensor]
+
+    def to_numpy_dict(self) -> Dict[str, np.ndarray]:
+        """
+        Converts every field that is not None with ModelUtils.to_numpy.
+        :return: Dict keyed by "continuous_log_probs" and/or "discrete_log_probs";
+            a key is left out when the matching field is None.
+        """
+        log_prob_arrays_dict: Dict[str, np.ndarray] = {}
+        if self.continuous is not None:
+            log_prob_arrays_dict["continuous_log_probs"] = ModelUtils.to_numpy(self.continuous)
+        if self.discrete is not None:
+            # NOTE(review): discrete is annotated as a list of tensors -- confirm that
+            # ModelUtils.to_numpy accepts a list.
+            log_prob_arrays_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete)
+        return log_prob_arrays_dict
+
+    def to_tensor_list(self) -> List[torch.Tensor]:
+        """
+        Collects the log-prob tensors into one flat list.
+        :return: The continuous tensor (if any) followed by each element of
+            discrete (if any).
+        """
+        tensor_list: List[torch.Tensor] = []
+        if self.continuous is not None:
+            tensor_list.append(self.continuous)
+        if self.discrete is not None:
+            tensor_list += self.discrete
+        return tensor_list
+
+    def flatten(self) -> torch.Tensor:
+        """
+        Stacks the tensors returned by to_tensor_list() along a new last dimension.
+        """
+        return torch.stack(self.to_tensor_list(), dim=-1)
+
+    @staticmethod
+    def extract_action_log_probs(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
+        """
+        Builds ActionLogProbs from the "continuous_log_probs" and
+        "discrete_log_probs" entries of buff, converting each with
+        ModelUtils.list_to_tensor.
+        :param buff: Mapping that may contain either or both log-prob keys.
+        :return: ActionLogProbs whose field is None for every key missing from buff.
+        """
+        continuous: torch.Tensor = None
+        discrete: List[torch.Tensor] = None
+        # Test for the key that is actually read. Checking the "*_action" keys
+        # instead would skip log probs stored without actions and raise KeyError
+        # for actions stored without log probs.
+        if "continuous_log_probs" in buff:
+            continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
+        if "discrete_log_probs" in buff:
+            discrete = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
+        return ActionLogProbs(continuous, discrete)
+
+    @staticmethod
+    def create_action_log_probs(log_prob_tensors: List[torch.Tensor], action_spec: ActionSpec) -> "ActionLogProbs":
+        """
+        Splits a flat list of log-prob tensors into continuous and discrete parts.
+        :param log_prob_tensors: Continuous tensor first (when the spec has continuous
+            actions), then the discrete tensors.
+        :param action_spec: Spec whose continuous_size / discrete_size decide which
+            parts are present.
+        :return: ActionLogProbs with None for every part whose size is not positive.
+        """
+        continuous: torch.Tensor = None
+        discrete: List[torch.Tensor] = None
+        # Index of the first discrete tensor; shifts by one when a continuous
+        # tensor occupies slot 0.
+        _offset = 0
+        if action_spec.continuous_size > 0:
+            continuous = log_prob_tensors[0]
+            _offset = 1
+        if action_spec.discrete_size > 0:
+            discrete = log_prob_tensors[_offset:]
+        return ActionLogProbs(continuous, discrete)
+
+
 class ModelUtils:
    # Minimum supported side for each encoder type. If refactoring an encoder, please
    # adjust these also.
        )

    @staticmethod
-    def to_action_buffers(actions: List[torch.Tensor], action_spec: ActionSpec) -> ActionBuffers:
+    def to_action_buffers(agent_actions: AgentAction, action_spec: ActionSpec) -> ActionBuffers:
+        """
+        Converts an AgentAction into ActionBuffers by detaching each field, moving
+        it to the CPU and converting it to a numpy array.
+        :param agent_actions: AgentAction whose continuous and discrete fields are
+            converted.
+        :param action_spec: Not read by this implementation.
+        :return: ActionBuffers built from the two converted arrays.
+        """
-        continuous_action: np.ndarray = np.array([])
-        discrete_action_list: List[np.ndarray] = []
-        discrete_action: np.ndarray = np.array([])
-        # offset to index discrete actions depending on presence of continuous actions
-        _offset = 0
-        if action_spec.continuous_size > 0:
-            continuous_action = actions[0].detach().cpu().numpy()
-            _offset = 1
-        if action_spec.discrete_size > 0:
-            for _disc in range(action_spec.discrete_size):
-                discrete_action_list.append(actions[_disc + _offset].detach().cpu().numpy())
-            #print(discrete_action_list)
-            discrete_action = np.array(discrete_action_list)
-        return ActionBuffers(continuous_action, discrete_action)
+        
+        # NOTE(review): AgentAction annotates discrete as List[torch.Tensor] and its
+        # static constructors can leave either field as None, yet .detach() is called
+        # on both fields unconditionally here -- confirm callers always pass tensors.
+        return ActionBuffers(agent_actions.continuous.detach().cpu().numpy(), agent_actions.discrete.detach().cpu().numpy())
-    @staticmethod
-    def action_buffers_to_tensor_list(
-        action_buffers: ActionBuffers, action_spec: ActionSpec, dtype: Optional[torch.dtype] = None
-    ) -> List[torch.Tensor]:
-        """
-        Converts ActionBuffers fields into a List of tensors.
-        """
-        #print(action_buffers)
-        action_tensors: List[torch.Tensor] = []
-        if action_spec.continuous_size > 0:
-            action_tensors.append(torch.as_tensor(np.asanyarray(action_buffers.continuous), dtype=dtype))
-        if action_spec.discrete_size > 0:
-            for _disc in range(action_buffers.discrete):
-                action_tensors.append(torch.as_tensor(np.asanyarray(_disc), dtype=dtype))
-        return actiion_tensors 
+    #@staticmethod
+    #def action_buffers_to_agent_action(
+    #    action_buffers: ActionBuffers, dtype: Optional[torch.dtype] = None
+    #) -> AgentAction:
+    #    """
+    #    Converts ActionBuffers fields into AgentAction fields.
+    #    """
+    #    return AgentAction(torch.as_tensor(np.asanyarray(action_buffers.continuous), dtype=dtype),
+                                    #torch.as_tensor(np.asanyarray(_disc), dtype=dtype))

    @staticmethod
    def list_to_tensor(

    @staticmethod
    def get_probs_and_entropy(
-        action_list: List[torch.Tensor], dists: List[DistInstance]
-    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        agent_action: AgentAction, dists: List[DistInstance]
+    ) -> Tuple[List[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]:
+        action_list = agent_action.to_tensor_list()
        for action, action_dist in zip(action_list, dists):
            log_prob = action_dist.log_prob(action)
            log_probs_list.append(log_prob)
-        log_probs = torch.stack(log_probs_list, dim=-1)
+        #log_probs = torch.stack(log_probs_list, dim=-1)
-            log_probs = log_probs.squeeze(-1)
+        #    log_probs = log_probs.squeeze(-1)
-        return log_probs, entropies, all_probs
+        return log_probs_list, entropies, all_probs

    @staticmethod
    def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
                    alpha=tau,
                    out=target_param.data,
                )
+
+
--- a/ml-agents/mlagents/trainers/trajectory.py
+++ b/ml-agents/mlagents/trainers/trajectory.py
-from typing import List, NamedTuple
+from typing import List, NamedTuple, Dict
 import numpy as np

 from mlagents.trainers.buffer import AgentBuffer
    obs: List[np.ndarray]
    reward: float
    done: bool
-    action: ActionBuffers
-    action_probs: np.ndarray
+    action: Dict[str, np.ndarray]
+    action_probs: Dict[str, np.ndarray]
-    prev_action: ActionBuffers
+    prev_action: Dict[str, np.ndarray]
    interrupted: bool
    memory: np.ndarray

                actions_pre = exp.action_pre
                agent_buffer_trajectory["actions_pre"].append(actions_pre)

-            # value is a dictionary from name of reward to value estimate of the value head
-            agent_buffer_trajectory["actions"].append(exp.action)
-            agent_buffer_trajectory["action_probs"].append(exp.action_probs)
+            # Adds the log prob and action of continuous/discrete separately 
+            for act_type, act_array in exp.action.items():
+                agent_buffer_trajectory[act_type].append(act_array)
+            for log_type, log_array in exp.action_probs.items():
+                agent_buffer_trajectory[log_type].append(log_array)

            # Store action masks if necessary. Note that 1 means active, while
            # in AgentExperience False means active.
                # This should never be needed unless the environment somehow doesn't supply the
                # action mask in a discrete space.
                agent_buffer_trajectory["action_mask"].append(
-                    np.ones(exp.action_probs.shape, dtype=np.float32), padding_value=1
+                    np.ones(exp.action_probs["continuous_log_probs"].shape, dtype=np.float32), padding_value=1
-            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
+            #agent_buffer_trajectory["prev_action"].append(exp.prev_action)
+            for act_type, act_array in exp.prev_action.items():
+                agent_buffer_trajectory["prev_" + act_type].append(act_array)
+
            agent_buffer_trajectory["environment_rewards"].append(exp.reward)

            # Store the next visual obs as the current