Action Model (#4580)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>

Committed via GitHub 4 years ago. Current commit: 3c96a3a2

43 files changed, with 1,337 insertions and 1,004 deletions. Changed files (total lines changed in parentheses):
- .github/workflows/pytest.yml (4)
- ml-agents-envs/mlagents_envs/base_env.py (93)
- ml-agents-envs/mlagents_envs/rpc_utils.py (2)
- ml-agents/mlagents/trainers/agent_processor.py (26)
- ml-agents/mlagents/trainers/demo_loader.py (5)
- ml-agents/mlagents/trainers/env_manager.py (21)
- ml-agents/mlagents/trainers/policy/policy.py (17)
- ml-agents/mlagents/trainers/policy/tf_policy.py (24)
- ml-agents/mlagents/trainers/policy/torch_policy.py (54)
- ml-agents/mlagents/trainers/ppo/optimizer_tf.py (6)
- ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5)
- ml-agents/mlagents/trainers/ppo/trainer.py (2)
- ml-agents/mlagents/trainers/sac/optimizer_torch.py (319)
- ml-agents/mlagents/trainers/simple_env_manager.py (3)
- ml-agents/mlagents/trainers/subprocess_env_manager.py (7)
- ml-agents/mlagents/trainers/tests/mock_brain.py (20)
- ml-agents/mlagents/trainers/tests/simple_test_envs.py (49)
- ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66)
- ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (114)
- ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (2)
- ml-agents/mlagents/trainers/tests/test_agent_processor.py (27)
- ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
- ml-agents/mlagents/trainers/tests/test_trajectory.py (4)
- ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2)
- ml-agents/mlagents/trainers/tests/torch/test_networks.py (78)
- ml-agents/mlagents/trainers/tests/torch/test_policy.py (13)
- ml-agents/mlagents/trainers/tests/torch/test_ppo.py (28)
- ml-agents/mlagents/trainers/tests/torch/test_sac.py (3)
- ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (118)
- ml-agents/mlagents/trainers/tests/torch/test_utils.py (47)
- ml-agents/mlagents/trainers/torch/components/bc/module.py (35)
- ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (75)
- ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6)
- ml-agents/mlagents/trainers/torch/distributions.py (19)
- ml-agents/mlagents/trainers/torch/networks.py (183)
- ml-agents/mlagents/trainers/torch/utils.py (240)
- ml-agents/mlagents/trainers/trajectory.py (25)
- ml-agents/mlagents/trainers/tests/torch/test_action_model.py (81)
- ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (122)
- ml-agents/mlagents/trainers/torch/action_flattener.py (44)
- ml-agents/mlagents/trainers/torch/action_log_probs.py (108)
- ml-agents/mlagents/trainers/torch/action_model.py (184)
- ml-agents/mlagents/trainers/torch/agent_action.py (58)

# ml-agents/mlagents/trainers/tests/torch/test_action_model.py
import pytest

from mlagents.torch_utils import torch
from mlagents.trainers.torch.action_model import ActionModel, DistInstances
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.distributions import (
    GaussianDistInstance,
    CategoricalDistInstance,
)

from mlagents_envs.base_env import ActionSpec


def create_action_model(inp_size, act_size):
    mask = torch.ones([1, act_size * 2])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    action_model = ActionModel(inp_size, action_spec)
    return action_model, mask


def test_get_dists():
    inp_size = 4
    act_size = 2
    action_model, masks = create_action_model(inp_size, act_size)
    sample_inp = torch.ones((1, inp_size))
    dists = action_model._get_dists(sample_inp, masks=masks)
    assert isinstance(dists.continuous, GaussianDistInstance)
    assert len(dists.discrete) == 2
    for _dist in dists.discrete:
        assert isinstance(_dist, CategoricalDistInstance)


def test_sample_action():
    inp_size = 4
    act_size = 2
    action_model, masks = create_action_model(inp_size, act_size)
    sample_inp = torch.ones((1, inp_size))
    dists = action_model._get_dists(sample_inp, masks=masks)
    agent_action = action_model._sample_action(dists)
    assert agent_action.continuous_tensor.shape == (1, 2)
    assert len(agent_action.discrete_list) == 2
    for _disc in agent_action.discrete_list:
        assert _disc.shape == (1, 1)


def test_get_probs_and_entropy():
    inp_size = 4
    act_size = 2
    action_model, masks = create_action_model(inp_size, act_size)

    _continuous_dist = GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2)))
    act_size = 2
    test_prob = torch.tensor([[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)])
    _discrete_dist_list = [
        CategoricalDistInstance(test_prob),
        CategoricalDistInstance(test_prob),
    ]
    dist_tuple = DistInstances(_continuous_dist, _discrete_dist_list)

    agent_action = AgentAction(
        torch.zeros((1, 2)), [torch.tensor([0]), torch.tensor([1])]
    )

    log_probs, entropies = action_model._get_probs_and_entropy(agent_action, dist_tuple)

    assert log_probs.continuous_tensor.shape == (1, 2)
    assert len(log_probs.discrete_list) == 2
    for _disc in log_probs.discrete_list:
        assert _disc.shape == (1,)
    assert len(log_probs.all_discrete_list) == 2
    for _disc in log_probs.all_discrete_list:
        assert _disc.shape == (1, 2)

    for clp in log_probs.continuous_tensor[0]:
        # Log prob of standard normal at 0
        assert clp == pytest.approx(-0.919, abs=0.01)

    assert log_probs.discrete_list[0] > log_probs.discrete_list[1]

    for ent, val in zip(entropies[0], [1.4189, 1.4189, 0.6191, 0.6191]):
        assert ent == pytest.approx(val, abs=0.01)
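
The hard-coded constants in test_get_probs_and_entropy follow from standard results for a unit Gaussian and a two-way categorical. The snippet below is an illustrative sanity check, not part of the commit; note the 0.6191 value matches the entropy of softmax([0.9, 0.1]), which suggests CategoricalDistInstance treats its input as logits.

# Illustrative check of the expected values used above (not part of the commit).
import math

# Log density of a standard normal evaluated at its mean (the sampled action is 0):
log_prob_at_mean = -0.5 * math.log(2 * math.pi)           # ~ -0.9189
# Differential entropy of a unit Gaussian:
gaussian_entropy = 0.5 * math.log(2 * math.pi * math.e)   # ~ 1.4189
# Entropy of softmax([0.9, 0.1]), matching the expected categorical value:
z = math.exp(0.9) + math.exp(0.1)
p = [math.exp(0.9) / z, math.exp(0.1) / z]
categorical_entropy = -sum(x * math.log(x) for x in p)    # ~ 0.6191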

# ml-agents/mlagents/trainers/tests/torch/test_hybrid.py
import attr
import pytest

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)

from mlagents.trainers.settings import NetworkSettings, FrameworkType

from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("num_visual", [1, 2])
def test_hybrid_visual_ppo(num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_hybrid_recurrent_ppo():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_network_settings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=512,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=3000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_sac(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)

    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        buffer_size=50000,
        batch_size=256,
        buffer_init_steps=2000,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=5000
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("num_visual", [1, 2])
def test_hybrid_visual_sac(num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        buffer_size=50000,
        batch_size=128,
        learning_rate=3.0e-4,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=3000
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_hybrid_recurrent_sac():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=2000,
    )
    check_environment_trains(env, {BRAIN_NAME: config})
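
Every test above derives its trainer configuration by copying the shared dummy config with attr.evolve, which returns a new attrs instance with only the named fields replaced. A minimal, self-contained sketch of that pattern follows; the Hyperparameters class here is a hypothetical stand-in, not the real TrainerSettings.

import attr


@attr.s(auto_attribs=True)
class Hyperparameters:  # hypothetical stand-in for the real hyperparameter settings
    batch_size: int = 1024
    buffer_size: int = 10240
    learning_rate: float = 3.0e-4


base = Hyperparameters()
tuned = attr.evolve(base, batch_size=64, buffer_size=1024)
assert base.batch_size == 1024                     # the original config is untouched
assert tuned.batch_size == 64                      # only the overridden fields differ
assert tuned.learning_rate == base.learning_rate   # unspecified fields are copied over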

# ml-agents/mlagents/trainers/torch/action_flattener.py
from typing import List
from mlagents.torch_utils import torch

from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.utils import ModelUtils


class ActionFlattener:
    def __init__(self, action_spec: ActionSpec):
        """
        A torch module that creates the flattened form of an AgentAction object.
        The flattened form is the continuous action concatenated with the
        concatenated one-hot encodings of the discrete actions.
        :param action_spec: An ActionSpec that describes the action space dimensions
        """
        self._specs = action_spec

    @property
    def flattened_size(self) -> int:
        """
        The flattened size is the continuous size plus the sum of the branch sizes
        since discrete actions are encoded as one-hots.
        """
        return self._specs.continuous_size + sum(self._specs.discrete_branches)

    def forward(self, action: AgentAction) -> torch.Tensor:
        """
        Returns a tensor corresponding to the flattened action.
        :param action: An AgentAction object
        """
        action_list: List[torch.Tensor] = []
        if self._specs.continuous_size > 0:
            action_list.append(action.continuous_tensor)
        if self._specs.discrete_size > 0:
            flat_discrete = torch.cat(
                ModelUtils.actions_to_onehot(
                    torch.as_tensor(action.discrete_tensor, dtype=torch.long),
                    self._specs.discrete_branches,
                ),
                dim=1,
            )
            action_list.append(flat_discrete)
        return torch.cat(action_list, dim=1)
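
As a worked example of the flattening described in the docstrings: for a hybrid spec with 2 continuous dimensions and discrete branches of size 3 and 2, the flattened vector has 2 + 3 + 2 = 7 entries. The sketch below is illustrative only and assumes the modules added in this commit are importable.

from mlagents.torch_utils import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener

spec = ActionSpec(2, (3, 2))                  # 2 continuous dims, branches of size 3 and 2
flattener = ActionFlattener(spec)
assert flattener.flattened_size == 2 + 3 + 2  # continuous size + one-hot widths

action = AgentAction(
    torch.zeros((1, 2)),                      # batch of 1 agent
    [torch.tensor([1]), torch.tensor([0])],   # chosen index per discrete branch
)
flat = flattener.forward(action)
assert flat.shape == (1, 7)                   # [c0, c1, one-hot(3), one-hot(2)]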

# ml-agents/mlagents/trainers/torch/action_log_probs.py
from typing import List, Optional, NamedTuple, Dict
from mlagents.torch_utils import torch
import numpy as np

from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import _ActionTupleBase


class LogProbsTuple(_ActionTupleBase):
    """
    An object whose fields correspond to the log probs of actions of different types.
    Continuous and discrete are numpy arrays.
    Dimensions are (n_agents, continuous_size) and (n_agents, discrete_size),
    respectively. Note that this also holds when continuous or discrete size is
    zero.
    """

    @property
    def discrete_dtype(self) -> np.dtype:
        """
        The dtype of a discrete log probability.
        """
        return np.float32


class ActionLogProbs(NamedTuple):
    """
    A NamedTuple containing the tensor for continuous log probs and list of tensors for
    discrete log probs of individual actions as well as all the log probs for an entire branch.
    Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
    :param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
    :param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were
    sampled.
    :param all_discrete_list: List of Torch tensors, one per discrete branch, each containing the log probs of
    every action in that branch, including the discrete actions that were not sampled.
    """

    continuous_tensor: torch.Tensor
    discrete_list: Optional[List[torch.Tensor]]
    all_discrete_list: Optional[List[torch.Tensor]]

    @property
    def discrete_tensor(self):
        """
        Returns the discrete log probs list as a stacked tensor
        """
        return torch.stack(self.discrete_list, dim=-1)

    @property
    def all_discrete_tensor(self):
        """
        Returns the discrete log probs of each branch as a tensor
        """
        return torch.cat(self.all_discrete_list, dim=1)

    def to_log_probs_tuple(self) -> LogProbsTuple:
        """
        Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise,
        LogProbsTuple uses a default.
        """
        log_probs_tuple = LogProbsTuple()
        if self.continuous_tensor is not None:
            continuous = ModelUtils.to_numpy(self.continuous_tensor)
            log_probs_tuple.add_continuous(continuous)
        if self.discrete_list is not None:
            discrete = ModelUtils.to_numpy(self.discrete_tensor)
            log_probs_tuple.add_discrete(discrete)
        return log_probs_tuple

    def _to_tensor_list(self) -> List[torch.Tensor]:
        """
        Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This
        is private and serves as a utility for self.flatten()
        """
        tensor_list: List[torch.Tensor] = []
        if self.continuous_tensor is not None:
            tensor_list.append(self.continuous_tensor)
        if self.discrete_list is not None:
            tensor_list.append(self.discrete_tensor)
        return tensor_list

    def flatten(self) -> torch.Tensor:
        """
        A utility method that returns all log probs in ActionLogProbs as a flattened tensor.
        This is useful for algorithms like PPO which can treat all log probs in the same way.
        """
        return torch.cat(self._to_tensor_list(), dim=1)

    @staticmethod
    def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
        """
        A static method that accesses continuous and discrete log probs fields in an AgentBuffer
        and constructs the corresponding ActionLogProbs from the retrieved np arrays.
        """
        continuous: torch.Tensor = None
        discrete: List[torch.Tensor] = None  # type: ignore

        if "continuous_log_probs" in buff:
            continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
        if "discrete_log_probs" in buff:
            discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
            # This will keep discrete_list = None which enables flatten()
            if discrete_tensor.shape[1] > 0:
                discrete = [
                    discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
                ]
        return ActionLogProbs(continuous, discrete, None)
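
To make the buffer round-trip concrete: from_dict rebuilds an ActionLogProbs from the numpy arrays stored under the continuous_log_probs and discrete_log_probs keys, and flatten() then concatenates everything into a single (n_agents, continuous_size + num_branches) tensor for PPO-style losses. The sketch below is illustrative only and assumes this commit's modules are importable.

import numpy as np
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

buff = {
    "continuous_log_probs": np.zeros((5, 2), dtype=np.float32),  # 5 agents, 2 continuous dims
    "discrete_log_probs": np.zeros((5, 3), dtype=np.float32),    # 5 agents, 3 discrete branches
}
log_probs = ActionLogProbs.from_dict(buff)
assert len(log_probs.discrete_list) == 3        # one tensor per branch, each of shape (5,)
assert log_probs.flatten().shape == (5, 2 + 3)  # continuous dims + one log prob per branch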

# ml-agents/mlagents/trainers/torch/action_model.py
from typing import List, Tuple, NamedTuple, Optional
from mlagents.torch_utils import torch, nn
from mlagents.trainers.torch.distributions import (
    DistInstance,
    DiscreteDistInstance,
    GaussianDistribution,
    MultiCategoricalDistribution,
)
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents_envs.base_env import ActionSpec

EPSILON = 1e-7  # Small value to avoid divide by zero


class DistInstances(NamedTuple):
    """
    A NamedTuple with fields corresponding to the DistInstance objects
    output by continuous and discrete distributions, respectively. Discrete distributions
    output a list of DistInstance objects whereas continuous distributions output a single
    DistInstance object.
    """

    continuous: Optional[DistInstance]
    discrete: Optional[List[DiscreteDistInstance]]


class ActionModel(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        action_spec: ActionSpec,
        conditional_sigma: bool = False,
        tanh_squash: bool = False,
    ):
        """
        A torch module that represents the action space of a policy. The ActionModel may contain
        a continuous distribution, a discrete distribution or both where construction depends on
        the action_spec. The ActionModel uses the encoded input of the network body to parameterize
        these distributions. The forward method of this module outputs the action, log probs,
        and entropies given the encoding from the network body.
        :params hidden_size: Size of the input to the ActionModel.
        :params action_spec: The ActionSpec defining the action space dimensions and distributions.
        :params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state.
        :params tanh_squash: Whether to squash the output of a Gaussian with the tanh function.
        """
        super().__init__()
        self.encoding_size = hidden_size
        self.action_spec = action_spec
        self._continuous_distribution = None
        self._discrete_distribution = None

        if self.action_spec.continuous_size > 0:
            self._continuous_distribution = GaussianDistribution(
                self.encoding_size,
                self.action_spec.continuous_size,
                conditional_sigma=conditional_sigma,
                tanh_squash=tanh_squash,
            )

        if self.action_spec.discrete_size > 0:
            self._discrete_distribution = MultiCategoricalDistribution(
                self.encoding_size, self.action_spec.discrete_branches
            )

    def _sample_action(self, dists: DistInstances) -> AgentAction:
        """
        Samples actions from a DistInstances tuple
        :params dists: The DistInstances tuple
        :return: An AgentAction corresponding to the actions sampled from the DistInstances
        """
        continuous_action: Optional[torch.Tensor] = None
        discrete_action: Optional[List[torch.Tensor]] = None
        # This checks None because mypy complains otherwise
        if dists.continuous is not None:
            continuous_action = dists.continuous.sample()
        if dists.discrete is not None:
            discrete_action = []
            for discrete_dist in dists.discrete:
                discrete_action.append(discrete_dist.sample())
        return AgentAction(continuous_action, discrete_action)

    def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances:
        """
        Creates a DistInstances tuple using the continuous and discrete distributions
        :params inputs: The encoding from the network body
        :params masks: Action masks for discrete actions
        :return: A DistInstances tuple
        """
        continuous_dist: Optional[DistInstance] = None
        discrete_dist: Optional[List[DiscreteDistInstance]] = None
        # This checks None because mypy complains otherwise
        if self._continuous_distribution is not None:
            continuous_dist = self._continuous_distribution(inputs)
        if self._discrete_distribution is not None:
            discrete_dist = self._discrete_distribution(inputs, masks)
        return DistInstances(continuous_dist, discrete_dist)

    def _get_probs_and_entropy(
        self, actions: AgentAction, dists: DistInstances
    ) -> Tuple[ActionLogProbs, torch.Tensor]:
        """
        Computes the log probabilities of the actions given the distributions and the entropies of
        the given distributions.
        :params actions: The AgentAction
        :params dists: The DistInstances tuple
        :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
        """
        entropies_list: List[torch.Tensor] = []
        continuous_log_prob: Optional[torch.Tensor] = None
        discrete_log_probs: Optional[List[torch.Tensor]] = None
        all_discrete_log_probs: Optional[List[torch.Tensor]] = None
        # This checks None because mypy complains otherwise
        if dists.continuous is not None:
            continuous_log_prob = dists.continuous.log_prob(actions.continuous_tensor)
            entropies_list.append(dists.continuous.entropy())
        if dists.discrete is not None:
            discrete_log_probs = []
            all_discrete_log_probs = []
            for discrete_action, discrete_dist in zip(
                actions.discrete_list, dists.discrete  # type: ignore
            ):
                discrete_log_prob = discrete_dist.log_prob(discrete_action)
                entropies_list.append(discrete_dist.entropy())
                discrete_log_probs.append(discrete_log_prob)
                all_discrete_log_probs.append(discrete_dist.all_log_prob())
        action_log_probs = ActionLogProbs(
            continuous_log_prob, discrete_log_probs, all_discrete_log_probs
        )
        entropies = torch.cat(entropies_list, dim=1)
        return action_log_probs, entropies

    def evaluate(
        self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction
    ) -> Tuple[ActionLogProbs, torch.Tensor]:
        """
        Given actions and the encoding from the network body, gets the distributions and
        computes the log probabilities and entropies.
        :params inputs: The encoding from the network body
        :params masks: Action masks for discrete actions
        :params actions: The AgentAction
        :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies.
        """
        dists = self._get_dists(inputs, masks)
        log_probs, entropies = self._get_probs_and_entropy(actions, dists)
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)
        return log_probs, entropy_sum

    def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
        """
        Gets the tensors corresponding to the output of the policy network to be used for
        inference. Called by the Actor's forward call.
        :params inputs: The encoding from the network body
        :params masks: Action masks for discrete actions
        :return: A torch tensor of the concatenated inference outputs
        """
        dists = self._get_dists(inputs, masks)
        out_list: List[torch.Tensor] = []
        # This checks None because mypy complains otherwise
        if dists.continuous is not None:
            out_list.append(dists.continuous.exported_model_output())
        if dists.discrete is not None:
            for discrete_dist in dists.discrete:
                out_list.append(discrete_dist.exported_model_output())
        return torch.cat(out_list, dim=1)

    def forward(
        self, inputs: torch.Tensor, masks: torch.Tensor
    ) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]:
        """
        The forward method of this module. Outputs the action, log probs,
        and entropies given the encoding from the network body.
        :params inputs: The encoding from the network body
        :params masks: Action masks for discrete actions
        :return: Given the input, an AgentAction of the actions generated by the policy and the corresponding
        ActionLogProbs and entropies.
        """
        dists = self._get_dists(inputs, masks)
        actions = self._sample_action(dists)
        log_probs, entropies = self._get_probs_and_entropy(actions, dists)
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)
        return (actions, log_probs, entropy_sum)
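
A minimal end-to-end sketch of the forward pass for a hybrid action space, tying the pieces above together. It is illustrative only; the expected shapes follow the tests earlier in this commit.

from mlagents.torch_utils import torch
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.action_model import ActionModel

spec = ActionSpec(2, (3, 2))            # 2 continuous dims, discrete branches of size 3 and 2
model = ActionModel(hidden_size=16, action_spec=spec)

encoding = torch.ones((1, 16))          # stand-in for the network body's output
masks = torch.ones((1, 3 + 2))          # one flag per discrete logit; all actions allowed
actions, log_probs, entropy_sum = model.forward(encoding, masks)

assert actions.continuous_tensor.shape == (1, 2)
assert len(actions.discrete_list) == 2  # one sampled index tensor per branch
assert entropy_sum.shape == (1,)        # entropies are summed across action dimensions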

# ml-agents/mlagents/trainers/torch/agent_action.py
from typing import List, Optional, NamedTuple, Dict
from mlagents.torch_utils import torch
import numpy as np

from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple


class AgentAction(NamedTuple):
    """
    A NamedTuple containing the tensor for continuous actions and list of tensors for
    discrete actions. Utility functions provide numpy <=> tensor conversions to be
    sent as actions to the environment manager as well as used by the optimizers.
    :param continuous_tensor: Torch tensor corresponding to continuous actions
    :param discrete_list: List of Torch tensors each corresponding to discrete actions
    """

    continuous_tensor: torch.Tensor
    discrete_list: Optional[List[torch.Tensor]]

    @property
    def discrete_tensor(self):
        """
        Returns the discrete action list as a stacked tensor
        """
        return torch.stack(self.discrete_list, dim=-1)

    def to_action_tuple(self) -> ActionTuple:
        """
        Returns an ActionTuple
        """
        action_tuple = ActionTuple()
        if self.continuous_tensor is not None:
            continuous = ModelUtils.to_numpy(self.continuous_tensor)
            action_tuple.add_continuous(continuous)
        if self.discrete_list is not None:
            discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
            action_tuple.add_discrete(discrete)
        return action_tuple

    @staticmethod
    def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
        """
        A static method that accesses continuous and discrete action fields in an AgentBuffer
        and constructs the corresponding AgentAction from the retrieved np arrays.
        """
        continuous: torch.Tensor = None
        discrete: List[torch.Tensor] = None  # type: ignore
        if "continuous_action" in buff:
            continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
        if "discrete_action" in buff:
            discrete_tensor = ModelUtils.list_to_tensor(
                buff["discrete_action"], dtype=torch.long
            )
            discrete = [
                discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
            ]
        return AgentAction(continuous, discrete)
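
Finally, to_action_tuple converts the sampled torch tensors into the numpy ActionTuple handed to the environment. Each entry of discrete_list is expected to have shape (n_agents, 1), as produced by the sampled distributions, which is why discrete_tensor is indexed with [:, 0, :]. The sketch below is illustrative only and assumes the ActionTuple continuous/discrete properties from mlagents_envs.base_env.

from mlagents.torch_utils import torch
from mlagents.trainers.torch.agent_action import AgentAction

action = AgentAction(
    torch.zeros((3, 2)),                        # 3 agents, 2 continuous dims
    [
        torch.zeros((3, 1), dtype=torch.long),  # branch 0: sampled index per agent
        torch.ones((3, 1), dtype=torch.long),   # branch 1: sampled index per agent
    ],
)
action_tuple = action.to_action_tuple()
assert action_tuple.continuous.shape == (3, 2)
assert action_tuple.discrete.shape == (3, 2)    # one chosen index per agent per branch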