Merge commit '9450d3fc0dda4547a14c5ed1b7e13fc6e3a15413' into develop-nopreviousactions

--- a/ml-agents/mlagents/trainers/ppo/optimizer.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer.py
+import logging
+from typing import Optional, Any, Dict
+
+import numpy as np
+from mlagents.tf_utils import tf
+from mlagents_envs.timers import timed
+from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.common.tf_optimizer import TFOptimizer
+from mlagents.trainers.buffer import AgentBuffer
+
+
+# Module-level logger, obtained under the "mlagents.trainers" logger name.
+logger = logging.getLogger("mlagents.trainers")
+
+
+class PPOOptimizer(TFOptimizer):
+    """
+    TFOptimizer that adds a value estimator (critic), the clipped PPO losses and the
+    corresponding update op to the graph of an existing TFPolicy.
+    """
+
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
+        The PPO optimizer has a value estimator and a loss function.
+        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
+        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
+            The keys "learning_rate", "hidden_units", "epsilon", "beta", "max_steps" and "num_layers"
+            are required; "learning_rate_schedule", "vis_encode_type" and "burn_in_ratio" are optional.
+        """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
+
+        with policy.graph.as_default():
+            with tf.variable_scope("optimizer/"):
+                super().__init__(policy, trainer_params)
+
+                lr = float(trainer_params["learning_rate"])
+                lr_schedule = LearningRateSchedule(
+                    trainer_params.get("learning_rate_schedule", "linear")
+                )
+                h_size = int(trainer_params["hidden_units"])
+                epsilon = float(trainer_params["epsilon"])
+                beta = float(trainer_params["beta"])
+                max_step = float(trainer_params["max_steps"])
+                num_layers = int(trainer_params["num_layers"])
+                vis_encode_type = EncoderType(
+                    trainer_params.get("vis_encode_type", "simple")
+                )
+                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
+
+                # One value head is created per reward signal.
+                self.stream_names = list(self.reward_signals.keys())
+
+                # Populated by _create_ppo_optimizer_ops() below.
+                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
+                self.grads = None
+                self.update_batch: Optional[tf.Operation] = None
+
+                # Maps reported stat names to keys of self.update_dict.
+                self.stats_name_to_update_name = {
+                    "Losses/Value Loss": "value_loss",
+                    "Losses/Policy Loss": "policy_loss",
+                    "Policy/Learning Rate": "learning_rate",
+                }
+                if self.policy.use_recurrent:
+                    # The critic gets its own recurrent memory input, separate from the policy's.
+                    self.m_size = self.policy.m_size
+                    self.memory_in = tf.placeholder(
+                        shape=[None, self.m_size],
+                        dtype=tf.float32,
+                        name="recurrent_value_in",
+                    )
+
+                # Always build at least one hidden layer.
+                if num_layers < 1:
+                    num_layers = 1
+                if policy.use_continuous_act:
+                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
+                else:
+                    self._create_dc_critic(h_size, num_layers, vis_encode_type)
+
+                self.learning_rate = ModelUtils.create_learning_rate(
+                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                )
+                self._create_losses(
+                    self.policy.log_probs,
+                    self.old_log_probs,
+                    self.value_heads,
+                    self.policy.entropy,
+                    beta,
+                    epsilon,
+                    lr,
+                    max_step,
+                )
+                self._create_ppo_optimizer_ops()
+
+            # Tensors/ops fetched on every call to update().
+            self.update_dict.update(
+                {
+                    "value_loss": self.value_loss,
+                    "policy_loss": self.abs_policy_loss,
+                    "update_batch": self.update_batch,
+                    "learning_rate": self.learning_rate,
+                }
+            )
+
+            self.policy.initialize_or_load()
+
+    def _create_critic_value_heads(
+        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
+    ) -> None:
+        """
+        Builds the value network shared by the continuous and discrete critics: an observation
+        encoder, an optional recurrent encoder, and one value head per reward stream.
+        Sets self.value_heads and self.value, plus self.memory_out when the policy is recurrent.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: The type of visual encoder to use.
+        """
+        # A single observation stream is requested; [0] selects it.
+        hidden_stream = ModelUtils.create_observation_streams(
+            self.policy.visual_in,
+            self.policy.processed_vector_in,
+            1,
+            h_size,
+            num_layers,
+            vis_encode_type,
+        )[0]
+
+        if self.policy.use_recurrent:
+            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
+                hidden_stream,
+                self.memory_in,
+                self.policy.sequence_length_ph,
+                name="lstm_value",
+            )
+            self.memory_out = memory_value_out
+        else:
+            hidden_value = hidden_stream
+
+        self.value_heads, self.value = ModelUtils.create_value_heads(
+            self.stream_names, hidden_value
+        )
+
+    def _create_cc_critic(
+        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
+    ) -> None:
+        """
+        Creates Continuous control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: The type of visual encoder to use.
+        """
+        self._create_critic_value_heads(h_size, num_layers, vis_encode_type)
+
+        self.all_old_log_probs = tf.placeholder(
+            shape=[None, 1], dtype=tf.float32, name="old_probabilities"
+        )
+
+        self.old_log_probs = tf.reduce_sum(
+            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
+        )
+
+    def _create_dc_critic(
+        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
+    ) -> None:
+        """
+        Creates Discrete control actor-critic model.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: The type of visual encoder to use.
+        """
+        self._create_critic_value_heads(h_size, num_layers, vis_encode_type)
+
+        # One column per discrete action, with all branches concatenated.
+        self.all_old_log_probs = tf.placeholder(
+            shape=[None, sum(self.policy.act_size)],
+            dtype=tf.float32,
+            name="old_probabilities",
+        )
+        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
+            self.all_old_log_probs, self.policy.action_masks, self.policy.act_size
+        )
+
+        # Column offsets delimiting each action branch in the concatenated tensors.
+        action_idx = [0] + list(np.cumsum(self.policy.act_size))
+
+        # Log-probability of the taken action per branch (negated cross-entropy against the
+        # one-hot action), summed over branches.
+        self.old_log_probs = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        -tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=self.policy.action_oh[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                            logits=old_normalized_logits[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.policy.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+            keepdims=True,
+        )
+
+    def _create_losses(
+        self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
+    ):
+        """
+        Creates training-specific Tensorflow ops for PPO models.
+        Sets self.returns_holders, self.old_values, self.advantage, self.value_loss,
+        self.policy_loss, self.abs_policy_loss and self.loss.
+        :param probs: Current policy log-probabilities (the ratio is exp(probs - old_probs))
+        :param old_probs: Past policy log-probabilities
+        :param value_heads: Value estimate tensors from each value stream
+        :param beta: Entropy regularization strength
+        :param entropy: Current policy entropy
+        :param epsilon: Value for policy-divergence threshold
+        :param lr: Learning rate (not used by this method)
+        :param max_step: Total number of training steps.
+        """
+        self.returns_holders = {}
+        self.old_values = {}
+        for name in value_heads.keys():
+            returns_holder = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_returns".format(name)
+            )
+            old_value = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
+            )
+            self.returns_holders[name] = returns_holder
+            self.old_values[name] = old_value
+        self.advantage = tf.placeholder(
+            shape=[None], dtype=tf.float32, name="advantages"
+        )
+        advantage = tf.expand_dims(self.advantage, -1)
+
+        # Clip range and entropy weight decay linearly over training, down to 0.1 and 1e-5.
+        decay_epsilon = tf.train.polynomial_decay(
+            epsilon, self.policy.global_step, max_step, 0.1, power=1.0
+        )
+        decay_beta = tf.train.polynomial_decay(
+            beta, self.policy.global_step, max_step, 1e-5, power=1.0
+        )
+
+        value_losses = []
+        for name, head in value_heads.items():
+            # Clipped value loss: keep the new estimate within decay_epsilon of the old one
+            # and take the larger of the clipped and unclipped squared errors.
+            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
+                tf.reduce_sum(head, axis=1) - self.old_values[name],
+                -decay_epsilon,
+                decay_epsilon,
+            )
+            v_opt_a = tf.squared_difference(
+                self.returns_holders[name], tf.reduce_sum(head, axis=1)
+            )
+            v_opt_b = tf.squared_difference(
+                self.returns_holders[name], clipped_value_estimate
+            )
+            # dynamic_partition(...)[1] keeps only the entries where policy.mask == 1.
+            value_loss = tf.reduce_mean(
+                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
+                    1
+                ]
+            )
+            value_losses.append(value_loss)
+        self.value_loss = tf.reduce_mean(value_losses)
+
+        # Clipped surrogate objective on the probability ratio r_theta.
+        r_theta = tf.exp(probs - old_probs)
+        p_opt_a = r_theta * advantage
+        p_opt_b = (
+            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
+            * advantage
+        )
+        self.policy_loss = -tf.reduce_mean(
+            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
+        )
+        # For cleaner stats reporting
+        self.abs_policy_loss = tf.abs(self.policy_loss)
+
+        # Total loss: policy loss + half the value loss - entropy bonus.
+        self.loss = (
+            self.policy_loss
+            + 0.5 * self.value_loss
+            - decay_beta
+            * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
+        )
+
+    def _create_ppo_optimizer_ops(self):
+        """
+        Creates the TF optimizer for self.learning_rate, the gradients of self.loss
+        (self.grads) and the op that minimizes self.loss (self.update_batch).
+        """
+        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
+        self.grads = self.tf_optimizer.compute_gradients(self.loss)
+        self.update_batch = self.tf_optimizer.minimize(self.loss)
+
+    @timed
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Performs update on model.
+        :param batch: Batch of experiences.
+        :param num_sequences: Number of sequences to process.
+        :return: Results of update.
+        """
+        feed_dict = self._construct_feed_dict(batch, num_sequences)
+        # Work on a copy so the reward-signal stats added below do not accumulate in
+        # self.stats_name_to_update_name across calls.
+        stats_needed = dict(self.stats_name_to_update_name)
+        # Collect feed dicts for all reward signals.
+        for reward_signal in self.reward_signals.values():
+            feed_dict.update(
+                reward_signal.prepare_update(self.policy, batch, num_sequences)
+            )
+            stats_needed.update(reward_signal.stats_name_to_update_name)
+
+        update_vals = self._execute_model(feed_dict, self.update_dict)
+        return {
+            stat_name: update_vals[update_name]
+            for stat_name, update_name in stats_needed.items()
+        }
+
+    def _construct_feed_dict(
+        self, mini_batch: AgentBuffer, num_sequences: int
+    ) -> Dict[tf.Tensor, Any]:
+        """
+        Builds the feed dict for one PPO update from a mini-batch.
+        :param mini_batch: Batch of experiences to read the fed values from.
+        :param num_sequences: Number of sequences contained in the mini-batch.
+        :return: Mapping from graph placeholders to the values to feed.
+        """
+        # Do an optional burn-in for memories
+        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
+        # Zero the first num_burn_in steps of each sequence so they are masked out of the loss.
+        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask = np.tile(burn_in_mask, num_sequences)
+        feed_dict = {
+            self.policy.batch_size_ph: num_sequences,
+            self.policy.sequence_length_ph: self.policy.sequence_length,
+            self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
+            self.advantage: mini_batch["advantages"],
+            self.all_old_log_probs: mini_batch["action_probs"],
+        }
+        for name in self.reward_signals:
+            feed_dict[self.returns_holders[name]] = mini_batch[
+                "{}_returns".format(name)
+            ]
+            feed_dict[self.old_values[name]] = mini_batch[
+                "{}_value_estimates".format(name)
+            ]
+
+        # Feed "actions_pre" when the policy has an output_pre tensor and the batch carries it;
+        # otherwise feed the taken actions and the action masks.
+        if self.policy.output_pre is not None and "actions_pre" in mini_batch:
+            feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
+        else:
+            feed_dict[self.policy.action_holder] = mini_batch["actions"]
+            if self.policy.use_recurrent:
+                feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
+            feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
+        if "vector_obs" in mini_batch:
+            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
+        if self.policy.vis_obs_size > 0:
+            for i, _ in enumerate(self.policy.visual_in):
+                feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
+        if self.policy.use_recurrent:
+            # The policy is fed only the memory stored at the first step of each sequence.
+            feed_dict[self.policy.memory_in] = [
+                mini_batch["memory"][i]
+                for i in range(
+                    0, len(mini_batch["memory"]), self.policy.sequence_length
+                )
+            ]
+            # The critic's own memory input is filled via _make_zero_mem.
+            feed_dict[self.memory_in] = self._make_zero_mem(
+                self.m_size, mini_batch.num_experiences
+            )
+        return feed_dict
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Any, Mapping
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
+from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
+from mlagents.trainers.common.tf_optimizer import TFOptimizer
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.buffer import AgentBuffer
+from mlagents_envs.timers import timed
+
+EPSILON = 1e-6  # Small value to avoid divide by zero
+
+# Module-level logger, obtained under the "mlagents.trainers" logger name.
+LOGGER = logging.getLogger("mlagents.trainers")
+
+# Presumably TF scope names for the policy and target networks.
+# NOTE(review): neither constant is referenced in the visible part of this file — confirm usage.
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACOptimizer(TFOptimizer):
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Takes a Policy and a Dict of trainer parameters and creates a SAC Optimizer around the policy.
+        Builds the SAC policy network, the target network, the losses and the optimizer ops
+        inside the policy's graph, then initializes or loads the policy.
+        :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
+        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
+            The keys "learning_rate", "hidden_units", "max_steps", "num_layers" and "reward_signals"
+            (each entry providing a "gamma") are required; "learning_rate_schedule",
+            "vis_encode_type", "tau", "burn_in_ratio" and "init_entcoef" are optional.
+        """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
+
+        with policy.graph.as_default():
+            with tf.variable_scope(""):
+                super().__init__(policy, trainer_params)
+                lr = float(trainer_params["learning_rate"])
+                lr_schedule = LearningRateSchedule(
+                    trainer_params.get("learning_rate_schedule", "constant")
+                )
+                self.policy = policy
+                self.act_size = self.policy.act_size
+                h_size = int(trainer_params["hidden_units"])
+                max_step = float(trainer_params["max_steps"])
+                num_layers = int(trainer_params["num_layers"])
+                vis_encode_type = EncoderType(
+                    trainer_params.get("vis_encode_type", "simple")
+                )
+                self.tau = trainer_params.get("tau", 0.005)
+                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
+
+                # Non-exposed SAC parameters
+                self.discrete_target_entropy_scale = (
+                    0.2
+                )  # Roughly equal to e-greedy 0.05
+                self.continuous_target_entropy_scale = 1.0
+
+                self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
+                stream_names = list(self.reward_signals.keys())
+                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
+                # NOTE(review): self.gammas is indexed by position in _create_losses, so the order
+                # of trainer_params["reward_signals"] must match self.reward_signals — confirm.
+                self.gammas = [
+                    _val["gamma"] for _val in trainer_params["reward_signals"].values()
+                ]
+                # Per-stream switch (1.0 = on) multiplied with the done flag in the Q backup.
+                self.use_dones_in_backup = {
+                    name: tf.Variable(1.0) for name in stream_names
+                }
+                # Ops that set the switch above to 0.0 for a given stream.
+                self.disable_use_dones = {
+                    name: self.use_dones_in_backup[name].assign(0.0)
+                    for name in stream_names
+                }
+
+                # Always build at least one hidden layer.
+                if num_layers < 1:
+                    num_layers = 1
+
+                # NOTE(review): these are not assigned anywhere else in the visible code;
+                # presumably filled in by _create_sac_optimizer_ops() — confirm.
+                self.target_init_op: List[tf.Tensor] = []
+                self.target_update_op: List[tf.Tensor] = []
+                self.update_batch_policy: Optional[tf.Operation] = None
+                self.update_batch_value: Optional[tf.Operation] = None
+                self.update_batch_entropy: Optional[tf.Operation] = None
+
+                self.policy_network = SACPolicyNetwork(
+                    policy=self.policy,
+                    m_size=self.policy.m_size,  # 3x policy.m_size
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                self.target_network = SACTargetNetwork(
+                    policy=self.policy,
+                    m_size=self.policy.m_size,  # 1x policy.m_size
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
+                self.m_size = 3 * self.policy.m_size
+                self._create_inputs_and_outputs()
+                self.learning_rate = ModelUtils.create_learning_rate(
+                    lr_schedule, lr, self.policy.global_step, int(max_step)
+                )
+                self._create_losses(
+                    self.policy_network.q1_heads,
+                    self.policy_network.q2_heads,
+                    lr,
+                    int(max_step),
+                    stream_names,
+                    discrete=not self.policy.use_continuous_act,
+                )
+                self._create_sac_optimizer_ops()
+
+                self.selected_actions = (
+                    self.policy.selected_actions
+                )  # For GAIL and other reward signals
+                if self.policy.normalize:
+                    target_update_norm = self.target_network.copy_normalization(
+                        self.policy.running_mean,
+                        self.policy.running_variance,
+                        self.policy.normalization_steps,
+                    )
+                    # Update the normalization of the optimizer when the policy does.
+                    self.policy.update_normalization_op = tf.group(
+                        [self.policy.update_normalization_op, target_update_norm]
+                    )
+
+                self.policy.initialize_or_load()
+
+        # Maps reported stat names to keys of self.update_dict.
+        self.stats_name_to_update_name = {
+            "Losses/Value Loss": "value_loss",
+            "Losses/Policy Loss": "policy_loss",
+            "Losses/Q1 Loss": "q1_loss",
+            "Losses/Q2 Loss": "q2_loss",
+            "Policy/Entropy Coeff": "entropy_coef",
+            "Policy/Learning Rate": "learning_rate",
+        }
+
+        # Tensors and update ops keyed by short name.
+        # NOTE: this assignment replaces (rather than extends) any update_dict set up earlier.
+        self.update_dict = {
+            "value_loss": self.total_value_loss,
+            "policy_loss": self.policy_loss,
+            "q1_loss": self.q1_loss,
+            "q2_loss": self.q2_loss,
+            "entropy_coef": self.ent_coef,
+            "entropy": self.policy.entropy,
+            "update_batch": self.update_batch_policy,
+            "update_value": self.update_batch_value,
+            "update_entropy": self.update_batch_entropy,
+            "learning_rate": self.learning_rate,
+        }
+
+    def _create_inputs_and_outputs(self) -> None:
+        """
+        Expose the input placeholders and output tensors of the policy, the policy network
+        and the target network as attributes of this optimizer.
+        """
+        policy = self.policy
+        policy_net = self.policy_network
+        target_net = self.target_network
+
+        # Current-step inputs are taken from the policy, next-step inputs from the target network.
+        self.vector_in = policy.vector_in
+        self.visual_in = policy.visual_in
+        self.next_vector_in = target_net.vector_in
+        self.next_visual_in = target_net.visual_in
+        self.action_holder = policy.action_holder
+        self.sequence_length_ph = policy.sequence_length_ph
+        self.next_sequence_length_ph = target_net.sequence_length_ph
+        if policy.use_continuous_act:
+            self.output_pre = policy_net.output_pre
+        else:
+            self.action_masks = policy_net.action_masks
+
+        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
+        self.value = tf.identity(policy_net.value, name="value_estimate_unused")
+        self.value_heads = policy_net.value_heads
+        self.dones_holder = tf.placeholder(
+            shape=[None], dtype=tf.float32, name="dones_holder"
+        )
+
+        if not policy.use_recurrent:
+            return
+
+        # Recurrent-only memory tensors (and previous action for discrete control).
+        self.memory_in = policy_net.memory_in
+        self.memory_out = policy_net.memory_out
+        if not policy.use_continuous_act:
+            self.prev_action = policy_net.prev_action
+        self.next_memory_in = target_net.memory_in
+
+    def _create_losses(
+        self,
+        q1_streams: Dict[str, tf.Tensor],
+        q2_streams: Dict[str, tf.Tensor],
+        lr: float,
+        max_step: int,
+        stream_names: List[str],
+        discrete: bool = False,
+    ) -> None:
+        """
+        Creates training-specific Tensorflow ops for SAC models.
+        Sets self.target_entropy, self.rewards_holders, self.min_policy_qs, self.q1_loss,
+        self.q2_loss, self.log_ent_coef, self.ent_coef, self.entropy_loss, self.policy_loss,
+        self.value_loss, self.total_value_loss and self.entropy.
+        :param q1_streams: Q1 streams from policy network
+        :param q2_streams: Q2 streams from policy network
+        :param lr: Learning rate (not used by this method)
+        :param max_step: Total number of training steps (not used by this method).
+        :param stream_names: List of reward stream names.
+        :param discrete: Whether or not to use discrete action losses.
+        """
+
+        if discrete:
+            # One target entropy per action branch, scaled from log(branch size).
+            self.target_entropy = [
+                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
+                for i in self.act_size
+            ]
+            discrete_action_probs = tf.exp(self.policy.all_log_probs)
+            # p * log(p) per action, i.e. the negative of each action's entropy contribution.
+            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
+        else:
+            self.target_entropy = (
+                -1
+                * self.continuous_target_entropy_scale
+                * np.prod(self.act_size[0]).astype(np.float32)
+            )
+
+        self.rewards_holders = {}
+        self.min_policy_qs = {}
+
+        for name in stream_names:
+            if discrete:
+                # Probability-weighted Q per branch, averaged over branches, for each Q network.
+                _branched_mpq1 = self._apply_as_branches(
+                    self.policy_network.q1_pheads[name] * discrete_action_probs
+                )
+                branched_mpq1 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keepdims=True)
+                        for _br in _branched_mpq1
+                    ]
+                )
+                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
+
+                _branched_mpq2 = self._apply_as_branches(
+                    self.policy_network.q2_pheads[name] * discrete_action_probs
+                )
+                branched_mpq2 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keepdims=True)
+                        for _br in _branched_mpq2
+                    ]
+                )
+                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
+
+                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
+            else:
+                self.min_policy_qs[name] = tf.minimum(
+                    self.policy_network.q1_pheads[name],
+                    self.policy_network.q2_pheads[name],
+                )
+
+            rewards_holder = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
+            )
+            self.rewards_holders[name] = rewards_holder
+
+        q1_losses = []
+        q2_losses = []
+        # Multiple q losses per stream
+        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+        for i, name in enumerate(stream_names):
+            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+
+            # Q target: r + (1 - use_dones * done) * gamma * V_target, with gradients blocked.
+            q_backup = tf.stop_gradient(
+                _expanded_rewards
+                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+                * self.gammas[i]
+                * self.target_network.value_heads[name]
+            )
+
+            if discrete:
+                # We need to break up the Q functions by branch, and update them individually.
+                branched_q1_stream = self._apply_as_branches(
+                    self.policy.action_oh * q1_streams[name]
+                )
+                branched_q2_stream = self._apply_as_branches(
+                    self.policy.action_oh * q2_streams[name]
+                )
+
+                # Reduce each branch into scalar
+                branched_q1_stream = [
+                    tf.reduce_sum(_branch, axis=1, keepdims=True)
+                    for _branch in branched_q1_stream
+                ]
+                branched_q2_stream = [
+                    tf.reduce_sum(_branch, axis=1, keepdims=True)
+                    for _branch in branched_q2_stream
+                ]
+
+                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
+                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
+
+            else:
+                q1_stream = q1_streams[name]
+                q2_stream = q2_streams[name]
+
+            # NOTE(review): q_backup has a trailing axis of size 1 (from expand_dims above); if
+            # policy.mask is rank-1 this product broadcasts to [batch, batch] — confirm shapes.
+            _q1_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q1_stream)
+            )
+
+            _q2_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q2_stream)
+            )
+
+            q1_losses.append(_q1_loss)
+            q2_losses.append(_q2_loss)
+
+        self.q1_loss = tf.reduce_mean(q1_losses)
+        self.q2_loss = tf.reduce_mean(q2_losses)
+
+        # Learn entropy coefficient
+        if discrete:
+            # Create a log_ent_coef for each branch
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
+                    np.float32
+                ),
+                trainable=True,
+            )
+        else:
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log(self.init_entcoef).astype(np.float32),
+                trainable=True,
+            )
+
+        # The coefficient is stored in log space; exponentiate to get the value used in the losses.
+        self.ent_coef = tf.exp(self.log_ent_coef)
+        if discrete:
+            # We also have to do a different entropy and target_entropy per branch.
+            branched_per_action_ent = self._apply_as_branches(per_action_entropy)
+            branched_ent_sums = tf.stack(
+                [
+                    tf.reduce_sum(_lp, axis=1, keepdims=True) + _te
+                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
+                ],
+                axis=1,
+            )
+            self.entropy_loss = -tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.reduce_mean(
+                    self.log_ent_coef
+                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+                    axis=1,
+                )
+            )
+
+            # Same with policy loss, we have to do the loss per branch and average them,
+            # so that larger branches don't get more weight.
+            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
+            branched_q_term = self._apply_as_branches(
+                discrete_action_probs * self.policy_network.q1_p
+            )
+
+            branched_policy_loss = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keepdims=True)
+                    for i, (_lp, _qt) in enumerate(
+                        zip(branched_per_action_ent, branched_q_term)
+                    )
+                ]
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
+            )
+
+            # Do vbackup entropy bonus per branch as well.
+            branched_ent_bonus = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keepdims=True)
+                    for i, _lp in enumerate(branched_per_action_ent)
+                ]
+            )
+            value_losses = []
+            for name in stream_names:
+                # Value target: min policy Q minus the branch-averaged entropy term.
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_mean(branched_ent_bonus, axis=0)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+
+        else:
+            self.entropy_loss = -tf.reduce_mean(
+                self.log_ent_coef
+                * tf.to_float(self.policy.mask)
+                * tf.stop_gradient(
+                    tf.reduce_sum(
+                        self.policy.all_log_probs + self.target_entropy,
+                        axis=1,
+                        keepdims=True,
+                    )
+                )
+            )
+            batch_policy_loss = tf.reduce_mean(
+                self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
+                axis=1,
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * batch_policy_loss
+            )
+
+            value_losses = []
+            for name in stream_names:
+                # Value target: min policy Q minus the entropy-weighted log-probabilities.
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+        self.value_loss = tf.reduce_mean(value_losses)
+
+        # Reported "value loss" bundles both Q losses with the value-function loss.
+        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
+
+        self.entropy = self.policy_network.entropy
+
+    def _apply_as_branches(self, concat_logits: tf.Tensor) -> List[tf.Tensor]:
+        """
+        Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per
+        action branch
+        """
+        action_idx = [0] + list(np.cumsum(self.act_size))
+        branches_logits = [
+            concat_logits[:, action_idx[i] : action_idx[i + 1]]
+            for i in range(len(self.act_size))
+        ]
+        return branches_logits
+
+    def _create_sac_optimizer_ops(self) -> None:
+        """
+        Creates the optimizers and update ops for SAC, including
+        the policy, value, and entropy updates, as well as the target network update.
+
+        Sets the following attributes:
+        - target_update_op: per-variable assigns that blend the policy network's value
+          variables into the target network's value variables with weight self.tau.
+        - target_init_op: per-variable assigns that copy the policy network's value
+          variables into the target network's value variables.
+        - update_batch_policy, update_batch_value, update_batch_entropy: minimize ops
+          for the policy loss, the total value loss and the entropy loss.
+        """
+        # One optimizer per loss; all three use the same learning rate.
+        policy_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_policy_opt"
+        )
+        entropy_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_entropy_opt"
+        )
+        value_optimizer = self.create_optimizer_op(
+            learning_rate=self.learning_rate, name="sac_value_opt"
+        )
+
+        # Soft update: target <- (1 - tau) * target + tau * source, variable by variable.
+        # The two variable lists are paired positionally by zip.
+        self.target_update_op = [
+            tf.assign(target, (1 - self.tau) * target + self.tau * source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+        # Dump every variable group at DEBUG level to help inspect the graph.
+        LOGGER.debug("value_vars")
+        self.print_all_vars(self.policy_network.value_vars)
+        LOGGER.debug("targvalue_vars")
+        self.print_all_vars(self.target_network.value_vars)
+        LOGGER.debug("critic_vars")
+        self.print_all_vars(self.policy_network.critic_vars)
+        LOGGER.debug("q_vars")
+        self.print_all_vars(self.policy_network.q_vars)
+        LOGGER.debug("policy_vars")
+        policy_vars = self.policy.get_trainable_variables()
+        self.print_all_vars(policy_vars)
+
+        # Hard copy of the value variables into the target network.
+        self.target_init_op = [
+            tf.assign(target, source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+
+        self.update_batch_policy = policy_optimizer.minimize(
+            self.policy_loss, var_list=policy_vars
+        )
+
+        # Make sure policy is updated first, then value, then entropy.
+        with tf.control_dependencies([self.update_batch_policy]):
+            self.update_batch_value = value_optimizer.minimize(
+                self.total_value_loss, var_list=self.policy_network.critic_vars
+            )
+            # Add entropy coefficient optimization operation
+            # NOTE(review): var_list receives self.log_ent_coef directly rather than a
+            # list - confirm that it is a single variable (or already a list).
+            with tf.control_dependencies([self.update_batch_value]):
+                self.update_batch_entropy = entropy_optimizer.minimize(
+                    self.entropy_loss, var_list=self.log_ent_coef
+                )
+
+    def print_all_vars(self, variables):
+        for _var in variables:
+            LOGGER.debug(_var)
+
+    @timed
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Updates model using buffer.
+        :param num_sequences: Number of trajectories in batch.
+        :param batch: Experience mini-batch.
+        :param update_target: Whether or not to update target value network
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        :return: Output from update process.
+        """
+        feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
+        stats_needed = self.stats_name_to_update_name
+        update_stats: Dict[str, float] = {}
+        update_vals = self._execute_model(feed_dict, self.update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        # Update target network. By default, target update happens at every policy update.
+        self.sess.run(self.target_update_op)
+        return update_stats
+
+    def update_reward_signals(
+        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
+    ) -> Dict[str, float]:
+        """
+        Only update the reward signals.
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        """
+        # Collect feed dicts for all reward signals.
+        feed_dict: Dict[tf.Tensor, Any] = {}
+        update_dict: Dict[str, tf.Tensor] = {}
+        update_stats: Dict[str, float] = {}
+        stats_needed: Dict[str, str] = {}
+        if reward_signal_minibatches:
+            self.add_reward_signal_dicts(
+                feed_dict,
+                update_dict,
+                stats_needed,
+                reward_signal_minibatches,
+                num_sequences,
+            )
+        update_vals = self._execute_model(feed_dict, update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        return update_stats
+
+    def add_reward_signal_dicts(
+        self,
+        feed_dict: Dict[tf.Tensor, Any],
+        update_dict: Dict[str, tf.Tensor],
+        stats_needed: Dict[str, str],
+        reward_signal_minibatches: Mapping[str, Dict],
+        num_sequences: int,
+    ) -> None:
+        """
+        Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
+        :param feed_dict: Feed dict needed update
+        :param update_dit: Update dict that needs update
+        :param stats_needed: Stats needed to get from the update.
+        :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
+            indexed by name.
+        """
+        for name, r_batch in reward_signal_minibatches.items():
+            feed_dict.update(
+                self.reward_signals[name].prepare_update(
+                    self.policy, r_batch, num_sequences
+                )
+            )
+            update_dict.update(self.reward_signals[name].update_dict)
+            stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
+
+    def _construct_feed_dict(
+        self, policy: TFPolicy, batch: AgentBuffer, num_sequences: int
+    ) -> Dict[tf.Tensor, Any]:
+        """
+        Builds the feed dict for updating the SAC model.
+        :param model: The model to update. May be different when, e.g. using multi-GPU.
+        :param batch: Mini-batch to use to update.
+        :param num_sequences: Number of LSTM sequences in batch.
+        """
+        # Do an optional burn-in for memories
+        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
+        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask = np.tile(burn_in_mask, num_sequences)
+        feed_dict = {
+            policy.batch_size_ph: num_sequences,
+            policy.sequence_length_ph: self.policy.sequence_length,
+            self.next_sequence_length_ph: self.policy.sequence_length,
+            self.policy.mask_input: batch["masks"] * burn_in_mask,
+        }
+        for name in self.reward_signals:
+            feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]
+
+        if self.policy.use_continuous_act:
+            feed_dict[policy.action_holder] = batch["actions"]
+        else:
+            feed_dict[policy.action_holder] = batch["actions"]
+            if self.policy.use_recurrent:
+                feed_dict[policy.prev_action] = batch["prev_action"]
+            feed_dict[policy.action_masks] = batch["action_mask"]
+        if self.policy.use_vec_obs:
+            feed_dict[policy.vector_in] = batch["vector_obs"]
+            feed_dict[self.next_vector_in] = batch["next_vector_in"]
+        if self.policy.vis_obs_size > 0:
+            for i, _ in enumerate(policy.visual_in):
+                _obs = batch["visual_obs%d" % i]
+                feed_dict[policy.visual_in[i]] = _obs
+            for i, _ in enumerate(self.next_visual_in):
+                _obs = batch["next_visual_obs%d" % i]
+                feed_dict[self.next_visual_in[i]] = _obs
+        if self.policy.use_recurrent:
+            feed_dict[policy.memory_in] = [
+                batch["memory"][i]
+                for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+            ]
+            feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
+                self.m_size, batch.num_experiences
+            )
+            feed_dict[self.target_network.memory_in] = self._make_zero_mem(
+                self.m_size // 3, batch.num_experiences
+            )
+        feed_dict[self.dones_holder] = batch["done"]
+        return feed_dict
--- a/ml-agents/mlagents/trainers/sac/network.py
+++ b/ml-agents/mlagents/trainers/sac/network.py
+import logging
+from typing import Dict, Optional
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.models import ModelUtils, EncoderType
+
+# Bounds for a log standard deviation, going by the names. Not referenced in the
+# visible part of this module - presumably consumed by importers, TODO confirm.
+LOG_STD_MAX = 2
+LOG_STD_MIN = -20
+EPSILON = 1e-6  # Small value to avoid divide by zero
+DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05
+CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.
+
+# Logger registered under the "mlagents.trainers" name.
+LOGGER = logging.getLogger("mlagents.trainers")
+
+# Scope prefixes handed to the critic builders: the policy network adds no prefix,
+# the target network builds its variables under "target_network".
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACNetwork:
+    """
+    Base class for an SAC network. Stores the shared hyperparameters and implements the
+    helpers that build the critic: one value head and two Q heads per reward stream.
+    """
+
+    def __init__(
+        self,
+        policy=None,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        """
+        Stores the configuration and declares every attribute the builders fill in later.
+        :param policy: Policy object whose tensors the critic builders read.
+        :param m_size: Memory size. Not used by this base constructor.
+        :param h_size: Size of the hidden layers.
+        :param normalize: Stored as self.normalize.
+        :param use_recurrent: Whether the value and Q encoders get a recurrent layer.
+        :param num_layers: Number of hidden layers of the value and Q encoders.
+        :param stream_names: Names of the reward streams; one head is built per name.
+        :param vis_encode_type: Visual encoder type. Not used by this base constructor.
+        """
+        self.normalize = normalize
+        self.use_recurrent = use_recurrent
+        self.num_layers = num_layers
+        self.stream_names = stream_names
+        self.h_size = h_size
+        self.activ_fn = ModelUtils.swish
+
+        # Sequence length fed to the recurrent encoders.
+        self.sequence_length_ph = tf.placeholder(
+            shape=None, dtype=tf.int32, name="sac_sequence_length"
+        )
+
+        # Everything below is a placeholder attribute; the builders and subclasses
+        # assign the ones they actually create.
+        self.policy_memory_in: Optional[tf.Tensor] = None
+        self.policy_memory_out: Optional[tf.Tensor] = None
+        self.value_memory_in: Optional[tf.Tensor] = None
+        self.value_memory_out: Optional[tf.Tensor] = None
+        self.q1: Optional[tf.Tensor] = None
+        self.q2: Optional[tf.Tensor] = None
+        self.q1_p: Optional[tf.Tensor] = None
+        self.q2_p: Optional[tf.Tensor] = None
+        self.q1_memory_in: Optional[tf.Tensor] = None
+        self.q2_memory_in: Optional[tf.Tensor] = None
+        self.q1_memory_out: Optional[tf.Tensor] = None
+        self.q2_memory_out: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.external_action_in: Optional[tf.Tensor] = None
+        self.log_sigma_sq: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.deterministic_output: Optional[tf.Tensor] = None
+        self.normalized_logprobs: Optional[tf.Tensor] = None
+        self.action_probs: Optional[tf.Tensor] = None
+        self.output_oh: Optional[tf.Tensor] = None
+        self.output_pre: Optional[tf.Tensor] = None
+
+        # Variable lists, filled in by the critic builders.
+        self.value_vars = None
+        self.q_vars = None
+        self.critic_vars = None
+        self.policy_vars = None
+
+        # Per-stream Q heads; these are None until create_q_heads has run.
+        self.q1_heads: Dict[str, tf.Tensor] = None
+        self.q2_heads: Dict[str, tf.Tensor] = None
+        self.q1_pheads: Dict[str, tf.Tensor] = None
+        self.q2_pheads: Dict[str, tf.Tensor] = None
+
+        self.policy = policy
+
+    def get_vars(self, scope):
+        """
+        Returns the entries of the TRAINABLE_VARIABLES collection filtered by scope.
+        :param scope: Scope passed to tf.get_collection as the filter.
+        """
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
+
+    def join_scopes(self, scope_1, scope_2):
+        """
+        Joins two scopes with a "/". Does so safely (i.e., if one of the two scopes is
+        empty, the other one is returned as is and no slash is added).
+        :param scope_1: Leading scope, may be empty.
+        :param scope_2: Trailing scope, may be empty.
+        :return: The joined scope.
+        """
+        if not scope_1:
+            return scope_2
+        if not scope_2:
+            return scope_1
+        else:
+            return "/".join(filter(None, [scope_1, scope_2]))
+
+    def create_value_heads(self, stream_names, hidden_input):
+        """
+        Creates one value estimator head for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_heads is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. Each head is a single dense layer with one
+        output on top of the hidden input.
+        """
+        self.value_heads = {}
+        for name in stream_names:
+            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
+            self.value_heads[name] = value
+        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+
+    def _create_cc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network, for continuous control.
+        The value head is built under "<scope>/critic/value". When create_qs is set, the
+        Q heads are built under "<scope>/critic/q" twice with shared variables: once on
+        hidden_value concatenated with the policy's action_holder (q1, q2) and once on
+        hidden_value concatenated with the policy's output (q1_p, q2_p).
+        :param hidden_value: Encoded observation fed to the critic.
+        :param scope: Scope prefix of the network; may be empty.
+        :param create_qs: Whether to build the Q heads in addition to the value head.
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
+        if create_qs:
+            hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)
+            hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_q,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+            )
+            # Second pass reuses the variables created just above.
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_qp,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+            )
+            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
+        self.critic_vars = self.get_vars(scope)
+
+    def _create_dc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network, for discrete control.
+        The value head is built under "<scope>/critic/value". When create_qs is set, the
+        Q heads are built under "<scope>/critic/q" twice with shared variables, both times
+        on hidden_value alone and with sum(policy.act_size) outputs per head.
+        :param hidden_value: Encoded observation fed to the critic.
+        :param scope: Scope prefix of the network; may be empty.
+        :param create_qs: Whether to build the Q heads in addition to the value head.
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars("/".join([scope, "value"]))
+
+        if create_qs:
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                num_outputs=sum(self.policy.act_size),
+            )
+            # Second pass reuses the variables created just above.
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+                num_outputs=sum(self.policy.act_size),
+            )
+            # NOTE(review): unlike _create_cc_critic, q_vars is collected from the whole
+            # critic scope here, not from the "q" sub-scope - confirm this is intended.
+            self.q_vars = self.get_vars(scope)
+        self.critic_vars = self.get_vars(scope)
+
+    def create_sac_value_head(
+        self, stream_names, hidden_input, num_layers, h_size, scope
+    ):
+        """
+        Builds the value network: an encoder, an optional recurrent layer, and one value
+        estimator head for each reward signal in stream_names (see create_value_heads).
+        When self.use_recurrent is set, the recurrent memory output is stored in
+        self.value_memory_out.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The input of the value network.
+        :param num_layers: Number of hidden layers for value network
+        :param h_size: size of hidden layers for value network
+        :param scope: TF scope for value network.
+        """
+        with tf.variable_scope(scope):
+            value_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
+            )
+            if self.use_recurrent:
+                value_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    value_hidden,
+                    self.value_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_value",
+                )
+                self.value_memory_out = memory_out
+            self.create_value_heads(stream_names, value_hidden)
+
+    def create_q_heads(
+        self,
+        stream_names,
+        hidden_input,
+        num_layers,
+        h_size,
+        scope,
+        reuse=False,
+        num_outputs=1,
+    ):
+        """
+        Creates two q heads for each reward signal in stream_names.
+        Each of the two Q networks has its own encoder, an optional recurrent layer and one
+        dense head per stream. When self.use_recurrent is set, the recurrent memory outputs
+        are stored in self.q1_memory_out and self.q2_memory_out.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The input of the Q networks.
+        :param num_layers: Number of hidden layers for Q network
+        :param h_size: size of hidden layers for Q network
+        :param scope: TF scope for Q network.
+        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
+        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
+        :return: A tuple (q1_heads, q2_heads, q1, q2): the two dictionaries of stream name to
+        Q head, and the mean over streams of each set of heads.
+        """
+        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
+            q1_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
+            )
+            if self.use_recurrent:
+                q1_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    q1_hidden,
+                    self.q1_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q1",
+                )
+                self.q1_memory_out = memory_out
+
+            q1_heads = {}
+            for name in stream_names:
+                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
+                q1_heads[name] = _q1
+
+            # Mean of the per-stream heads.
+            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
+        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
+            q2_hidden = ModelUtils.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
+            )
+            if self.use_recurrent:
+                q2_hidden, memory_out = ModelUtils.create_recurrent_encoder(
+                    q2_hidden,
+                    self.q2_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q2",
+                )
+                self.q2_memory_out = memory_out
+
+            q2_heads = {}
+            for name in stream_names:
+                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
+                q2_heads[name] = _q2
+
+            # Mean of the per-stream heads.
+            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
+
+        return q1_heads, q2_heads, q1, q2
+
+
+class SACTargetNetwork(SACNetwork):
+    """
+    Instantiation for the SAC target network. Only contains a single
+    value estimator and is updated from the Policy Network.
+    It creates its own observation placeholders (and normalizer, if enabled) under
+    the TARGET_SCOPE variable scope.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        """
+        Builds the target network's inputs and its value-only critic.
+        :param policy: Policy whose brain, observation size and flags (normalize,
+            use_recurrent, use_continuous_act) configure the inputs.
+        :param m_size: Width of the recurrent memory placeholder.
+        :param h_size: Size of the hidden layers.
+        :param normalize: Stored by the base class; the normalizer itself is created
+            based on policy.normalize.
+        :param use_recurrent: Whether the value encoder gets a recurrent layer.
+        :param num_layers: Number of hidden layers of the value encoder.
+        :param stream_names: Names of the reward streams.
+        :param vis_encode_type: Type of visual encoder used for the observation stream.
+        """
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        with tf.variable_scope(TARGET_SCOPE):
+            self.visual_in = ModelUtils.create_visual_input_placeholders(
+                policy.brain.camera_resolutions
+            )
+            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
+            if self.policy.normalize:
+                # Own normalizer tensors; they can be overwritten through
+                # copy_normalization.
+                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
+                self.update_normalization_op = normalization_tensors.update_op
+                self.normalization_steps = normalization_tensors.steps
+                self.running_mean = normalization_tensors.running_mean
+                self.running_variance = normalization_tensors.running_variance
+                self.processed_vector_in = ModelUtils.normalize_vector_obs(
+                    self.vector_in,
+                    self.running_mean,
+                    self.running_variance,
+                    self.normalization_steps,
+                )
+            else:
+                self.processed_vector_in = self.vector_in
+                self.update_normalization_op = None
+
+            if self.policy.use_recurrent:
+                self.memory_in = tf.placeholder(
+                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
+                )
+                # Only the value estimator exists here, so it gets the whole memory.
+                self.value_memory_in = self.memory_in
+            # A single stream is requested; only hidden_streams[0] is used below.
+            hidden_streams = ModelUtils.create_observation_streams(
+                self.visual_in,
+                self.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["critic/value/"],
+            )
+        # Value head only: the target network builds no Q heads.
+        if self.policy.use_continuous_act:
+            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        else:
+            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        if self.use_recurrent:
+            self.memory_out = tf.concat(
+                self.value_memory_out, axis=1
+            )  # Needed for Barracuda to work
+
+    def copy_normalization(self, mean, variance, steps):
+        """
+        Copies the mean, variance, and steps into the normalizers of the
+        input of this SACNetwork. Used to copy the normalizer from the policy network
+        to the target network.
+        The normalizer tensors are only created when policy.normalize is set, so this
+        must not be called otherwise.
+        :param mean: Tensor containing the mean.
+        :param variance: Tensor containing the variance
+        :param steps: Tensor containing the number of steps.
+        :return: A single op grouping the three assignments.
+        """
+        update_mean = tf.assign(self.running_mean, mean)
+        update_variance = tf.assign(self.running_variance, variance)
+        update_norm_step = tf.assign(self.normalization_steps, steps)
+        return tf.group([update_mean, update_variance, update_norm_step])
+
+
+class SACPolicyNetwork(SACNetwork):
+    """
+    Instantiation for SAC policy network. Contains a dual Q estimator,
+    a value estimator, and a reference to the actual policy network.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        """
+        Builds the critic (value head and Q heads) on top of the policy's observations.
+        :param policy: Policy providing the observation inputs, the sequence length
+            placeholder and the action tensors used by the critic.
+        :param m_size: Size of each of the three recurrent memories (value, q1, q2).
+        :param h_size: Size of the hidden layers.
+        :param normalize: Stored by the base class.
+        :param use_recurrent: Whether the value and Q encoders get a recurrent layer.
+        :param num_layers: Number of hidden layers of the value and Q encoders.
+        :param stream_names: Names of the reward streams.
+        :param vis_encode_type: Type of visual encoder used for the observation stream.
+        """
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        if self.policy.use_recurrent:
+            self._create_memory_ins(m_size)
+
+        hidden_critic = self._create_observation_in(vis_encode_type)
+        # NOTE(review): self-assignment; a no-op unless `output` is a property with
+        # side effects - confirm and consider removing.
+        self.policy.output = self.policy.output
+        # Use the sequence length of the policy
+        self.sequence_length_ph = self.policy.sequence_length_ph
+
+        if self.policy.use_continuous_act:
+            self._create_cc_critic(hidden_critic, POLICY_SCOPE)
+
+        else:
+            self._create_dc_critic(hidden_critic, POLICY_SCOPE)
+
+        if self.use_recurrent:
+            # Same order as the split performed in _create_memory_ins.
+            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
+            self.memory_out = tf.concat(mem_outs, axis=1)
+
+    def _create_memory_ins(self, m_size):
+        """
+        Creates the memory input placeholder for the LSTMs and splits it into three
+        equal column ranges, assigned in order to the value, q1 and q2 networks.
+        :param m_size: the size of the memory of each network; the placeholder is
+            m_size * 3 wide.
+        """
+        self.memory_in = tf.placeholder(
+            shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
+        )
+
+        # Re-break-up for each network
+        num_mems = 3
+        input_size = self.memory_in.get_shape().as_list()[1]
+        mem_ins = []
+        for i in range(num_mems):
+            _start = input_size // num_mems * i
+            _end = input_size // num_mems * (i + 1)
+            mem_ins.append(self.memory_in[:, _start:_end])
+        self.value_memory_in = mem_ins[0]
+        self.q1_memory_in = mem_ins[1]
+        self.q2_memory_in = mem_ins[2]
+
+    def _create_observation_in(self, vis_encode_type):
+        """
+        Creates the critic's observation stream from the policy's visual and processed
+        vector inputs, and a CNN if needed.
+        :param vis_encode_type: Type of CNN encoder.
+        :return: The hidden tensor of the single critic stream. It is not saved to self
+        since it is used once and thrown away.
+        """
+        with tf.variable_scope(POLICY_SCOPE):
+            # A single stream is requested; only hidden_streams[0] is used below.
+            hidden_streams = ModelUtils.create_observation_streams(
+                self.policy.visual_in,
+                self.policy.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["critic/value/"],
+            )
+        hidden_critic = hidden_streams[0]
+        return hidden_critic
--- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py
+++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py
+import pytest
+
+import numpy as np
+from mlagents.tf_utils import tf
+
+import yaml
+
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.models import EncoderType, ModelUtils
+from mlagents.trainers.exception import UnityTrainerException
+from mlagents.trainers.brain import BrainParameters, CameraResolution
+from mlagents.trainers.tests import mock_brain as mb
+from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
+
+
+@pytest.fixture
+def dummy_config():
+    return yaml.safe_load(
+        """
+        trainer: ppo
+        batch_size: 32
+        beta: 5.0e-3
+        buffer_size: 512
+        epsilon: 0.2
+        hidden_units: 128
+        lambd: 0.95
+        learning_rate: 3.0e-4
+        max_steps: 5.0e4
+        normalize: true
+        num_epoch: 5
+        num_layers: 2
+        time_horizon: 64
+        sequence_length: 64
+        summary_freq: 1000
+        use_recurrent: false
+        normalize: true
+        memory_size: 8
+        curiosity_strength: 0.0
+        curiosity_enc_size: 1
+        summary_path: test
+        model_path: test
+        reward_signals:
+          extrinsic:
+            strength: 1.0
+            gamma: 0.99
+        """
+    )
+
+
+# Action-space and observation sizes handed to mb.setup_mock_brain by create_policy_mock.
+VECTOR_ACTION_SPACE = [2]
+VECTOR_OBS_SPACE = 8
+DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
+# Not referenced in the visible part of this module.
+BUFFER_INIT_SAMPLES = 32
+# Number of agents in the batched step built by test_policy_evaluate.
+NUM_AGENTS = 12
+
+
+def create_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
+    mock_brain = mb.setup_mock_brain(
+        use_discrete,
+        use_visual,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+        discrete_action_space=DISCRETE_ACTION_SPACE,
+    )
+
+    trainer_parameters = dummy_config
+    model_path = "testmodel"
+    trainer_parameters["model_path"] = model_path
+    trainer_parameters["keep_checkpoints"] = 3
+    trainer_parameters["use_recurrent"] = use_rnn
+    policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
+    return policy
+
+
+@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
+@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
+@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
+def test_policy_evaluate(dummy_config, rnn, visual, discrete):
+    # Test evaluate
+    tf.reset_default_graph()
+    policy = create_policy_mock(
+        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
+    )
+    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
+
+    run_out = policy.evaluate(step, list(step.agent_id))
+    if discrete:
+        run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
+    else:
+        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
+
+
+def test_normalization(dummy_config):
+    """
+    Feed two trajectories through NNPolicy.update_normalization and check the running
+    step count, mean and variance reported by the policy after each update.
+    """
+    brain_params = BrainParameters(
+        brain_name="test_brain",
+        vector_observation_space_size=1,
+        camera_resolutions=[],
+        vector_action_space_size=[2],
+        vector_action_descriptions=[],
+        vector_action_space_type=0,
+    )
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+
+    time_horizon = 6
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        vec_obs_size=1,
+        num_vis_obs=0,
+        action_space=[2],
+    )
+    # Change half of the obs to 0
+    for i in range(3):
+        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
+    policy = NNPolicy(0, brain_params, dummy_config, False, False)
+
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])
+
+    # Check that the running mean and variance is correct
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+
+    assert steps == 6
+    assert mean[0] == 0.5
+    # Note: variance is divided by number of steps, and initialized to 1 to avoid
+    # divide by 0. The right answer is 0.25
+    assert (variance[0] - 1) / steps == 0.25
+
+    # Make another update, this time with all 1's
+    time_horizon = 10
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        vec_obs_size=1,
+        num_vis_obs=0,
+        action_space=[2],
+    )
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])
+
+    # Check that the running mean and variance is correct
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+
+    assert steps == 16
+    assert mean[0] == 0.8125
+    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
+
+
+def test_min_visual_size():
+    """
+    For every EncoderType, check that the minimum resolution registered in
+    ModelUtils.MIN_RESOLUTION_FOR_ENCODER builds an encoder, and that a resolution one
+    pixel smaller raises.
+    """
+    # Make sure each EncoderType has an entry in MIN_RESOLUTION_FOR_ENCODER
+    assert set(ModelUtils.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)
+
+    for encoder_type in EncoderType:
+        # Each case is built in its own fresh graph.
+        with tf.Graph().as_default():
+            good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
+            good_res = CameraResolution(
+                width=good_size, height=good_size, num_channels=3
+            )
+            vis_input = ModelUtils.create_visual_input(good_res, "test_min_visual_size")
+            ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
+            enc_func = ModelUtils.get_encoder_for_type(encoder_type)
+            enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
+
+        # Anything under the min size should raise an exception. If not, decrease the min size!
+        # The outer block expects building the encoder itself (last two lines) to fail.
+        with pytest.raises(Exception):
+            with tf.Graph().as_default():
+                bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
+                bad_res = CameraResolution(
+                    width=bad_size, height=bad_size, num_channels=3
+                )
+                vis_input = ModelUtils.create_visual_input(
+                    bad_res, "test_min_visual_size"
+                )
+
+                with pytest.raises(UnityTrainerException):
+                    # Make sure we'd hit a friendly error during model setup time.
+                    ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
+
+                enc_func = ModelUtils.get_encoder_for_type(encoder_type)
+                enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
+
+
+# Invoke pytest when this file is executed directly as a script.
+if __name__ == "__main__":
+    pytest.main()
--- a/ml-agents/mlagents/trainers/common/init.py
+++ b/ml-agents/mlagents/trainers/common/init.py
--- a/ml-agents/mlagents/trainers/common/tf_optimizer.py
+++ b/ml-agents/mlagents/trainers/common/tf_optimizer.py
+from typing import Dict, Any, List, Tuple, Optional
+import numpy as np
+
+from mlagents.tf_utils.tf import tf
+from mlagents.trainers.buffer import AgentBuffer
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.common.optimizer import Optimizer
+from mlagents.trainers.trajectory import SplitObservations
+from mlagents.trainers.components.reward_signals.reward_signal_factory import (
+    create_reward_signal,
+)
+from mlagents.trainers.components.bc.module import BCModule
+
+
+class TFOptimizer(Optimizer):  # pylint: disable=W0223
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Stores the policy and its session, creates the reward signals listed in
+        trainer_params["reward_signals"] and, when a "behavioral_cloning" section is
+        present, a BCModule.
+        :param policy: The TFPolicy this optimizer works on; its session is reused.
+        :param trainer_params: Trainer parameters dictionary. Reads "reward_signals" and,
+        if "behavioral_cloning" is present, "learning_rate" and "batch_size" as well.
+        """
+        self.sess = policy.sess
+        self.policy = policy
+        # update_dict must exist before create_reward_signals, which adds to it.
+        self.update_dict: Dict[str, tf.Tensor] = {}
+        self.value_heads: Dict[str, tf.Tensor] = {}
+        self.create_reward_signals(trainer_params["reward_signals"])
+        # Recurrent-memory attributes start empty here.
+        self.memory_in: tf.Tensor = None
+        self.memory_out: tf.Tensor = None
+        self.m_size: int = 0
+        self.bc_module: Optional[BCModule] = None
+        # Create pretrainer if needed
+        if "behavioral_cloning" in trainer_params:
+            BCModule.check_config(trainer_params["behavioral_cloning"])
+            self.bc_module = BCModule(
+                self.policy,
+                policy_learning_rate=trainer_params["learning_rate"],
+                default_batch_size=trainer_params["batch_size"],
+                default_num_epoch=3,
+                **trainer_params["behavioral_cloning"],
+            )
+
+    def get_trajectory_value_estimates(
+        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
+        """
+        Runs the value heads over a whole trajectory batch and then computes the value
+        estimates for the observation that follows it.
+        :param batch: AgentBuffer holding the trajectory.
+        :param next_obs: Observations following the trajectory, passed on to
+        _get_value_estimates.
+        :param done: Whether the trajectory ended the episode, passed on to
+        _get_value_estimates.
+        :return: A tuple of (value estimates per value head for the batch, value estimates
+        per value head for next_obs).
+        """
+        policy = self.policy
+        feed_dict: Dict[tf.Tensor, Any] = {
+            policy.batch_size_ph: batch.num_experiences,
+            # We want to feed data in batch-wise, not time-wise.
+            policy.sequence_length_ph: batch.num_experiences,
+        }
+
+        if policy.vec_obs_size > 0:
+            feed_dict[policy.vector_in] = batch["vector_obs"]
+        if policy.vis_obs_size > 0:
+            for idx, visual_ph in enumerate(policy.visual_in):
+                feed_dict[visual_ph] = batch["visual_obs%d" % idx]
+        if policy.use_recurrent:
+            # Both the policy memory and the value memory start from zeros.
+            feed_dict[policy.memory_in] = [np.zeros((policy.m_size))]
+            feed_dict[self.memory_in] = [np.zeros((self.m_size))]
+        if policy.prev_action is not None:
+            feed_dict[policy.prev_action] = batch["prev_action"]
+
+        # Defaults for the non-recurrent case.
+        prev_action = None
+        policy_mem = None
+        value_mem = None
+        if policy.use_recurrent:
+            fetches = [self.value_heads, policy.memory_out, self.memory_out]
+            value_estimates, policy_mem, value_mem = self.sess.run(fetches, feed_dict)
+            prev_action = batch["actions"][-1]
+        else:
+            value_estimates = self.sess.run(self.value_heads, feed_dict)
+
+        squeezed_estimates = {}
+        for head_name, head_values in value_estimates.items():
+            squeezed_estimates[head_name] = np.squeeze(head_values, axis=1)
+
+        # We do this in a separate step to feed the memory outs - a further optimization would
+        # be to append to the obs before running sess.run.
+        final_value_estimates = self._get_value_estimates(
+            next_obs, done, policy_mem, value_mem, prev_action
+        )
+
+        return squeezed_estimates, final_value_estimates
+
+    def _get_value_estimates(
+        self,
+        next_obs: List[np.ndarray],
+        done: bool,
+        policy_memory: np.ndarray = None,
+        value_memory: np.ndarray = None,
+        prev_action: np.ndarray = None,
+    ) -> Dict[str, float]:
+        """
+        Generates value estimates for bootstrapping.
+        :param next_obs: Observations of a single step to be used for bootstrapping.
+        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
+        :param policy_memory: Optional value fed to the policy's memory_in placeholder.
+        :param value_memory: Optional value fed to this optimizer's memory_in placeholder.
+        :param prev_action: Optional previous action; only fed when the policy has a
+        prev_action placeholder.
+        :return: The value estimate dictionary with key being the name of the reward signal and the value the
+        corresponding value estimate.
+        """
+
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.batch_size_ph: 1,
+            self.policy.sequence_length_ph: 1,
+        }
+        vec_vis_obs = SplitObservations.from_observations(next_obs)
+        for i, visual_obs in enumerate(vec_vis_obs.visual_observations):
+            feed_dict[self.policy.visual_in[i]] = [visual_obs]
+
+        if self.policy.vec_obs_size > 0:
+            feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
+        if policy_memory is not None:
+            feed_dict[self.policy.memory_in] = policy_memory
+        if value_memory is not None:
+            feed_dict[self.memory_in] = value_memory
+        # The policy's prev_action placeholder can be None (get_trajectory_value_estimates
+        # checks for that too); a None key is not a valid feed_dict entry, so skip it.
+        if prev_action is not None and self.policy.prev_action is not None:
+            feed_dict[self.policy.prev_action] = [prev_action]
+        value_estimates = self.sess.run(self.value_heads, feed_dict)
+
+        value_estimates = {k: float(v) for k, v in value_estimates.items()}
+
+        # If we're done, reassign all of the value estimates that need terminal states.
+        if done:
+            for k in value_estimates:
+                if self.reward_signals[k].use_terminal_states:
+                    value_estimates[k] = 0.0
+
+        return value_estimates
+
+    def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
+        """
+        Builds one reward signal per entry of the config and merges each signal's
+        update_dict into this optimizer's update_dict.
+        :param reward_signal_configs: Mapping from reward signal name to its config.
+        """
+        self.reward_signals = {}
+        for signal_name, signal_config in reward_signal_configs.items():
+            signal = create_reward_signal(self.policy, signal_name, signal_config)
+            self.reward_signals[signal_name] = signal
+            self.update_dict.update(signal.update_dict)
+
+    def create_optimizer_op(
+        self, learning_rate: tf.Tensor, name: str = "Adam"
+    ) -> tf.train.Optimizer:
+        """
+        Creates an Adam optimizer.
+        :param learning_rate: Learning rate passed to the optimizer.
+        :param name: Name given to the optimizer.
+        :return: The tf.train.AdamOptimizer instance.
+        """
+        adam_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
+        return adam_optimizer
+
+    def _execute_model(
+        self, feed_dict: Dict[tf.Tensor, np.ndarray], out_dict: Dict[str, tf.Tensor]
+    ) -> Dict[str, np.ndarray]:
+        """
+        Runs the requested output nodes in the session.
+        :param feed_dict: Input dictionary mapping nodes to input data.
+        :param out_dict: Output dictionary mapping names to nodes.
+        :return: Dictionary mapping each name of out_dict to the value produced by its node.
+        """
+        fetch_nodes = list(out_dict.values())
+        fetched_values = self.sess.run(fetch_nodes, feed_dict=feed_dict)
+        fetch_names = list(out_dict.keys())
+        return dict(zip(fetch_names, fetched_values))
+
+    def _make_zero_mem(self, m_size: int, length: int) -> List[np.ndarray]:
+        """
+        Creates one zero-filled float32 memory vector of size m_size for every
+        policy.sequence_length-sized stride in range(0, length).
+        :param m_size: Size of each memory vector.
+        :param length: Total number of steps to cover.
+        :return: List of zero memory vectors.
+        """
+        zero_memories = []
+        for _ in range(0, length, self.policy.sequence_length):
+            zero_memories.append(np.zeros((m_size), dtype=np.float32))
+        return zero_memories
--- a/ml-agents/mlagents/trainers/common/nn_policy.py
+++ b/ml-agents/mlagents/trainers/common/nn_policy.py
+import logging
+import numpy as np
+from typing import Any, Dict, Optional, List
+
+from mlagents.tf_utils import tf
+
+from mlagents_envs.timers import timed
+from mlagents_envs.base_env import BatchedStepResult
+from mlagents.trainers.brain import BrainParameters
+from mlagents.trainers.models import EncoderType
+from mlagents.trainers.models import ModelUtils
+from mlagents.trainers.tf_policy import TFPolicy
+
+logger = logging.getLogger("mlagents.trainers")
+
+EPSILON = 1e-6  # Small value to avoid divide by zero
+
+
+class NNPolicy(TFPolicy):
+    def __init__(
+        self,
+        seed: int,
+        brain: BrainParameters,
+        trainer_params: Dict[str, Any],
+        is_training: bool,
+        load: bool,
+        tanh_squash: bool = False,
+        reparameterize: bool = False,
+        condition_sigma_on_obs: bool = True,
+        create_tf_graph: bool = True,
+    ):
+        """
+        Neural-network policy mapping observations to actions. Handles discrete and
+        continuous action spaces and, optionally, recurrent networks.
+        :param seed: Random seed.
+        :param brain: Assigned BrainParameters object.
+        :param trainer_params: Defined training parameters. "num_layers" and
+        "hidden_units" are required; "vis_encode_type" defaults to "simple".
+        :param is_training: Whether the model should be trained (not read by this constructor).
+        :param load: Whether a pre-trained model will be loaded or a new one created.
+        :param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
+        :param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
+        :param condition_sigma_on_obs: Stored and later passed to the continuous actor, where it
+        selects between a dense-layer log_sigma and a standalone log_sigma variable.
+        :param create_tf_graph: Whether to build the TF graph at the end of construction.
+        """
+        super().__init__(seed, brain, trainer_params, load)
+        self.grads = None
+        self.update_batch: Optional[tf.Operation] = None
+
+        # Network size: at least one hidden layer is always used.
+        requested_layers = trainer_params["num_layers"]
+        self.h_size = trainer_params["hidden_units"]
+        self.num_layers = 1 if requested_layers < 1 else requested_layers
+        encoder_name = trainer_params.get("vis_encode_type", "simple")
+        self.vis_encode_type = EncoderType(encoder_name)
+
+        # Options consumed when the actor is created.
+        self.tanh_squash = tanh_squash
+        self.reparameterize = reparameterize
+        self.condition_sigma_on_obs = condition_sigma_on_obs
+        self.trainable_variables: List[tf.Variable] = []
+
+        # Non-exposed parameters; these aren't exposed because they don't have a
+        # good explanation and usually shouldn't be touched.
+        self.log_std_min = -20
+        self.log_std_max = 2
+
+        if not create_tf_graph:
+            return
+        self.create_tf_graph()
+
+    def get_trainable_variables(self) -> List[tf.Variable]:
+        """
+        Returns a List of the trainable variables in this policy. If create_tf_graph hasn't
+        been called, returns an empty list.
+        :return: The list stored in self.trainable_variables.
+        """
+        return self.trainable_variables
+
+    def create_tf_graph(self) -> None:
+        """
+        Builds the tensorflow graph needed for this policy.
+        Creates the input placeholders, the observation encoder and the continuous or
+        discrete actor, records the trainable variables, fills self.inference_dict and
+        initializes the graph. If the graph already contains global variables, returns
+        immediately without doing any of this.
+        """
+        with self.graph.as_default():
+            tf.set_random_seed(self.seed)
+            _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
+            if len(_vars) > 0:
+                # We assume the first thing created in the graph is the Policy. If
+                # already populated, don't create more tensors.
+                return
+
+            self.create_input_placeholders()
+            encoded = self._create_encoder(
+                self.visual_in,
+                self.processed_vector_in,
+                self.h_size,
+                self.num_layers,
+                self.vis_encode_type,
+            )
+            # Pick the actor head matching the action space type.
+            if self.use_continuous_act:
+                self._create_cc_actor(
+                    encoded,
+                    self.tanh_squash,
+                    self.reparameterize,
+                    self.condition_sigma_on_obs,
+                )
+            else:
+                self._create_dc_actor(encoded)
+            # Collect trainable variables from the "policy" and "lstm" scopes.
+            self.trainable_variables = tf.get_collection(
+                tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
+            )
+            self.trainable_variables += tf.get_collection(
+                tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
+            )  # LSTMs need to be root scope for Barracuda export
+
+        # Tensors fetched by evaluate().
+        self.inference_dict: Dict[str, tf.Tensor] = {
+            "action": self.output,
+            "log_probs": self.all_log_probs,
+            "entropy": self.entropy,
+        }
+        if self.use_continuous_act:
+            self.inference_dict["pre_action"] = self.output_pre
+        if self.use_recurrent:
+            self.inference_dict["memory_out"] = self.memory_out
+
+        # We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
+        # it will re-load the full graph
+        self._initialize_graph()
+
+    @timed
+    def evaluate(
+        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
+    ) -> Dict[str, Any]:
+        """
+        Runs the policy's inference tensors for the given batch of agents.
+        :param batched_step_result: BatchedStepResult object containing inputs.
+        :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
+        :return: Outputs from network as defined by self.inference_dict.
+        """
+        # One step per agent: batch size is the agent count, sequence length is 1.
+        feed_dict = {
+            self.batch_size_ph: batched_step_result.n_agents(),
+            self.sequence_length_ph: 1,
+        }
+        if self.use_recurrent:
+            # Previous actions are only fed for discrete action spaces.
+            if not self.use_continuous_act:
+                previous_actions = self.retrieve_previous_action(global_agent_ids)
+                feed_dict[self.prev_action] = previous_actions
+            memories = self.retrieve_memories(global_agent_ids)
+            feed_dict[self.memory_in] = memories
+        feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
+        return self._execute_model(feed_dict, self.inference_dict)
+
+    def _create_encoder(
+        self,
+        visual_in: List[tf.Tensor],
+        vector_in: tf.Tensor,
+        h_size: int,
+        num_layers: int,
+        vis_encode_type: EncoderType,
+    ) -> tf.Tensor:
+        """
+        Creates an encoder for visual and vector observations.
+        :param visual_in: List of visual observation input tensors.
+        :param vector_in: Vector observation input tensor.
+        :param h_size: Size of hidden linear layers.
+        :param num_layers: Number of hidden linear layers.
+        :param vis_encode_type: Type of visual encoder to use if visual input.
+        :return: The hidden layer (tf.Tensor) after the encoder.
+        """
+        with tf.variable_scope("policy"):
+            # Use the tensors passed in by the caller rather than reading them from self,
+            # so the arguments are not silently ignored. A single stream is requested.
+            encoded = ModelUtils.create_observation_streams(
+                visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
+            )[0]
+        return encoded
+
+    def _create_cc_actor(
+        self,
+        encoded: tf.Tensor,
+        tanh_squash: bool = False,
+        reparameterize: bool = False,
+        condition_sigma_on_obs: bool = True,
+    ) -> None:
+        """
+        Creates the continuous control actor on top of the encoded observations.
+        Sets self.output, self.output_pre, self.selected_actions, self.all_log_probs,
+        self.log_probs, self.entropy and self.action_holder (plus self.memory_in and
+        self.memory_out when the policy is recurrent).
+        :param encoded: Output tensor of the observation encoder.
+        :param tanh_squash: Whether to use a tanh function, or a clipped output.
+        :param reparameterize: Whether we are using the resampling trick to update the policy.
+        :param condition_sigma_on_obs: If True, log_sigma is the output of a dense layer on the
+        hidden state; otherwise it is a standalone variable initialized to zeros.
+        """
+        if self.use_recurrent:
+            self.memory_in = tf.placeholder(
+                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
+            )
+            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
+                encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
+            )
+
+            self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
+        else:
+            hidden_policy = encoded
+
+        with tf.variable_scope("policy"):
+            mu = tf.layers.dense(
+                hidden_policy,
+                self.act_size[0],
+                activation=None,
+                name="mu",
+                kernel_initializer=ModelUtils.scaled_init(0.01),
+                reuse=tf.AUTO_REUSE,
+            )
+
+            # Policy-dependent log_sigma
+            if condition_sigma_on_obs:
+                log_sigma = tf.layers.dense(
+                    hidden_policy,
+                    self.act_size[0],
+                    activation=None,
+                    name="log_sigma",
+                    kernel_initializer=ModelUtils.scaled_init(0.01),
+                )
+            else:
+                log_sigma = tf.get_variable(
+                    "log_sigma",
+                    [self.act_size[0]],
+                    dtype=tf.float32,
+                    initializer=tf.zeros_initializer(),
+                )
+            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min, self.log_std_max)
+
+            sigma = tf.exp(log_sigma)
+
+            epsilon = tf.random_normal(tf.shape(mu))
+
+            # Sample an action from N(mu, sigma).
+            sampled_policy = mu + sigma * epsilon
+
+            # Stop gradient if we're not doing the resampling trick
+            if not reparameterize:
+                sampled_policy_probs = tf.stop_gradient(sampled_policy)
+            else:
+                sampled_policy_probs = sampled_policy
+
+            # Compute probability of model output.
+            _gauss_pre = -0.5 * (
+                ((sampled_policy_probs - mu) / (sigma + EPSILON)) ** 2
+                + 2 * log_sigma
+                + np.log(2 * np.pi)
+            )
+            # Sum the per-dimension Gaussian log densities into one value per row.
+            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)
+
+        if tanh_squash:
+            self.output_pre = tf.tanh(sampled_policy)
+
+            # Squash correction
+            all_probs -= tf.reduce_sum(
+                tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
+            )
+            self.output = tf.identity(self.output_pre, name="action")
+        else:
+            self.output_pre = sampled_policy
+            # Clip and scale output to ensure actions are always within [-1, 1] range.
+            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
+            self.output = tf.identity(output_post, name="action")
+
+        self.selected_actions = tf.stop_gradient(self.output)
+
+        self.all_log_probs = tf.identity(all_probs, name="action_probs")
+
+        single_dim_entropy = 0.5 * tf.reduce_mean(
+            tf.log(2 * np.pi * np.e) + 2 * log_sigma
+        )
+        # Make entropy the right shape
+        self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy
+
+        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
+        self.log_probs = tf.reduce_sum(
+            (tf.identity(self.all_log_probs)), axis=1, keepdims=True
+        )
+
+        self.action_holder = tf.placeholder(
+            shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
+        )
+
+    def _create_dc_actor(self, encoded: tf.Tensor) -> None:
+        """
+        Creates the discrete control actor on top of the encoded observations.
+        Sets self.output, self.all_log_probs, self.action_probs, self.action_masks,
+        self.action_holder, self.action_oh, self.selected_actions, self.entropy and
+        self.log_probs (plus self.prev_action, self.memory_in and self.memory_out when the
+        policy is recurrent).
+        :param encoded: Output tensor of the observation encoder.
+        """
+        if self.use_recurrent:
+            self.prev_action = tf.placeholder(
+                shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
+            )
+            self.memory_in = tf.placeholder(
+                shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
+            )
+            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
+                encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
+            )
+
+            self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
+        else:
+            hidden_policy = encoded
+
+        # One bias-free dense layer per action branch, sized by that branch's action count.
+        policy_branches = []
+        with tf.variable_scope("policy"):
+            for size in self.act_size:
+                policy_branches.append(
+                    tf.layers.dense(
+                        hidden_policy,
+                        size,
+                        activation=None,
+                        use_bias=False,
+                        kernel_initializer=ModelUtils.scaled_init(0.01),
+                    )
+                )
+
+        raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
+
+        self.action_masks = tf.placeholder(
+            shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
+        )
+        output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
+            raw_log_probs, self.action_masks, self.act_size
+        )
+
+        self.output = tf.identity(output)
+        # Note: in this branch the node named "action" holds the normalized logits.
+        self.all_log_probs = tf.identity(normalized_logits, name="action")
+
+        self.action_holder = tf.placeholder(
+            shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
+        )
+        # One-hot encode the action of every branch and concatenate the encodings.
+        self.action_oh = tf.concat(
+            [
+                tf.one_hot(self.action_holder[:, i], self.act_size[i])
+                for i in range(len(self.act_size))
+            ],
+            axis=1,
+        )
+        self.selected_actions = tf.stop_gradient(self.action_oh)
+
+        # Column offsets delimiting each branch inside the concatenated tensors.
+        action_idx = [0] + list(np.cumsum(self.act_size))
+
+        # Entropy: cross entropy of each branch's softmax with its own logits, summed
+        # over the branches.
+        self.entropy = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=tf.nn.softmax(
+                                self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
+                            ),
+                            logits=self.all_log_probs[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+        )
+
+        # Log probability of the actions in action_holder: negated cross entropy between
+        # the one-hot action and the branch logits, summed over the branches.
+        self.log_probs = tf.reduce_sum(
+            (
+                tf.stack(
+                    [
+                        -tf.nn.softmax_cross_entropy_with_logits_v2(
+                            labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
+                            logits=normalized_logits[
+                                :, action_idx[i] : action_idx[i + 1]
+                            ],
+                        )
+                        for i in range(len(self.act_size))
+                    ],
+                    axis=1,
+                )
+            ),
+            axis=1,
+            keepdims=True,
+        )
--- a/ml-agents/mlagents/trainers/ppo/optimizer.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer.py
-import logging
-from typing import Optional, Any, Dict
-
-import numpy as np
-from mlagents.tf_utils import tf
-from mlagents_envs.timers import timed
-from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
-from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.common.tf_optimizer import TFOptimizer
-from mlagents.trainers.buffer import AgentBuffer
-
-
-logger = logging.getLogger("mlagents.trainers")
-
-
-class PPOOptimizer(TFOptimizer):
-    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
-        """
-        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
-        The PPO optimizer has a value estimator and a loss function.
-        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
-        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
-        """
-        # Create the graph here to give more granular control of the TF graph to the Optimizer.
-        policy.create_tf_graph()
-
-        with policy.graph.as_default():
-            with tf.variable_scope("optimizer/"):
-                super().__init__(policy, trainer_params)
-
-                lr = float(trainer_params["learning_rate"])
-                lr_schedule = LearningRateSchedule(
-                    trainer_params.get("learning_rate_schedule", "linear")
-                )
-                h_size = int(trainer_params["hidden_units"])
-                epsilon = float(trainer_params["epsilon"])
-                beta = float(trainer_params["beta"])
-                max_step = float(trainer_params["max_steps"])
-                num_layers = int(trainer_params["num_layers"])
-                vis_encode_type = EncoderType(
-                    trainer_params.get("vis_encode_type", "simple")
-                )
-                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
-
-                self.stream_names = list(self.reward_signals.keys())
-
-                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
-                self.grads = None
-                self.update_batch: Optional[tf.Operation] = None
-
-                self.stats_name_to_update_name = {
-                    "Losses/Value Loss": "value_loss",
-                    "Losses/Policy Loss": "policy_loss",
-                    "Policy/Learning Rate": "learning_rate",
-                }
-                if self.policy.use_recurrent:
-                    self.m_size = self.policy.m_size
-                    self.memory_in = tf.placeholder(
-                        shape=[None, self.m_size],
-                        dtype=tf.float32,
-                        name="recurrent_value_in",
-                    )
-
-                if num_layers < 1:
-                    num_layers = 1
-                if policy.use_continuous_act:
-                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
-                else:
-                    self._create_dc_critic(h_size, num_layers, vis_encode_type)
-
-                self.learning_rate = ModelUtils.create_learning_rate(
-                    lr_schedule, lr, self.policy.global_step, int(max_step)
-                )
-                self._create_losses(
-                    self.policy.log_probs,
-                    self.old_log_probs,
-                    self.value_heads,
-                    self.policy.entropy,
-                    beta,
-                    epsilon,
-                    lr,
-                    max_step,
-                )
-                self._create_ppo_optimizer_ops()
-
-            self.update_dict.update(
-                {
-                    "value_loss": self.value_loss,
-                    "policy_loss": self.abs_policy_loss,
-                    "update_batch": self.update_batch,
-                    "learning_rate": self.learning_rate,
-                }
-            )
-
-            self.policy.initialize_or_load()
-
-    def _create_cc_critic(
-        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
-    ) -> None:
-        """
-        Creates Continuous control actor-critic model.
-        :param h_size: Size of hidden linear layers.
-        :param num_layers: Number of hidden linear layers.
-        :param vis_encode_type: The type of visual encoder to use.
-        """
-        hidden_stream = ModelUtils.create_observation_streams(
-            self.policy.visual_in,
-            self.policy.processed_vector_in,
-            1,
-            h_size,
-            num_layers,
-            vis_encode_type,
-        )[0]
-
-        if self.policy.use_recurrent:
-            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
-                hidden_stream,
-                self.memory_in,
-                self.policy.sequence_length_ph,
-                name="lstm_value",
-            )
-            self.memory_out = memory_value_out
-        else:
-            hidden_value = hidden_stream
-
-        self.value_heads, self.value = ModelUtils.create_value_heads(
-            self.stream_names, hidden_value
-        )
-        self.all_old_log_probs = tf.placeholder(
-            shape=[None, 1], dtype=tf.float32, name="old_probabilities"
-        )
-
-        self.old_log_probs = tf.reduce_sum(
-            (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
-        )
-
-    def _create_dc_critic(
-        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
-    ) -> None:
-        """
-        Creates the critic (value estimator) used with a discrete-action policy, the
-        "old_probabilities" placeholder, and the op that reduces it to the summed
-        log-probability of the actions that were taken.
-        Sets self.value_heads, self.value, self.all_old_log_probs and self.old_log_probs
-        (and self.memory_out when the policy is recurrent).
-        :param h_size: Size of hidden linear layers.
-        :param num_layers: Number of hidden linear layers.
-        :param vis_encode_type: The type of visual encoder to use.
-        """
-        # A single stream is requested (the literal 1), hence the [0].
-        hidden_stream = ModelUtils.create_observation_streams(
-            self.policy.visual_in,
-            self.policy.processed_vector_in,
-            1,
-            h_size,
-            num_layers,
-            vis_encode_type,
-        )[0]
-
-        if self.policy.use_recurrent:
-            # Run the encoding through the value LSTM, seeded from self.memory_in,
-            # and keep the resulting memory.
-            hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
-                hidden_stream,
-                self.memory_in,
-                self.policy.sequence_length_ph,
-                name="lstm_value",
-            )
-            self.memory_out = memory_value_out
-        else:
-            hidden_value = hidden_stream
-
-        self.value_heads, self.value = ModelUtils.create_value_heads(
-            self.stream_names, hidden_value
-        )
-
-        # One column per action option, all branches concatenated.
-        self.all_old_log_probs = tf.placeholder(
-            shape=[None, sum(self.policy.act_size)],
-            dtype=tf.float32,
-            name="old_probabilities",
-        )
-        # Only the third return value (the normalized logits) is used here.
-        _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
-            self.all_old_log_probs, self.policy.action_masks, self.policy.act_size
-        )
-
-        # Column boundaries of each branch inside the concatenated tensors.
-        action_idx = [0] + list(np.cumsum(self.policy.act_size))
-
-        # For each branch, the negative cross-entropy between the taken action
-        # (self.policy.action_oh, presumably one-hot -- confirm) and the old logits,
-        # i.e. that action's old log-probability. Branches are then summed.
-        self.old_log_probs = tf.reduce_sum(
-            (
-                tf.stack(
-                    [
-                        -tf.nn.softmax_cross_entropy_with_logits_v2(
-                            labels=self.policy.action_oh[
-                                :, action_idx[i] : action_idx[i + 1]
-                            ],
-                            logits=old_normalized_logits[
-                                :, action_idx[i] : action_idx[i + 1]
-                            ],
-                        )
-                        for i in range(len(self.policy.act_size))
-                    ],
-                    axis=1,
-                )
-            ),
-            axis=1,
-            keepdims=True,
-        )
-
-    def _create_losses(
-        self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
-    ):
-        """
-        Creates training-specific Tensorflow ops for PPO models.
-        Sets self.returns_holders, self.old_values, self.advantage, self.value_loss,
-        self.policy_loss, self.abs_policy_loss and self.loss.
-        :param probs: Current policy log-probabilities (their difference with old_probs
-            is exponentiated to form the probability ratio).
-        :param old_probs: Past policy log-probabilities.
-        :param value_heads: Value estimate tensors from each value stream
-        :param entropy: Current policy entropy
-        :param beta: Entropy regularization strength
-        :param epsilon: Value for policy-divergence threshold
-        :param lr: Learning rate. Not used inside this method.
-        :param max_step: Total number of training steps.
-        """
-        # One returns placeholder and one old-value placeholder per value stream.
-        self.returns_holders = {}
-        self.old_values = {}
-        for name in value_heads.keys():
-            returns_holder = tf.placeholder(
-                shape=[None], dtype=tf.float32, name="{}_returns".format(name)
-            )
-            old_value = tf.placeholder(
-                shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
-            )
-            self.returns_holders[name] = returns_holder
-            self.old_values[name] = old_value
-        self.advantage = tf.placeholder(
-            shape=[None], dtype=tf.float32, name="advantages"
-        )
-        # Add a trailing axis; presumably so it lines up with probs/old_probs -- confirm
-        # against the caller.
-        advantage = tf.expand_dims(self.advantage, -1)
-
-        # Linear (power=1.0) decay over max_step global steps:
-        # epsilon goes to 0.1 and beta to 1e-5.
-        decay_epsilon = tf.train.polynomial_decay(
-            epsilon, self.policy.global_step, max_step, 0.1, power=1.0
-        )
-        decay_beta = tf.train.polynomial_decay(
-            beta, self.policy.global_step, max_step, 1e-5, power=1.0
-        )
-
-        value_losses = []
-        for name, head in value_heads.items():
-            # New estimate, limited to within +/- decay_epsilon of the old estimate.
-            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
-                tf.reduce_sum(head, axis=1) - self.old_values[name],
-                -decay_epsilon,
-                decay_epsilon,
-            )
-            v_opt_a = tf.squared_difference(
-                self.returns_holders[name], tf.reduce_sum(head, axis=1)
-            )
-            v_opt_b = tf.squared_difference(
-                self.returns_holders[name], clipped_value_estimate
-            )
-            # Take the larger of the clipped/unclipped errors, then average only over
-            # the entries that self.policy.mask routes to partition 1.
-            value_loss = tf.reduce_mean(
-                tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
-                    1
-                ]
-            )
-            value_losses.append(value_loss)
-        self.value_loss = tf.reduce_mean(value_losses)
-
-        # Probability ratio between the current and the old policy.
-        r_theta = tf.exp(probs - old_probs)
-        p_opt_a = r_theta * advantage
-        p_opt_b = (
-            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
-            * advantage
-        )
-        # Clipped surrogate objective, negated because the optimizer minimizes.
-        self.policy_loss = -tf.reduce_mean(
-            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
-        )
-        # For cleaner stats reporting
-        self.abs_policy_loss = tf.abs(self.policy_loss)
-
-        # Total loss: policy loss + 0.5 * value loss - decayed beta * mean masked entropy.
-        self.loss = (
-            self.policy_loss
-            + 0.5 * self.value_loss
-            - decay_beta
-            * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
-        )
-
-    def _create_ppo_optimizer_ops(self) -> None:
-        """
-        Creates the TF optimizer from self.learning_rate and stores the gradients of
-        self.loss in self.grads and the training op in self.update_batch.
-        """
-        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
-        # self.grads is not consumed by minimize() below, which derives its own gradients.
-        self.grads = self.tf_optimizer.compute_gradients(self.loss)
-        self.update_batch = self.tf_optimizer.minimize(self.loss)
-
-    @timed
-    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
-        """
-        Performs update on model.
-        :param mini_batch: Batch of experiences.
-        :param num_sequences: Number of sequences to process.
-        :return: Results of update.
-        """
-        feed_dict = self._construct_feed_dict(batch, num_sequences)
-        stats_needed = self.stats_name_to_update_name
-        update_stats = {}
-        # Collect feed dicts for all reward signals.
-        for _, reward_signal in self.reward_signals.items():
-            feed_dict.update(
-                reward_signal.prepare_update(self.policy, batch, num_sequences)
-            )
-            stats_needed.update(reward_signal.stats_name_to_update_name)
-
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        return update_stats
-
-    def _construct_feed_dict(
-        self, mini_batch: AgentBuffer, num_sequences: int
-    ) -> Dict[tf.Tensor, Any]:
-        # Do an optional burn-in for memories
-        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
-        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
-        burn_in_mask[range(0, num_burn_in)] = 0
-        burn_in_mask = np.tile(burn_in_mask, num_sequences)
-        feed_dict = {
-            self.policy.batch_size_ph: num_sequences,
-            self.policy.sequence_length_ph: self.policy.sequence_length,
-            self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
-            self.advantage: mini_batch["advantages"],
-            self.all_old_log_probs: mini_batch["action_probs"],
-        }
-        for name in self.reward_signals:
-            feed_dict[self.returns_holders[name]] = mini_batch[
-                "{}_returns".format(name)
-            ]
-            feed_dict[self.old_values[name]] = mini_batch[
-                "{}_value_estimates".format(name)
-            ]
-
-        if self.policy.output_pre is not None and "actions_pre" in mini_batch:
-            feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
-        else:
-            feed_dict[self.policy.action_holder] = mini_batch["actions"]
-            if self.policy.use_recurrent:
-                feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
-            feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
-        if "vector_obs" in mini_batch:
-            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
-        if self.policy.vis_obs_size > 0:
-            for i, _ in enumerate(self.policy.visual_in):
-                feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
-        if self.policy.use_recurrent:
-            feed_dict[self.policy.memory_in] = [
-                mini_batch["memory"][i]
-                for i in range(
-                    0, len(mini_batch["memory"]), self.policy.sequence_length
-                )
-            ]
-            feed_dict[self.memory_in] = self._make_zero_mem(
-                self.m_size, mini_batch.num_experiences
-            )
-        return feed_dict
--- a/ml-agents/mlagents/trainers/sac/network.py
+++ b/ml-agents/mlagents/trainers/sac/network.py
-import logging
-from typing import Dict, Optional
-
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.models import ModelUtils, EncoderType
-
-LOG_STD_MAX = 2
-LOG_STD_MIN = -20
-EPSILON = 1e-6  # Small value to avoid divide by zero
-DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05
-CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.
-
-LOGGER = logging.getLogger("mlagents.trainers")
-
-# Variable-scope prefixes for the two SAC networks. The policy scope is empty, so
-# SACNetwork.join_scopes leaves policy-side critic scopes without a prefix, while
-# target-network variables are created under "target_network".
-POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
-
-
-class SACNetwork:
-    """
-    Base class for an SAC network. Implements methods for creating the actor and critic heads.
-    """
-
-    def __init__(
-        self,
-        policy=None,
-        m_size=None,
-        h_size=128,
-        normalize=False,
-        use_recurrent=False,
-        num_layers=2,
-        stream_names=None,
-        vis_encode_type=EncoderType.SIMPLE,
-    ):
-        self.normalize = normalize
-        self.use_recurrent = use_recurrent
-        self.num_layers = num_layers
-        self.stream_names = stream_names
-        self.h_size = h_size
-        self.activ_fn = ModelUtils.swish
-
-        self.sequence_length_ph = tf.placeholder(
-            shape=None, dtype=tf.int32, name="sac_sequence_length"
-        )
-
-        self.policy_memory_in: Optional[tf.Tensor] = None
-        self.policy_memory_out: Optional[tf.Tensor] = None
-        self.value_memory_in: Optional[tf.Tensor] = None
-        self.value_memory_out: Optional[tf.Tensor] = None
-        self.q1: Optional[tf.Tensor] = None
-        self.q2: Optional[tf.Tensor] = None
-        self.q1_p: Optional[tf.Tensor] = None
-        self.q2_p: Optional[tf.Tensor] = None
-        self.q1_memory_in: Optional[tf.Tensor] = None
-        self.q2_memory_in: Optional[tf.Tensor] = None
-        self.q1_memory_out: Optional[tf.Tensor] = None
-        self.q2_memory_out: Optional[tf.Tensor] = None
-        self.prev_action: Optional[tf.Tensor] = None
-        self.action_masks: Optional[tf.Tensor] = None
-        self.external_action_in: Optional[tf.Tensor] = None
-        self.log_sigma_sq: Optional[tf.Tensor] = None
-        self.entropy: Optional[tf.Tensor] = None
-        self.deterministic_output: Optional[tf.Tensor] = None
-        self.normalized_logprobs: Optional[tf.Tensor] = None
-        self.action_probs: Optional[tf.Tensor] = None
-        self.output_oh: Optional[tf.Tensor] = None
-        self.output_pre: Optional[tf.Tensor] = None
-
-        self.value_vars = None
-        self.q_vars = None
-        self.critic_vars = None
-        self.policy_vars = None
-
-        self.q1_heads: Dict[str, tf.Tensor] = None
-        self.q2_heads: Dict[str, tf.Tensor] = None
-        self.q1_pheads: Dict[str, tf.Tensor] = None
-        self.q2_pheads: Dict[str, tf.Tensor] = None
-
-        self.policy = policy
-
-    def get_vars(self, scope):
-        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
-
-    def join_scopes(self, scope_1, scope_2):
-        """
-        Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't
-        exist, don't add any backslashes)
-        """
-        if not scope_1:
-            return scope_2
-        if not scope_2:
-            return scope_1
-        else:
-            return "/".join(filter(None, [scope_1, scope_2]))
-
-    def create_value_heads(self, stream_names, hidden_input):
-        """
-        Creates one value estimator head for each reward signal in stream_names.
-        Also creates the node corresponding to the mean of all the value heads in self.value.
-        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
-        :param stream_names: The list of reward signal names
-        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
-        of the hidden input.
-        """
-        self.value_heads = {}
-        for name in stream_names:
-            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
-            self.value_heads[name] = value
-        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
-
-    def _create_cc_critic(self, hidden_value, scope, create_qs=True):
-        """
-        Creates just the critic network
-        """
-        scope = self.join_scopes(scope, "critic")
-        self.create_sac_value_head(
-            self.stream_names,
-            hidden_value,
-            self.num_layers,
-            self.h_size,
-            self.join_scopes(scope, "value"),
-        )
-
-        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
-        if create_qs:
-            hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)
-            hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
-            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
-                self.stream_names,
-                hidden_q,
-                self.num_layers,
-                self.h_size,
-                self.join_scopes(scope, "q"),
-            )
-            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
-                self.stream_names,
-                hidden_qp,
-                self.num_layers,
-                self.h_size,
-                self.join_scopes(scope, "q"),
-                reuse=True,
-            )
-            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
-        self.critic_vars = self.get_vars(scope)
-
-    def _create_dc_critic(self, hidden_value, scope, create_qs=True):
-        """
-        Creates just the critic network
-        """
-        scope = self.join_scopes(scope, "critic")
-        self.create_sac_value_head(
-            self.stream_names,
-            hidden_value,
-            self.num_layers,
-            self.h_size,
-            self.join_scopes(scope, "value"),
-        )
-
-        self.value_vars = self.get_vars("/".join([scope, "value"]))
-
-        if create_qs:
-            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
-                self.stream_names,
-                hidden_value,
-                self.num_layers,
-                self.h_size,
-                self.join_scopes(scope, "q"),
-                num_outputs=sum(self.policy.act_size),
-            )
-            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
-                self.stream_names,
-                hidden_value,
-                self.num_layers,
-                self.h_size,
-                self.join_scopes(scope, "q"),
-                reuse=True,
-                num_outputs=sum(self.policy.act_size),
-            )
-            self.q_vars = self.get_vars(scope)
-        self.critic_vars = self.get_vars(scope)
-
-    def create_sac_value_head(
-        self, stream_names, hidden_input, num_layers, h_size, scope
-    ):
-        """
-        Creates one value estimator head for each reward signal in stream_names.
-        Also creates the node corresponding to the mean of all the value heads in self.value.
-        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
-        :param stream_names: The list of reward signal names
-        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
-        of the hidden input.
-        :param num_layers: Number of hidden layers for value network
-        :param h_size: size of hidden layers for value network
-        :param scope: TF scope for value network.
-        """
-        with tf.variable_scope(scope):
-            value_hidden = ModelUtils.create_vector_observation_encoder(
-                hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
-            )
-            if self.use_recurrent:
-                value_hidden, memory_out = ModelUtils.create_recurrent_encoder(
-                    value_hidden,
-                    self.value_memory_in,
-                    self.sequence_length_ph,
-                    name="lstm_value",
-                )
-                self.value_memory_out = memory_out
-            self.create_value_heads(stream_names, value_hidden)
-
-    def create_q_heads(
-        self,
-        stream_names,
-        hidden_input,
-        num_layers,
-        h_size,
-        scope,
-        reuse=False,
-        num_outputs=1,
-    ):
-        """
-        Creates two q heads for each reward signal in stream_names.
-        Also creates the node corresponding to the mean of all the value heads in self.value.
-        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
-        :param stream_names: The list of reward signal names
-        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
-        of the hidden input.
-        :param num_layers: Number of hidden layers for Q network
-        :param h_size: size of hidden layers for Q network
-        :param scope: TF scope for Q network.
-        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
-        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
-        """
-        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
-            q1_hidden = ModelUtils.create_vector_observation_encoder(
-                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
-            )
-            if self.use_recurrent:
-                q1_hidden, memory_out = ModelUtils.create_recurrent_encoder(
-                    q1_hidden,
-                    self.q1_memory_in,
-                    self.sequence_length_ph,
-                    name="lstm_q1",
-                )
-                self.q1_memory_out = memory_out
-
-            q1_heads = {}
-            for name in stream_names:
-                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
-                q1_heads[name] = _q1
-
-            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
-        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
-            q2_hidden = ModelUtils.create_vector_observation_encoder(
-                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
-            )
-            if self.use_recurrent:
-                q2_hidden, memory_out = ModelUtils.create_recurrent_encoder(
-                    q2_hidden,
-                    self.q2_memory_in,
-                    self.sequence_length_ph,
-                    name="lstm_q2",
-                )
-                self.q2_memory_out = memory_out
-
-            q2_heads = {}
-            for name in stream_names:
-                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
-                q2_heads[name] = _q2
-
-            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
-
-        return q1_heads, q2_heads, q1, q2
-
-
-class SACTargetNetwork(SACNetwork):
-    """
-    Instantiation for the SAC target network. Only contains a single
-    value estimator and is updated from the Policy Network.
-    It creates its own observation placeholders (and normalizer) under the
-    "target_network" scope instead of sharing the policy's inputs.
-    """
-
-    def __init__(
-        self,
-        policy,
-        m_size=None,
-        h_size=128,
-        normalize=False,
-        use_recurrent=False,
-        num_layers=2,
-        stream_names=None,
-        vis_encode_type=EncoderType.SIMPLE,
-    ):
-        """
-        Builds the target network's inputs and its value head (no Q heads).
-        :param policy: Policy supplying brain info, observation sizes and flags.
-        :param m_size: Width of the recurrent memory placeholder (recurrent only).
-        :param h_size: Size of hidden layers.
-        :param normalize: Forwarded to the base class; the normalizer itself is
-            created based on policy.normalize.
-        :param use_recurrent: Whether the value head gets an LSTM layer.
-        :param num_layers: Number of hidden layers of the value head's encoder.
-        :param stream_names: Names of the reward signals.
-        :param vis_encode_type: Type of visual encoder to use.
-        """
-        super().__init__(
-            policy,
-            m_size,
-            h_size,
-            normalize,
-            use_recurrent,
-            num_layers,
-            stream_names,
-            vis_encode_type,
-        )
-        with tf.variable_scope(TARGET_SCOPE):
-            self.visual_in = ModelUtils.create_visual_input_placeholders(
-                policy.brain.camera_resolutions
-            )
-            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
-            if self.policy.normalize:
-                # Separate normalizer for the target inputs; copy_normalization()
-                # is the op that overwrites its statistics.
-                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
-                self.update_normalization_op = normalization_tensors.update_op
-                self.normalization_steps = normalization_tensors.steps
-                self.running_mean = normalization_tensors.running_mean
-                self.running_variance = normalization_tensors.running_variance
-                self.processed_vector_in = ModelUtils.normalize_vector_obs(
-                    self.vector_in,
-                    self.running_mean,
-                    self.running_variance,
-                    self.normalization_steps,
-                )
-            else:
-                self.processed_vector_in = self.vector_in
-                self.update_normalization_op = None
-
-            if self.policy.use_recurrent:
-                self.memory_in = tf.placeholder(
-                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
-                )
-                # The target only has a value network, so all of the memory is its own.
-                self.value_memory_in = self.memory_in
-            # One stream; the 0 after h_size is presumably the hidden-layer count
-            # (the value head adds its own encoder layers) -- confirm.
-            hidden_streams = ModelUtils.create_observation_streams(
-                self.visual_in,
-                self.processed_vector_in,
-                1,
-                self.h_size,
-                0,
-                vis_encode_type=vis_encode_type,
-                stream_scopes=["critic/value/"],
-            )
-        # create_qs=False: only the value head is created for the target network.
-        if self.policy.use_continuous_act:
-            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
-        else:
-            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
-        if self.use_recurrent:
-            self.memory_out = tf.concat(
-                self.value_memory_out, axis=1
-            )  # Needed for Barracuda to work
-
-    def copy_normalization(self, mean, variance, steps):
-        """
-        Copies the mean, variance, and steps into the normalizers of the
-        input of this SACNetwork. Used to copy the normalizer from the policy network
-        to the target network.
-        The normalizer variables only exist when the network was built with
-        policy.normalize set.
-        :param mean: Tensor containing the mean.
-        :param variance: Tensor containing the variance
-        :param steps: Tensor containing the number of steps.
-        :return: A single op grouping the three assignments.
-        """
-        update_mean = tf.assign(self.running_mean, mean)
-        update_variance = tf.assign(self.running_variance, variance)
-        update_norm_step = tf.assign(self.normalization_steps, steps)
-        return tf.group([update_mean, update_variance, update_norm_step])
-
-
-class SACPolicyNetwork(SACNetwork):
-    """
-    Instantiation for SAC policy network. Contains a dual Q estimator,
-    a value estimator, and a reference to the actual policy network.
-    """
-
-    def __init__(
-        self,
-        policy,
-        m_size=None,
-        h_size=128,
-        normalize=False,
-        use_recurrent=False,
-        num_layers=2,
-        stream_names=None,
-        vis_encode_type=EncoderType.SIMPLE,
-    ):
-        """
-        Builds the value and Q heads on top of the policy's own observation inputs.
-        :param policy: Policy whose inputs, outputs and flags the critics use.
-        :param m_size: Memory size of one network; the memory placeholder created
-            here is three times as wide.
-        :param h_size: Size of hidden layers.
-        :param normalize: Forwarded to the base class.
-        :param use_recurrent: Whether the heads get an LSTM layer.
-        :param num_layers: Number of hidden layers of each head's encoder.
-        :param stream_names: Names of the reward signals.
-        :param vis_encode_type: Type of visual encoder to use.
-        """
-        super().__init__(
-            policy,
-            m_size,
-            h_size,
-            normalize,
-            use_recurrent,
-            num_layers,
-            stream_names,
-            vis_encode_type,
-        )
-        if self.policy.use_recurrent:
-            self._create_memory_ins(m_size)
-
-        hidden_critic = self._create_observation_in(vis_encode_type)
-        # NOTE(review): self-assignment; looks like a no-op unless `output` is a
-        # property with side effects -- confirm and consider removing.
-        self.policy.output = self.policy.output
-        # Use the sequence length of the policy
-        self.sequence_length_ph = self.policy.sequence_length_ph
-
-        if self.policy.use_continuous_act:
-            self._create_cc_critic(hidden_critic, POLICY_SCOPE)
-
-        else:
-            self._create_dc_critic(hidden_critic, POLICY_SCOPE)
-
-        if self.use_recurrent:
-            # Same order in which _create_memory_ins splits the input memory.
-            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
-            self.memory_out = tf.concat(mem_outs, axis=1)
-
-    def _create_memory_ins(self, m_size):
-        """
-        Creates the memory input placeholder for the LSTMs and splits it into three
-        equal slices: value, Q1 and Q2 (in that order).
-        :param m_size: Memory size of a single network; the placeholder is 3 * m_size wide.
-        """
-        self.memory_in = tf.placeholder(
-            shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
-        )
-
-        # Re-break-up for each network
-        num_mems = 3
-        input_size = self.memory_in.get_shape().as_list()[1]
-        mem_ins = []
-        for i in range(num_mems):
-            _start = input_size // num_mems * i
-            _end = input_size // num_mems * (i + 1)
-            mem_ins.append(self.memory_in[:, _start:_end])
-        self.value_memory_in = mem_ins[0]
-        self.q1_memory_in = mem_ins[1]
-        self.q2_memory_in = mem_ins[2]
-
-    def _create_observation_in(self, vis_encode_type):
-        """
-        Creates the critic's observation stream (and a CNN if needed) on top of the
-        policy's visual and processed vector inputs.
-        :param vis_encode_type: Type of CNN encoder.
-        :return: The hidden critic stream. It is not saved to self since it is used
-        once and thrown away.
-        """
-        with tf.variable_scope(POLICY_SCOPE):
-            hidden_streams = ModelUtils.create_observation_streams(
-                self.policy.visual_in,
-                self.policy.processed_vector_in,
-                1,
-                self.h_size,
-                0,
-                vis_encode_type=vis_encode_type,
-                stream_scopes=["critic/value/"],
-            )
-        hidden_critic = hidden_streams[0]
-        return hidden_critic
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
-import logging
-import numpy as np
-from typing import Dict, List, Optional, Any, Mapping
-
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
-from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
-from mlagents.trainers.common.tf_optimizer import TFOptimizer
-from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents_envs.timers import timed
-
-EPSILON = 1e-6  # Small value to avoid divide by zero
-
-LOGGER = logging.getLogger("mlagents.trainers")
-
-POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
-
-
-class SACOptimizer(TFOptimizer):
-    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
-        """
-        Takes a Policy and a Dict of trainer parameters and creates an SAC Optimizer
-        around the policy: a policy-side critic network (value + twin Q), a target
-        network, the losses and the optimizer ops.
-        :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
-        :param trainer_params: Trainer parameters dictionary. Reads "learning_rate",
-            "hidden_units", "max_steps", "num_layers" and "reward_signals" (required),
-            and "learning_rate_schedule", "vis_encode_type", "tau", "burn_in_ratio"
-            and "init_entcoef" (optional, with defaults).
-        """
-        # Create the graph here to give more granular control of the TF graph to the Optimizer.
-        policy.create_tf_graph()
-
-        with policy.graph.as_default():
-            with tf.variable_scope(""):
-                super().__init__(policy, trainer_params)
-                lr = float(trainer_params["learning_rate"])
-                lr_schedule = LearningRateSchedule(
-                    trainer_params.get("learning_rate_schedule", "constant")
-                )
-                self.policy = policy
-                self.act_size = self.policy.act_size
-                h_size = int(trainer_params["hidden_units"])
-                max_step = float(trainer_params["max_steps"])
-                num_layers = int(trainer_params["num_layers"])
-                vis_encode_type = EncoderType(
-                    trainer_params.get("vis_encode_type", "simple")
-                )
-                self.tau = trainer_params.get("tau", 0.005)
-                self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))
-
-                # Non-exposed SAC parameters
-                self.discrete_target_entropy_scale = (
-                    0.2
-                )  # Roughly equal to e-greedy 0.05
-                self.continuous_target_entropy_scale = 1.0
-
-                self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
-                stream_names = list(self.reward_signals.keys())
-                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
-                self.gammas = [
-                    _val["gamma"] for _val in trainer_params["reward_signals"].values()
-                ]
-                # Per-stream flag variable, initially 1.0 ...
-                self.use_dones_in_backup = {
-                    name: tf.Variable(1.0) for name in stream_names
-                }
-                # ... and the op that switches that stream's flag to 0.0.
-                self.disable_use_dones = {
-                    name: self.use_dones_in_backup[name].assign(0.0)
-                    for name in stream_names
-                }
-
-                # The critic heads are built with at least one hidden layer.
-                if num_layers < 1:
-                    num_layers = 1
-
-                self.target_init_op: List[tf.Tensor] = []
-                self.target_update_op: List[tf.Tensor] = []
-                self.update_batch_policy: Optional[tf.Operation] = None
-                self.update_batch_value: Optional[tf.Operation] = None
-                self.update_batch_entropy: Optional[tf.Operation] = None
-
-                self.policy_network = SACPolicyNetwork(
-                    policy=self.policy,
-                    m_size=self.policy.m_size,  # 3x policy.m_size
-                    h_size=h_size,
-                    normalize=self.policy.normalize,
-                    use_recurrent=self.policy.use_recurrent,
-                    num_layers=num_layers,
-                    stream_names=stream_names,
-                    vis_encode_type=vis_encode_type,
-                )
-                self.target_network = SACTargetNetwork(
-                    policy=self.policy,
-                    m_size=self.policy.m_size,  # 1x policy.m_size
-                    h_size=h_size,
-                    normalize=self.policy.normalize,
-                    use_recurrent=self.policy.use_recurrent,
-                    num_layers=num_layers,
-                    stream_names=stream_names,
-                    vis_encode_type=vis_encode_type,
-                )
-                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
-                self.m_size = 3 * self.policy.m_size
-                self._create_inputs_and_outputs()
-                self.learning_rate = ModelUtils.create_learning_rate(
-                    lr_schedule, lr, self.policy.global_step, int(max_step)
-                )
-                self._create_losses(
-                    self.policy_network.q1_heads,
-                    self.policy_network.q2_heads,
-                    lr,
-                    int(max_step),
-                    stream_names,
-                    discrete=not self.policy.use_continuous_act,
-                )
-                self._create_sac_optimizer_ops()
-
-                self.selected_actions = (
-                    self.policy.selected_actions
-                )  # For GAIL and other reward signals
-                if self.policy.normalize:
-                    target_update_norm = self.target_network.copy_normalization(
-                        self.policy.running_mean,
-                        self.policy.running_variance,
-                        self.policy.normalization_steps,
-                    )
-                    # Update the normalization of the optimizer when the policy does.
-                    self.policy.update_normalization_op = tf.group(
-                        [self.policy.update_normalization_op, target_update_norm]
-                    )
-
-                self.policy.initialize_or_load()
-
-        # Reported stat name -> key into self.update_dict.
-        self.stats_name_to_update_name = {
-            "Losses/Value Loss": "value_loss",
-            "Losses/Policy Loss": "policy_loss",
-            "Losses/Q1 Loss": "q1_loss",
-            "Losses/Q2 Loss": "q2_loss",
-            "Policy/Entropy Coeff": "entropy_coef",
-            "Policy/Learning Rate": "learning_rate",
-        }
-
-        # Tensors and ops keyed by update name. The loss and update_batch_* attributes
-        # are presumably set by _create_losses / _create_sac_optimizer_ops above.
-        self.update_dict = {
-            "value_loss": self.total_value_loss,
-            "policy_loss": self.policy_loss,
-            "q1_loss": self.q1_loss,
-            "q2_loss": self.q2_loss,
-            "entropy_coef": self.ent_coef,
-            "entropy": self.policy.entropy,
-            "update_batch": self.update_batch_policy,
-            "update_value": self.update_batch_value,
-            "update_entropy": self.update_batch_entropy,
-            "learning_rate": self.learning_rate,
-        }
-
-    def _create_inputs_and_outputs(self) -> None:
-        """
-        Assign the higher-level SACModel's inputs and outputs to those of its policy or
-        target network.
-        """
-        self.vector_in = self.policy.vector_in
-        self.visual_in = self.policy.visual_in
-        self.next_vector_in = self.target_network.vector_in
-        self.next_visual_in = self.target_network.visual_in
-        self.action_holder = self.policy.action_holder
-        self.sequence_length_ph = self.policy.sequence_length_ph
-        self.next_sequence_length_ph = self.target_network.sequence_length_ph
-        if not self.policy.use_continuous_act:
-            self.action_masks = self.policy_network.action_masks
-        else:
-            self.output_pre = self.policy_network.output_pre
-
-        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
-        self.value = tf.identity(
-            self.policy_network.value, name="value_estimate_unused"
-        )
-        self.value_heads = self.policy_network.value_heads
-        self.dones_holder = tf.placeholder(
-            shape=[None], dtype=tf.float32, name="dones_holder"
-        )
-
-        if self.policy.use_recurrent:
-            self.memory_in = self.policy_network.memory_in
-            self.memory_out = self.policy_network.memory_out
-            if not self.policy.use_continuous_act:
-                self.prev_action = self.policy_network.prev_action
-            self.next_memory_in = self.target_network.memory_in
-
-    def _create_losses(
-        self,
-        q1_streams: Dict[str, tf.Tensor],
-        q2_streams: Dict[str, tf.Tensor],
-        lr: tf.Tensor,
-        max_step: int,
-        stream_names: List[str],
-        discrete: bool = False,
-    ) -> None:
-        """
-        Creates training-specific Tensorflow ops for SAC models.
-        :param q1_streams: Q1 streams from policy network
-        :param q1_streams: Q2 streams from policy network
-        :param lr: Learning rate
-        :param max_step: Total number of training steps.
-        :param stream_names: List of reward stream names.
-        :param discrete: Whether or not to use discrete action losses.
-        """
-
-        if discrete:
-            self.target_entropy = [
-                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
-                for i in self.act_size
-            ]
-            discrete_action_probs = tf.exp(self.policy.all_log_probs)
-            per_action_entropy = discrete_action_probs * self.policy.all_log_probs
-        else:
-            self.target_entropy = (
-                -1
-                * self.continuous_target_entropy_scale
-                * np.prod(self.act_size[0]).astype(np.float32)
-            )
-
-        self.rewards_holders = {}
-        self.min_policy_qs = {}
-
-        for name in stream_names:
-            if discrete:
-                _branched_mpq1 = self._apply_as_branches(
-                    self.policy_network.q1_pheads[name] * discrete_action_probs
-                )
-                branched_mpq1 = tf.stack(
-                    [
-                        tf.reduce_sum(_br, axis=1, keep_dims=True)
-                        for _br in _branched_mpq1
-                    ]
-                )
-                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
-
-                _branched_mpq2 = self._apply_as_branches(
-                    self.policy_network.q2_pheads[name] * discrete_action_probs
-                )
-                branched_mpq2 = tf.stack(
-                    [
-                        tf.reduce_sum(_br, axis=1, keep_dims=True)
-                        for _br in _branched_mpq2
-                    ]
-                )
-                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
-
-                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
-            else:
-                self.min_policy_qs[name] = tf.minimum(
-                    self.policy_network.q1_pheads[name],
-                    self.policy_network.q2_pheads[name],
-                )
-
-            rewards_holder = tf.placeholder(
-                shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
-            )
-            self.rewards_holders[name] = rewards_holder
-
-        q1_losses = []
-        q2_losses = []
-        # Multiple q losses per stream
-        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
-        for i, name in enumerate(stream_names):
-            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
-
-            q_backup = tf.stop_gradient(
-                _expanded_rewards
-                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
-                * self.gammas[i]
-                * self.target_network.value_heads[name]
-            )
-
-            if discrete:
-                # We need to break up the Q functions by branch, and update them individually.
-                branched_q1_stream = self._apply_as_branches(
-                    self.policy.action_oh * q1_streams[name]
-                )
-                branched_q2_stream = self._apply_as_branches(
-                    self.policy.action_oh * q2_streams[name]
-                )
-
-                # Reduce each branch into scalar
-                branched_q1_stream = [
-                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
-                    for _branch in branched_q1_stream
-                ]
-                branched_q2_stream = [
-                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
-                    for _branch in branched_q2_stream
-                ]
-
-                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
-                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
-
-            else:
-                q1_stream = q1_streams[name]
-                q2_stream = q2_streams[name]
-
-            _q1_loss = 0.5 * tf.reduce_mean(
-                tf.to_float(self.policy.mask)
-                * tf.squared_difference(q_backup, q1_stream)
-            )
-
-            _q2_loss = 0.5 * tf.reduce_mean(
-                tf.to_float(self.policy.mask)
-                * tf.squared_difference(q_backup, q2_stream)
-            )
-
-            q1_losses.append(_q1_loss)
-            q2_losses.append(_q2_loss)
-
-        self.q1_loss = tf.reduce_mean(q1_losses)
-        self.q2_loss = tf.reduce_mean(q2_losses)
-
-        # Learn entropy coefficient
-        if discrete:
-            # Create a log_ent_coef for each branch
-            self.log_ent_coef = tf.get_variable(
-                "log_ent_coef",
-                dtype=tf.float32,
-                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
-                    np.float32
-                ),
-                trainable=True,
-            )
-        else:
-            self.log_ent_coef = tf.get_variable(
-                "log_ent_coef",
-                dtype=tf.float32,
-                initializer=np.log(self.init_entcoef).astype(np.float32),
-                trainable=True,
-            )
-
-        self.ent_coef = tf.exp(self.log_ent_coef)
-        if discrete:
-            # We also have to do a different entropy and target_entropy per branch.
-            branched_per_action_ent = self._apply_as_branches(per_action_entropy)
-            branched_ent_sums = tf.stack(
-                [
-                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
-                    for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
-                ],
-                axis=1,
-            )
-            self.entropy_loss = -tf.reduce_mean(
-                tf.to_float(self.policy.mask)
-                * tf.reduce_mean(
-                    self.log_ent_coef
-                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
-                    axis=1,
-                )
-            )
-
-            # Same with policy loss, we have to do the loss per branch and average them,
-            # so that larger branches don't get more weight.
-            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
-            branched_q_term = self._apply_as_branches(
-                discrete_action_probs * self.policy_network.q1_p
-            )
-
-            branched_policy_loss = tf.stack(
-                [
-                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
-                    for i, (_lp, _qt) in enumerate(
-                        zip(branched_per_action_ent, branched_q_term)
-                    )
-                ]
-            )
-            self.policy_loss = tf.reduce_mean(
-                tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
-            )
-
-            # Do vbackup entropy bonus per branch as well.
-            branched_ent_bonus = tf.stack(
-                [
-                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
-                    for i, _lp in enumerate(branched_per_action_ent)
-                ]
-            )
-            value_losses = []
-            for name in stream_names:
-                v_backup = tf.stop_gradient(
-                    self.min_policy_qs[name]
-                    - tf.reduce_mean(branched_ent_bonus, axis=0)
-                )
-                value_losses.append(
-                    0.5
-                    * tf.reduce_mean(
-                        tf.to_float(self.policy.mask)
-                        * tf.squared_difference(
-                            self.policy_network.value_heads[name], v_backup
-                        )
-                    )
-                )
-
-        else:
-            self.entropy_loss = -tf.reduce_mean(
-                self.log_ent_coef
-                * tf.to_float(self.policy.mask)
-                * tf.stop_gradient(
-                    tf.reduce_sum(
-                        self.policy.all_log_probs + self.target_entropy,
-                        axis=1,
-                        keep_dims=True,
-                    )
-                )
-            )
-            batch_policy_loss = tf.reduce_mean(
-                self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
-                axis=1,
-            )
-            self.policy_loss = tf.reduce_mean(
-                tf.to_float(self.policy.mask) * batch_policy_loss
-            )
-
-            value_losses = []
-            for name in stream_names:
-                v_backup = tf.stop_gradient(
-                    self.min_policy_qs[name]
-                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
-                )
-                value_losses.append(
-                    0.5
-                    * tf.reduce_mean(
-                        tf.to_float(self.policy.mask)
-                        * tf.squared_difference(
-                            self.policy_network.value_heads[name], v_backup
-                        )
-                    )
-                )
-        self.value_loss = tf.reduce_mean(value_losses)
-
-        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
-
-        self.entropy = self.policy_network.entropy
-
-    def _apply_as_branches(self, concat_logits: tf.Tensor) -> List[tf.Tensor]:
-        """
-        Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per
-        action branch
-        """
-        action_idx = [0] + list(np.cumsum(self.act_size))
-        branches_logits = [
-            concat_logits[:, action_idx[i] : action_idx[i + 1]]
-            for i in range(len(self.act_size))
-        ]
-        return branches_logits
-
-    def _create_sac_optimizer_ops(self) -> None:
-        """
-        Creates the Adam optimizers and update ops for SAC, including
-        the policy, value, and entropy updates, as well as the target network update.
-        """
-        policy_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_policy_opt"
-        )
-        entropy_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_entropy_opt"
-        )
-        value_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_value_opt"
-        )
-
-        self.target_update_op = [
-            tf.assign(target, (1 - self.tau) * target + self.tau * source)
-            for target, source in zip(
-                self.target_network.value_vars, self.policy_network.value_vars
-            )
-        ]
-        LOGGER.debug("value_vars")
-        self.print_all_vars(self.policy_network.value_vars)
-        LOGGER.debug("targvalue_vars")
-        self.print_all_vars(self.target_network.value_vars)
-        LOGGER.debug("critic_vars")
-        self.print_all_vars(self.policy_network.critic_vars)
-        LOGGER.debug("q_vars")
-        self.print_all_vars(self.policy_network.q_vars)
-        LOGGER.debug("policy_vars")
-        policy_vars = self.policy.get_trainable_variables()
-        self.print_all_vars(policy_vars)
-
-        self.target_init_op = [
-            tf.assign(target, source)
-            for target, source in zip(
-                self.target_network.value_vars, self.policy_network.value_vars
-            )
-        ]
-
-        self.update_batch_policy = policy_optimizer.minimize(
-            self.policy_loss, var_list=policy_vars
-        )
-
-        # Make sure policy is updated first, then value, then entropy.
-        with tf.control_dependencies([self.update_batch_policy]):
-            self.update_batch_value = value_optimizer.minimize(
-                self.total_value_loss, var_list=self.policy_network.critic_vars
-            )
-            # Add entropy coefficient optimization operation
-            with tf.control_dependencies([self.update_batch_value]):
-                self.update_batch_entropy = entropy_optimizer.minimize(
-                    self.entropy_loss, var_list=self.log_ent_coef
-                )
-
-    def print_all_vars(self, variables):
-        for _var in variables:
-            LOGGER.debug(_var)
-
-    @timed
-    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
-        """
-        Updates model using buffer.
-        :param num_sequences: Number of trajectories in batch.
-        :param batch: Experience mini-batch.
-        :param update_target: Whether or not to update target value network
-        :param reward_signal_batches: Minibatches to use for updating the reward signals,
-            indexed by name. If none, don't update the reward signals.
-        :return: Output from update process.
-        """
-        feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
-        stats_needed = self.stats_name_to_update_name
-        update_stats: Dict[str, float] = {}
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        # Update target network. By default, target update happens at every policy update.
-        self.sess.run(self.target_update_op)
-        return update_stats
-
-    def update_reward_signals(
-        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
-    ) -> Dict[str, float]:
-        """
-        Only update the reward signals.
-        :param reward_signal_batches: Minibatches to use for updating the reward signals,
-            indexed by name. If none, don't update the reward signals.
-        """
-        # Collect feed dicts for all reward signals.
-        feed_dict: Dict[tf.Tensor, Any] = {}
-        update_dict: Dict[str, tf.Tensor] = {}
-        update_stats: Dict[str, float] = {}
-        stats_needed: Dict[str, str] = {}
-        if reward_signal_minibatches:
-            self.add_reward_signal_dicts(
-                feed_dict,
-                update_dict,
-                stats_needed,
-                reward_signal_minibatches,
-                num_sequences,
-            )
-        update_vals = self._execute_model(feed_dict, update_dict)
-        for stat_name, update_name in stats_needed.items():
-            update_stats[stat_name] = update_vals[update_name]
-        return update_stats
-
-    def add_reward_signal_dicts(
-        self,
-        feed_dict: Dict[tf.Tensor, Any],
-        update_dict: Dict[str, tf.Tensor],
-        stats_needed: Dict[str, str],
-        reward_signal_minibatches: Mapping[str, Dict],
-        num_sequences: int,
-    ) -> None:
-        """
-        Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
-        :param feed_dict: Feed dict needed update
-        :param update_dit: Update dict that needs update
-        :param stats_needed: Stats needed to get from the update.
-        :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
-            indexed by name.
-        """
-        for name, r_batch in reward_signal_minibatches.items():
-            feed_dict.update(
-                self.reward_signals[name].prepare_update(
-                    self.policy, r_batch, num_sequences
-                )
-            )
-            update_dict.update(self.reward_signals[name].update_dict)
-            stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
-
-    def _construct_feed_dict(
-        self, policy: TFPolicy, batch: AgentBuffer, num_sequences: int
-    ) -> Dict[tf.Tensor, Any]:
-        """
-        Builds the feed dict for updating the SAC model.
-        :param model: The model to update. May be different when, e.g. using multi-GPU.
-        :param batch: Mini-batch to use to update.
-        :param num_sequences: Number of LSTM sequences in batch.
-        """
-        # Do an optional burn-in for memories
-        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
-        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
-        burn_in_mask[range(0, num_burn_in)] = 0
-        burn_in_mask = np.tile(burn_in_mask, num_sequences)
-        feed_dict = {
-            policy.batch_size_ph: num_sequences,
-            policy.sequence_length_ph: self.policy.sequence_length,
-            self.next_sequence_length_ph: self.policy.sequence_length,
-            self.policy.mask_input: batch["masks"] * burn_in_mask,
-        }
-        for name in self.reward_signals:
-            feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]
-
-        if self.policy.use_continuous_act:
-            feed_dict[policy.action_holder] = batch["actions"]
-        else:
-            feed_dict[policy.action_holder] = batch["actions"]
-            if self.policy.use_recurrent:
-                feed_dict[policy.prev_action] = batch["prev_action"]
-            feed_dict[policy.action_masks] = batch["action_mask"]
-        if self.policy.use_vec_obs:
-            feed_dict[policy.vector_in] = batch["vector_obs"]
-            feed_dict[self.next_vector_in] = batch["next_vector_in"]
-        if self.policy.vis_obs_size > 0:
-            for i, _ in enumerate(policy.visual_in):
-                _obs = batch["visual_obs%d" % i]
-                feed_dict[policy.visual_in[i]] = _obs
-            for i, _ in enumerate(self.next_visual_in):
-                _obs = batch["next_visual_obs%d" % i]
-                feed_dict[self.next_visual_in[i]] = _obs
-        if self.policy.use_recurrent:
-            feed_dict[policy.memory_in] = [
-                batch["memory"][i]
-                for i in range(0, len(batch["memory"]), self.policy.sequence_length)
-            ]
-            feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
-                self.m_size, batch.num_experiences
-            )
-            feed_dict[self.target_network.memory_in] = self._make_zero_mem(
-                self.m_size // 3, batch.num_experiences
-            )
-        feed_dict[self.dones_holder] = batch["done"]
-        return feed_dict
--- a/ml-agents/mlagents/trainers/tests/test_nn_policy.py
+++ b/ml-agents/mlagents/trainers/tests/test_nn_policy.py
-import pytest
-
-import numpy as np
-from mlagents.tf_utils import tf
-
-import yaml
-
-from mlagents.trainers.common.nn_policy import NNPolicy
-from mlagents.trainers.models import EncoderType, ModelUtils
-from mlagents.trainers.exception import UnityTrainerException
-from mlagents.trainers.brain import BrainParameters, CameraResolution
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-
-
-@pytest.fixture
-def dummy_config():
-    return yaml.safe_load(
-        """
-        trainer: ppo
-        batch_size: 32
-        beta: 5.0e-3
-        buffer_size: 512
-        epsilon: 0.2
-        hidden_units: 128
-        lambd: 0.95
-        learning_rate: 3.0e-4
-        max_steps: 5.0e4
-        normalize: true
-        num_epoch: 5
-        num_layers: 2
-        time_horizon: 64
-        sequence_length: 64
-        summary_freq: 1000
-        use_recurrent: false
-        normalize: true
-        memory_size: 8
-        curiosity_strength: 0.0
-        curiosity_enc_size: 1
-        summary_path: test
-        model_path: test
-        reward_signals:
-          extrinsic:
-            strength: 1.0
-            gamma: 0.99
-        """
-    )
-
-
-VECTOR_ACTION_SPACE = [2]
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 32
-NUM_AGENTS = 12
-
-
-def create_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
-    mock_brain = mb.setup_mock_brain(
-        use_discrete,
-        use_visual,
-        vector_action_space=VECTOR_ACTION_SPACE,
-        vector_obs_space=VECTOR_OBS_SPACE,
-        discrete_action_space=DISCRETE_ACTION_SPACE,
-    )
-
-    trainer_parameters = dummy_config
-    model_path = "testmodel"
-    trainer_parameters["model_path"] = model_path
-    trainer_parameters["keep_checkpoints"] = 3
-    trainer_parameters["use_recurrent"] = use_rnn
-    policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
-    return policy
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_policy_evaluate(dummy_config, rnn, visual, discrete):
-    # Test evaluate
-    tf.reset_default_graph()
-    policy = create_policy_mock(
-        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
-    )
-    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
-
-    run_out = policy.evaluate(step, list(step.agent_id))
-    if discrete:
-        run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
-    else:
-        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
-
-
-def test_normalization(dummy_config):
-    brain_params = BrainParameters(
-        brain_name="test_brain",
-        vector_observation_space_size=1,
-        camera_resolutions=[],
-        vector_action_space_size=[2],
-        vector_action_descriptions=[],
-        vector_action_space_type=0,
-    )
-    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
-    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
-
-    time_horizon = 6
-    trajectory = make_fake_trajectory(
-        length=time_horizon,
-        max_step_complete=True,
-        vec_obs_size=1,
-        num_vis_obs=0,
-        action_space=[2],
-    )
-    # Change half of the obs to 0
-    for i in range(3):
-        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
-    policy = policy = NNPolicy(0, brain_params, dummy_config, False, False)
-
-    trajectory_buffer = trajectory.to_agentbuffer()
-    policy.update_normalization(trajectory_buffer["vector_obs"])
-
-    # Check that the running mean and variance is correct
-    steps, mean, variance = policy.sess.run(
-        [policy.normalization_steps, policy.running_mean, policy.running_variance]
-    )
-
-    assert steps == 6
-    assert mean[0] == 0.5
-    # Note: variance is divided by number of steps, and initialized to 1 to avoid
-    # divide by 0. The right answer is 0.25
-    assert (variance[0] - 1) / steps == 0.25
-
-    # Make another update, this time with all 1's
-    time_horizon = 10
-    trajectory = make_fake_trajectory(
-        length=time_horizon,
-        max_step_complete=True,
-        vec_obs_size=1,
-        num_vis_obs=0,
-        action_space=[2],
-    )
-    trajectory_buffer = trajectory.to_agentbuffer()
-    policy.update_normalization(trajectory_buffer["vector_obs"])
-
-    # Check that the running mean and variance is correct
-    steps, mean, variance = policy.sess.run(
-        [policy.normalization_steps, policy.running_mean, policy.running_variance]
-    )
-
-    assert steps == 16
-    assert mean[0] == 0.8125
-    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
-
-
-def test_min_visual_size():
-    # Make sure each EncoderType has an entry in MIS_RESOLUTION_FOR_ENCODER
-    assert set(ModelUtils.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)
-
-    for encoder_type in EncoderType:
-        with tf.Graph().as_default():
-            good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
-            good_res = CameraResolution(
-                width=good_size, height=good_size, num_channels=3
-            )
-            vis_input = ModelUtils.create_visual_input(good_res, "test_min_visual_size")
-            ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
-            enc_func = ModelUtils.get_encoder_for_type(encoder_type)
-            enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
-
-        # Anything under the min size should raise an exception. If not, decrease the min size!
-        with pytest.raises(Exception):
-            with tf.Graph().as_default():
-                bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
-                bad_res = CameraResolution(
-                    width=bad_size, height=bad_size, num_channels=3
-                )
-                vis_input = ModelUtils.create_visual_input(
-                    bad_res, "test_min_visual_size"
-                )
-
-                with pytest.raises(UnityTrainerException):
-                    # Make sure we'd hit a friendly error during model setup time.
-                    ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
-
-                enc_func = ModelUtils.get_encoder_for_type(encoder_type)
-                enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
-
-
-if __name__ == "__main__":
-    pytest.main()
--- a//ml-agents/mlagents/trainers/common/optimizer.py
+++ b//ml-agents/mlagents/trainers/common/optimizer.py