SAC CC working

5 年前 · 0ef40c08
--- a/ml-agents/mlagents/trainers/common/nn_policy.py
+++ b/ml-agents/mlagents/trainers/common/nn_policy.py
        is_training: bool,
        load: bool,
        tanh_squash: bool = False,
+        resample: bool = False,
    ):
        """
        Policy for Proximal Policy Optimization Networks.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        with tf.variable_scope("policy"):
-            super().__init__(seed, brain, trainer_params)
+            super().__init__(seed, brain, trainer_params, load)

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
            with self.graph.as_default():
                if self.use_continuous_act:
                    self.create_cc_actor(
-                        h_size, num_layers, vis_encode_type, tanh_squash
+                        h_size, num_layers, vis_encode_type, tanh_squash, resample
                    )
                else:
                    self.create_dc_actor(h_size, num_layers, vis_encode_type)
            self.inference_dict["pre_action"] = self.output_pre
        if self.use_recurrent:
            self.inference_dict["policy_memory_out"] = self.memory_out
-        self.load = load
-
-    def initialize_or_load(self):
-        if self.load:
-            self._load_graph()
-        else:
-            self._initialize_graph()

    @timed
    def evaluate(
        num_layers: int,
        vis_encode_type: EncoderType,
        tanh_squash: bool = False,
+        resample: bool = False,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        sigma = tf.exp(self.log_sigma)

        self.epsilon = tf.random_normal(tf.shape(mu))
-        # Clip and scale output to ensure actions are always within [-1, 1] range.
-        policy_ = mu + sigma * self.epsilon
+
+        sampled_policy = mu + sigma * self.epsilon
+
+        # Stop gradient if we're not doing the resampling trick
+        if not resample:
+            sampled_policy = tf.stop_gradient(sampled_policy)
-            ((tf.stop_gradient(policy_) - mu) / (sigma + EPSILON)) ** 2
+            ((sampled_policy - mu) / (sigma + EPSILON)) ** 2
            + 2 * self.log_sigma
            + np.log(2 * np.pi)
        )
        if tanh_squash:
-            self.output_pre = tf.tanh(policy_)
+            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction
            all_probs -= tf.reduce_sum(
        else:
-            self.output_pre = policy_
+            self.output_pre = sampled_policy
+            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum(
            (tf.identity(self.all_log_probs)), axis=1, keepdims=True
+        )
+
+        self.action_holder = tf.placeholder(
+            shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
        )

    def create_dc_actor(
--- a/ml-agents/mlagents/trainers/optimizer.py
+++ b/ml-agents/mlagents/trainers/optimizer.py
            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
            self.value_heads[name] = value
        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+
+    def _execute_model(self, feed_dict, out_dict):
+        """
+        Executes model.
+        :param feed_dict: Input dictionary mapping nodes to input data.
+        :param out_dict: Output dictionary mapping names to nodes.
+        :return: Dictionary mapping names to input data.
+        """
+        network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
+        run_out = dict(zip(list(out_dict.keys()), network_out))
+        return run_out
--- a/ml-agents/mlagents/trainers/ppo/optimizer.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer.py
            ]
            feed_dict[self.policy.memory_in] = mem_in
        return feed_dict
-
-    def _execute_model(self, feed_dict, out_dict):
-        """
-        Executes model.
-        :param feed_dict: Input dictionary mapping nodes to input data.
-        :param out_dict: Output dictionary mapping names to nodes.
-        :return: Dictionary mapping names to input data.
-        """
-        network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
-        run_out = dict(zip(list(out_dict.keys()), network_out))
-        return run_out
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py

 from mlagents_envs.timers import timed
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.common.nn_policy import NNPolicy
+from mlagents.trainers.sac.optimizer import SACOptimizer
 from mlagents.trainers.rl_trainer import RLTrainer
 from mlagents.trainers.trajectory import Trajectory, SplitObservations
 from mlagents.trainers.brain import BrainParameters
        self._check_param_keys()
        self.load = load
        self.seed = seed
-        self.policy: SACPolicy = None  # type: ignore
+        self.policy: NNPolicy = None  # type: ignore
+        self.optimizer: SACOptimizer = None  # type: ignore

        self.step = 0
        self.train_interval = (
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

        # Get all value estimates for reporting purposes
-        value_estimates = self.policy.get_batched_value_estimates(
+        value_estimates = self.optimizer.get_batched_value_estimates(
-                self.policy.reward_signals[name].value_name, np.mean(v)
+                self.optimizer.reward_signals[name].value_name, np.mean(v)
            )

        # Bootstrap using the last step rather than the bootstrap step if max step is reached.
        )

        if trajectory.done_reached:
-            self._update_end_episode_stats(
-                agent_id, self.get_policy(trajectory.behavior_id)
-            )
+            self._update_end_episode_stats(agent_id, self.optimizer)

    def _is_ready_update(self) -> bool:
        """
            self.update_reward_signals()

    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
-        policy = SACPolicy(
+        policy = NNPolicy(
+            tanh_squash=True,
        )
        for _reward_signal in policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
                    sequence_length=self.policy.sequence_length,
                )
                # Get rewards for each reward
-                for name, signal in self.policy.reward_signals.items():
+                for name, signal in self.optimizer.reward_signals.items():
-                update_stats = self.policy.update(sampled_minibatch, n_sequences)
+                update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
                for stat_name, value in update_stats.items():
                    batch_update_stats[stat_name].append(value)

        for _ in range(num_updates):
            # Get minibatches for reward signal update if needed
            reward_signal_minibatches = {}
-            for name, signal in self.policy.reward_signals.items():
+            for name, signal in self.optimizer.reward_signals.items():
                logger.debug("Updating {} at step {}".format(name, self.step))
                # Some signals don't need a minibatch to be sampled - so we don't!
                if signal.update_dict:
                    )
-            update_stats = self.policy.update_reward_signals(
+            update_stats = self.optimizer.update_reward_signals(
                reward_signal_minibatches, n_sequences
            )
            for stat_name, value in update_stats.items():
                    self.__class__.__name__
                )
            )
-        if not isinstance(policy, SACPolicy):
+        if not isinstance(policy, NNPolicy):
+        self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
+        self.policy.initialize_or_load()
+        for _reward_signal in self.optimizer.reward_signals.keys():
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        """
--- a/ml-agents/mlagents/trainers/tf_policy.py
+++ b/ml-agents/mlagents/trainers/tf_policy.py
        "action_output_shape",
    ]

-    def __init__(self, seed, brain, trainer_parameters):
+    def __init__(self, seed, brain, trainer_parameters, load=False):
        """
        Initialized the policy.
        :param seed: Random seed to use for TensorFlow.
                    )
                )
        self._initialize_tensorflow_references()
+        self.load = load

    def _initialize_graph(self):
        with self.graph.as_default():
                    "--run-id".format(self.model_path)
                )
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
+
+    def initialize_or_load(self):
+        if self.load:
+            self._load_graph()
+        else:
+            self._initialize_graph()

    def evaluate(
        self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]
            self.update_normalization_op: Optional[tf.Operation] = None
            self.value: Optional[tf.Tensor] = None
            self.all_log_probs: Optional[tf.Tensor] = None
+            self.entropy: Optional[tf.Tensor] = None
            self.action_oh: tf.Tensor = None
            self.output_pre: Optional[tf.Tensor] = None
            self.output: Optional[tf.Tensor] = None
--- a/ml-agents/mlagents/trainers/sac/network.py
+++ b/ml-agents/mlagents/trainers/sac/network.py
+import logging
+from typing import Dict, Optional
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.models import LearningModel, EncoderType
+
+LOG_STD_MAX = 2
+LOG_STD_MIN = -20
+EPSILON = 1e-6  # Small value to avoid divide by zero
+DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05
+CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.
+
+LOGGER = logging.getLogger("mlagents.trainers")
+
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACNetwork:
+    """
+    Base class for an SAC network. Implements methods for creating the actor and critic heads.
+    """
+
+    def __init__(
+        self,
+        policy=None,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        self.normalize = normalize
+        self.use_recurrent = use_recurrent
+        self.num_layers = num_layers
+        self.stream_names = stream_names
+        self.h_size = h_size
+        self.activ_fn = LearningModel.swish
+
+        self.sequence_length_ph = tf.placeholder(
+            shape=None, dtype=tf.int32, name="sequence_length"
+        )
+
+        self.policy_memory_in: Optional[tf.Tensor] = None
+        self.policy_memory_out: Optional[tf.Tensor] = None
+        self.value_memory_in: Optional[tf.Tensor] = None
+        self.value_memory_out: Optional[tf.Tensor] = None
+        self.q1: Optional[tf.Tensor] = None
+        self.q2: Optional[tf.Tensor] = None
+        self.q1_p: Optional[tf.Tensor] = None
+        self.q2_p: Optional[tf.Tensor] = None
+        self.q1_memory_in: Optional[tf.Tensor] = None
+        self.q2_memory_in: Optional[tf.Tensor] = None
+        self.q1_memory_out: Optional[tf.Tensor] = None
+        self.q2_memory_out: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.external_action_in: Optional[tf.Tensor] = None
+        self.log_sigma_sq: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.deterministic_output: Optional[tf.Tensor] = None
+        self.normalized_logprobs: Optional[tf.Tensor] = None
+        self.action_probs: Optional[tf.Tensor] = None
+        self.output_oh: Optional[tf.Tensor] = None
+        self.output_pre: Optional[tf.Tensor] = None
+
+        self.value_vars = None
+        self.q_vars = None
+        self.critic_vars = None
+        self.policy_vars = None
+
+        self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
+        self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
+        self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None
+
+        self.policy = policy
+
+    def get_vars(self, scope):
+        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
+
+    def join_scopes(self, scope_1, scope_2):
+        """
+        Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't
+        exist, don't add any backslashes)
+        """
+        if not scope_1:
+            return scope_2
+        if not scope_2:
+            return scope_1
+        else:
+            return "/".join(filter(None, [scope_1, scope_2]))
+
+    def create_value_heads(self, stream_names, hidden_input):
+        """
+        Creates one value estimator head for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        """
+        self.value_heads = {}
+        for name in stream_names:
+            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
+            self.value_heads[name] = value
+        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+
+    def create_cc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
+        if create_qs:
+            hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)
+            hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_q,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+            )
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_qp,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+            )
+            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
+        self.critic_vars = self.get_vars(scope)
+
+    def create_dc_critic(self, hidden_value, scope, create_qs=True):
+        """
+        Creates just the critic network
+        """
+        scope = self.join_scopes(scope, "critic")
+        self.create_sac_value_head(
+            self.stream_names,
+            hidden_value,
+            self.num_layers,
+            self.h_size,
+            self.join_scopes(scope, "value"),
+        )
+
+        self.value_vars = self.get_vars("/".join([scope, "value"]))
+
+        if create_qs:
+            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                num_outputs=sum(self.policy.act_size),
+            )
+            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
+                self.stream_names,
+                hidden_value,
+                self.num_layers,
+                self.h_size,
+                self.join_scopes(scope, "q"),
+                reuse=True,
+                num_outputs=sum(self.policy.act_size),
+            )
+            self.q_vars = self.get_vars(scope)
+        self.critic_vars = self.get_vars(scope)
+
+    def create_sac_value_head(
+        self, stream_names, hidden_input, num_layers, h_size, scope
+    ):
+        """
+        Creates one value estimator head for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        :param num_layers: Number of hidden layers for value network
+        :param h_size: size of hidden layers for value network
+        :param scope: TF scope for value network.
+        """
+        with tf.variable_scope(scope):
+            value_hidden = LearningModel.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
+            )
+            if self.use_recurrent:
+                value_hidden, memory_out = LearningModel.create_recurrent_encoder(
+                    value_hidden,
+                    self.value_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_value",
+                )
+                self.value_memory_out = memory_out
+            self.create_value_heads(stream_names, value_hidden)
+
+    def create_q_heads(
+        self,
+        stream_names,
+        hidden_input,
+        num_layers,
+        h_size,
+        scope,
+        reuse=False,
+        num_outputs=1,
+    ):
+        """
+        Creates two q heads for each reward signal in stream_names.
+        Also creates the node corresponding to the mean of all the value heads in self.value.
+        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
+        :param stream_names: The list of reward signal names
+        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
+        of the hidden input.
+        :param num_layers: Number of hidden layers for Q network
+        :param h_size: size of hidden layers for Q network
+        :param scope: TF scope for Q network.
+        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
+        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
+        """
+        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
+            q1_hidden = LearningModel.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
+            )
+            if self.use_recurrent:
+                q1_hidden, memory_out = LearningModel.create_recurrent_encoder(
+                    q1_hidden,
+                    self.q1_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q1",
+                )
+                self.q1_memory_out = memory_out
+
+            q1_heads = {}
+            for name in stream_names:
+                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
+                q1_heads[name] = _q1
+
+            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
+        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
+            q2_hidden = LearningModel.create_vector_observation_encoder(
+                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
+            )
+            if self.use_recurrent:
+                q2_hidden, memory_out = LearningModel.create_recurrent_encoder(
+                    q2_hidden,
+                    self.q2_memory_in,
+                    self.sequence_length_ph,
+                    name="lstm_q2",
+                )
+                self.q2_memory_out = memory_out
+
+            q2_heads = {}
+            for name in stream_names:
+                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
+                q2_heads[name] = _q2
+
+            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
+
+        return q1_heads, q2_heads, q1, q2
+
+
+class SACTargetNetwork(SACNetwork):
+    """
+    Instantiation for the SAC target network. Only contains a single
+    value estimator and is updated from the Policy Network.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        with tf.variable_scope(TARGET_SCOPE):
+            self.visual_in = LearningModel.create_visual_input_placeholders(
+                policy.brain.camera_resolutions
+            )
+            self.vector_in = LearningModel.create_vector_input(policy.vec_obs_size)
+            if self.policy.normalize:
+                normalization_tensors = LearningModel.create_normalizer(self.vector_in)
+                self.update_normalization_op = normalization_tensors[0]
+                self.normalization_steps = normalization_tensors[1]
+                self.running_mean = normalization_tensors[2]
+                self.running_variance = normalization_tensors[3]
+                self.processed_vector_in = LearningModel.normalize_vector_obs(
+                    self.vector_in,
+                    self.running_mean,
+                    self.running_variance,
+                    self.normalization_steps,
+                )
+            else:
+                self.processed_vector_in = self.vector_in
+                self.update_normalization_op = None
+
+            if self.policy.use_recurrent:
+                self.memory_in = tf.placeholder(
+                    shape=[None, self.policy.m_size],
+                    dtype=tf.float32,
+                    name="recurrent_in",
+                )
+                self.value_memory_in = self.memory_in
+            hidden_streams = LearningModel.create_observation_streams(
+                self.visual_in,
+                self.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["critic/value/"],
+            )
+        if self.policy.use_continuous_act:
+            self.create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        else:
+            self.create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+        if self.use_recurrent:
+            self.memory_out = tf.concat(
+                self.value_memory_out, axis=1
+            )  # Needed for Barracuda to work
+
+    def copy_normalization(self, mean, variance, steps):
+        """
+        Copies the mean, variance, and steps into the normalizers of the
+        input of this SACNetwork. Used to copy the normalizer from the policy network
+        to the target network.
+        param mean: Tensor containing the mean.
+        param variance: Tensor containing the variance
+        param steps: Tensor containing the number of steps.
+        """
+        update_mean = tf.assign(self.running_mean, mean)
+        update_variance = tf.assign(self.running_variance, variance)
+        update_norm_step = tf.assign(self.normalization_steps, steps)
+        return tf.group([update_mean, update_variance, update_norm_step])
+
+
+class SACPolicyNetwork(SACNetwork):
+    """
+    Instantiation for SAC policy network. Contains a dual Q estimator,
+    a value estimator, and the actual policy network.
+    """
+
+    def __init__(
+        self,
+        policy,
+        m_size=None,
+        h_size=128,
+        normalize=False,
+        use_recurrent=False,
+        num_layers=2,
+        stream_names=None,
+        vis_encode_type=EncoderType.SIMPLE,
+    ):
+        super().__init__(
+            policy,
+            m_size,
+            h_size,
+            normalize,
+            use_recurrent,
+            num_layers,
+            stream_names,
+            vis_encode_type,
+        )
+        if self.policy.use_recurrent:
+            self.create_memory_ins(self.policy.m_size)
+
+        hidden_critic = self.create_observation_in(vis_encode_type)
+        self.policy.output = self.policy.output
+
+        if self.policy.use_continuous_act:
+            self.create_cc_critic(hidden_critic, POLICY_SCOPE)
+
+        else:
+            self.create_dc_critic(hidden_critic, POLICY_SCOPE)
+
+        if self.use_recurrent:
+            mem_outs = [
+                self.value_memory_out,
+                self.q1_memory_out,
+                self.q2_memory_out,
+                self.policy_memory_out,
+            ]
+            self.memory_out = tf.concat(mem_outs, axis=1)
+
+    def create_memory_ins(self, m_size):
+        """
+        Creates the memory input placeholders for LSTM.
+        :param m_size: the total size of the memory.
+        """
+        # Create the Policy input separate from the rest
+        # This is so in inference we only have to run the Policy network.
+        # Barracuda will grab the recurrent_in and recurrent_out named tensors.
+        self.inference_memory_in = tf.placeholder(
+            shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in"
+        )
+        # We assume m_size is divisible by 4
+        # Create the non-Policy inputs
+        # Use a default placeholder here so nothing has to be provided during
+        # Barracuda inference. Note that the default value is just the tiled input
+        # for the policy, which is thrown away.
+        three_fourths_m_size = m_size * 3 // 4
+        self.other_memory_in = tf.placeholder_with_default(
+            input=tf.tile(self.inference_memory_in, [1, 3]),
+            shape=[None, three_fourths_m_size],
+            name="other_recurrent_in",
+        )
+
+        # Concat and use this as the "placeholder"
+        # for training
+        self.memory_in = tf.concat(
+            [self.other_memory_in, self.inference_memory_in], axis=1
+        )
+
+        # Re-break-up for each network
+        num_mems = 4
+        mem_ins = []
+        for i in range(num_mems):
+            _start = m_size // num_mems * i
+            _end = m_size // num_mems * (i + 1)
+            mem_ins.append(self.memory_in[:, _start:_end])
+        self.value_memory_in = mem_ins[0]
+        self.q1_memory_in = mem_ins[1]
+        self.q2_memory_in = mem_ins[2]
+        self.policy_memory_in = mem_ins[3]
+
+    def create_observation_in(self, vis_encode_type):
+        """
+        Creates the observation inputs, and a CNN if needed,
+        :param vis_encode_type: Type of CNN encoder.
+        :param share_ac_cnn: Whether or not to share the actor and critic CNNs.
+        :return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
+        once and thrown away.
+        """
+        with tf.variable_scope(POLICY_SCOPE):
+            hidden_streams = LearningModel.create_observation_streams(
+                self.policy.visual_in,
+                self.policy.processed_vector_in,
+                1,
+                self.h_size,
+                0,
+                vis_encode_type=vis_encode_type,
+                stream_scopes=["policy/", "critic/value/"],
+            )
+        hidden_critic = hidden_streams[0]
+        return hidden_critic
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
+import logging
+import numpy as np
+from typing import Dict, List, Optional, Any, Mapping
+
+from mlagents.tf_utils import tf
+
+from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
+from mlagents.trainers.models import LearningRateSchedule, EncoderType, LearningModel
+from mlagents.trainers.optimizer import TFOptimizer
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.buffer import AgentBuffer
+from mlagents_envs.timers import timed
+
+LOG_STD_MAX = 2
+LOG_STD_MIN = -20
+EPSILON = 1e-6  # Small value to avoid divide by zero
+DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05
+CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.
+
+LOGGER = logging.getLogger("mlagents.trainers")
+
+POLICY_SCOPE = ""
+TARGET_SCOPE = "target_network"
+
+
+class SACOptimizer(TFOptimizer):
+    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+        """
+        Takes a Unity environment and model-specific hyper-parameters and returns the
+        appropriate PPO agent model for the environment.
+        :param brain: Brain parameters used to generate specific network graph.
+        :param lr: Learning rate.
+        :param lr_schedule: Learning rate decay schedule.
+        :param h_size: Size of hidden layers
+        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
+            set higher to explore more.
+        :return: a sub-class of PPOAgent tailored to the environment.
+        :param max_step: Total number of training steps.
+        :param normalize: Whether to normalize vector observation input.
+        :param use_recurrent: Whether to use an LSTM layer in the network.
+        :param num_layers: Number of hidden layers between encoded input and policy & value layers
+        :param tau: Strength of soft-Q update.
+        :param m_size: Size of brain memory.
+        """
+        with policy.graph.as_default():
+            with tf.variable_scope(""):
+                super().__init__(policy, trainer_params)
+                lr = float(trainer_params["learning_rate"])
+                lr_schedule = LearningRateSchedule(
+                    trainer_params.get("learning_rate_schedule", "constant")
+                )
+                self.policy = policy
+                self.act_size = self.policy.act_size
+                h_size = int(trainer_params["hidden_units"])
+                max_step = int(trainer_params["max_steps"])
+                num_layers = int(trainer_params["num_layers"])
+                vis_encode_type = EncoderType(
+                    trainer_params.get("vis_encode_type", "simple")
+                )
+                self.tau = trainer_params.get("tau", 0.005)
+                m_size = self.policy.m_size
+                self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
+                stream_names = self.reward_signals.keys()
+                # Use to reduce "survivor bonus" when using Curiosity or GAIL.
+                self.gammas = [
+                    _val["gamma"] for _val in trainer_params["reward_signals"].values()
+                ]
+                self.use_dones_in_backup = {
+                    name: tf.Variable(1.0) for name in stream_names
+                }
+                self.disable_use_dones = {
+                    name: self.use_dones_in_backup[name].assign(0.0)
+                    for name in stream_names
+                }
+
+                if num_layers < 1:
+                    num_layers = 1
+
+                self.target_init_op: List[tf.Tensor] = []
+                self.target_update_op: List[tf.Tensor] = []
+                self.update_batch_policy: Optional[tf.Operation] = None
+                self.update_batch_value: Optional[tf.Operation] = None
+                self.update_batch_entropy: Optional[tf.Operation] = None
+
+                self.policy_network = SACPolicyNetwork(
+                    policy=self.policy,
+                    m_size=m_size,
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                self.target_network = SACTargetNetwork(
+                    policy=self.policy,
+                    m_size=m_size // 4 if m_size else None,
+                    h_size=h_size,
+                    normalize=self.policy.normalize,
+                    use_recurrent=self.policy.use_recurrent,
+                    num_layers=num_layers,
+                    stream_names=stream_names,
+                    vis_encode_type=vis_encode_type,
+                )
+                self.create_inputs_and_outputs()
+                self.learning_rate = LearningModel.create_learning_rate(
+                    lr_schedule, lr, self.policy.global_step, max_step
+                )
+                self.create_losses(
+                    self.policy_network.q1_heads,
+                    self.policy_network.q2_heads,
+                    lr,
+                    max_step,
+                    stream_names,
+                    discrete=not self.policy.use_continuous_act,
+                )
+                self.create_sac_optimizers()
+
+                self.selected_actions = (
+                    self.policy.selected_actions
+                )  # For GAIL and other reward signals
+                if self.policy.normalize:
+                    target_update_norm = self.target_network.copy_normalization(
+                        self.policy.running_mean,
+                        self.policy.running_variance,
+                        self.policy.normalization_steps,
+                    )
+                    # Update the normalization of the optimizer when the policy does.
+                    self.policy.update_normalization_op = tf.group(
+                        [self.policy.update_normalization_op, target_update_norm]
+                    )
+
+        self.stats_name_to_update_name = {
+            "Losses/Value Loss": "value_loss",
+            "Losses/Policy Loss": "policy_loss",
+            "Losses/Q1 Loss": "q1_loss",
+            "Losses/Q2 Loss": "q2_loss",
+            "Policy/Entropy Coeff": "entropy_coef",
+        }
+
+        self.update_dict = {
+            "value_loss": self.total_value_loss,
+            "policy_loss": self.policy_loss,
+            "q1_loss": self.q1_loss,
+            "q2_loss": self.q2_loss,
+            "entropy_coef": self.ent_coef,
+            "entropy": self.policy.entropy,
+            "update_batch": self.update_batch_policy,
+            "update_value": self.update_batch_value,
+            "update_entropy": self.update_batch_entropy,
+        }
+
+        # Add some stuff to inference dict from optimizer
+        self.policy.inference_dict["learning_rate"] = self.learning_rate
+        if self.policy.use_recurrent:
+            self.policy.inference_dict["optimizer_memory_out"] = self.memory_out
+
+    def create_inputs_and_outputs(self):
+        """
+        Assign the higher-level SACModel's inputs and outputs to those of its policy or
+        target network.
+        """
+        self.vector_in = self.policy.vector_in
+        self.visual_in = self.policy.visual_in
+        self.next_vector_in = self.target_network.vector_in
+        self.next_visual_in = self.target_network.visual_in
+        self.action_holder = self.policy.action_holder
+        self.sequence_length_ph = self.policy.sequence_length_ph
+        self.next_sequence_length_ph = self.target_network.sequence_length_ph
+        if not self.policy.use_continuous_act:
+            self.action_masks = self.policy_network.action_masks
+        else:
+            self.output_pre = self.policy_network.output_pre
+
+        # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
+        self.value = tf.identity(
+            self.policy_network.value, name="value_estimate_unused"
+        )
+        self.value_heads = self.policy_network.value_heads
+        self.all_log_probs = self.policy.all_log_probs
+        self.dones_holder = tf.placeholder(
+            shape=[None], dtype=tf.float32, name="dones_holder"
+        )
+        # This is just a dummy to get BC to work. PPO has this but SAC doesn't.
+        # TODO: Proper input and output specs for models
+        self.epsilon = tf.placeholder(
+            shape=[None, self.policy.act_size[0]], dtype=tf.float32, name="epsilon"
+        )
+
+        if self.policy.use_recurrent:
+            self.memory_in = self.policy_network.memory_in
+            self.memory_out = self.policy_network.memory_out
+
+            # For Barracuda
+            self.inference_memory_out = tf.identity(
+                self.policy_network.policy_memory_out, name="recurrent_out"
+            )
+
+            if not self.policy.use_continuous_act:
+                self.prev_action = self.policy_network.prev_action
+            self.next_memory_in = self.target_network.memory_in
+
+    def create_losses(
+        self, q1_streams, q2_streams, lr, max_step, stream_names, discrete=False
+    ):
+        """
+        Creates training-specific Tensorflow ops for SAC models.
+        :param q1_streams: Q1 streams from policy network
+        :param q1_streams: Q2 streams from policy network
+        :param lr: Learning rate
+        :param max_step: Total number of training steps.
+        :param stream_names: List of reward stream names.
+        :param discrete: Whether or not to use discrete action losses.
+        """
+
+        if discrete:
+            self.target_entropy = [
+                DISCRETE_TARGET_ENTROPY_SCALE * np.log(i).astype(np.float32)
+                for i in self.act_size
+            ]
+        else:
+            self.target_entropy = (
+                -1
+                * CONTINUOUS_TARGET_ENTROPY_SCALE
+                * np.prod(self.act_size[0]).astype(np.float32)
+            )
+
+        self.rewards_holders = {}
+        self.min_policy_qs = {}
+
+        for name in stream_names:
+            if discrete:
+                _branched_mpq1 = self.apply_as_branches(
+                    self.policy_network.q1_pheads[name] * self.policy.action_probs
+                )
+                branched_mpq1 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keep_dims=True)
+                        for _br in _branched_mpq1
+                    ]
+                )
+                _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
+
+                _branched_mpq2 = self.apply_as_branches(
+                    self.policy_network.q2_pheads[name] * self.policy.action_probs
+                )
+                branched_mpq2 = tf.stack(
+                    [
+                        tf.reduce_sum(_br, axis=1, keep_dims=True)
+                        for _br in _branched_mpq2
+                    ]
+                )
+                _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
+
+                self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
+            else:
+                self.min_policy_qs[name] = tf.minimum(
+                    self.policy_network.q1_pheads[name],
+                    self.policy_network.q2_pheads[name],
+                )
+
+            rewards_holder = tf.placeholder(
+                shape=[None], dtype=tf.float32, name="{}_rewards".format(name)
+            )
+            self.rewards_holders[name] = rewards_holder
+
+        q1_losses = []
+        q2_losses = []
+        # Multiple q losses per stream
+        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+        for i, name in enumerate(stream_names):
+            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+
+            q_backup = tf.stop_gradient(
+                _expanded_rewards
+                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+                * self.gammas[i]
+                * self.target_network.value_heads[name]
+            )
+
+            if discrete:
+                # We need to break up the Q functions by branch, and update them individually.
+                branched_q1_stream = self.apply_as_branches(
+                    self.policy.action_oh * q1_streams[name]
+                )
+                branched_q2_stream = self.apply_as_branches(
+                    self.policy.action_oh * q2_streams[name]
+                )
+
+                # Reduce each branch into scalar
+                branched_q1_stream = [
+                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
+                    for _branch in branched_q1_stream
+                ]
+                branched_q2_stream = [
+                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
+                    for _branch in branched_q2_stream
+                ]
+
+                q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
+                q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
+
+            else:
+                q1_stream = q1_streams[name]
+                q2_stream = q2_streams[name]
+
+            _q1_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q1_stream)
+            )
+
+            _q2_loss = 0.5 * tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.squared_difference(q_backup, q2_stream)
+            )
+
+            q1_losses.append(_q1_loss)
+            q2_losses.append(_q2_loss)
+
+        self.q1_loss = tf.reduce_mean(q1_losses)
+        self.q2_loss = tf.reduce_mean(q2_losses)
+
+        # Learn entropy coefficient
+        if discrete:
+            # Create a log_ent_coef for each branch
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
+                    np.float32
+                ),
+                trainable=True,
+            )
+        else:
+            self.log_ent_coef = tf.get_variable(
+                "log_ent_coef",
+                dtype=tf.float32,
+                initializer=np.log(self.init_entcoef).astype(np.float32),
+                trainable=True,
+            )
+
+        self.ent_coef = tf.exp(self.log_ent_coef)
+        if discrete:
+            # We also have to do a different entropy and target_entropy per branch.
+            branched_log_probs = self.apply_as_branches(self.policy.all_log_probs)
+            branched_ent_sums = tf.stack(
+                [
+                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
+                    for _lp, _te in zip(branched_log_probs, self.target_entropy)
+                ],
+                axis=1,
+            )
+            self.entropy_loss = -tf.reduce_mean(
+                tf.to_float(self.policy.mask)
+                * tf.reduce_mean(
+                    self.log_ent_coef
+                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+                    axis=1,
+                )
+            )
+
+            # Same with policy loss, we have to do the loss per branch and average them,
+            # so that larger branches don't get more weight.
+            # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
+            branched_q_term = self.apply_as_branches(
+                self.policy_network.action_probs * self.policy_network.q1_p
+            )
+
+            branched_policy_loss = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
+                    for i, (_lp, _qt) in enumerate(
+                        zip(branched_log_probs, branched_q_term)
+                    )
+                ]
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
+            )
+
+            # Do vbackup entropy bonus per branch as well.
+            branched_ent_bonus = tf.stack(
+                [
+                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
+                    for i, _lp in enumerate(branched_log_probs)
+                ]
+            )
+            value_losses = []
+            for name in stream_names:
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_mean(branched_ent_bonus, axis=0)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+
+        else:
+            self.entropy_loss = -tf.reduce_mean(
+                self.log_ent_coef
+                * tf.to_float(self.policy.mask)
+                * tf.stop_gradient(
+                    tf.reduce_sum(
+                        self.policy.all_log_probs + self.target_entropy,
+                        axis=1,
+                        keep_dims=True,
+                    )
+                )
+            )
+            batch_policy_loss = tf.reduce_mean(
+                self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
+                axis=1,
+            )
+            self.policy_loss = tf.reduce_mean(
+                tf.to_float(self.policy.mask) * batch_policy_loss
+            )
+
+            value_losses = []
+            for name in stream_names:
+                v_backup = tf.stop_gradient(
+                    self.min_policy_qs[name]
+                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+                )
+                value_losses.append(
+                    0.5
+                    * tf.reduce_mean(
+                        tf.to_float(self.policy.mask)
+                        * tf.squared_difference(
+                            self.policy_network.value_heads[name], v_backup
+                        )
+                    )
+                )
+        self.value_loss = tf.reduce_mean(value_losses)
+
+        self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
+
+        self.entropy = self.policy_network.entropy
+
+    def apply_as_branches(self, concat_logits):
+        """
+        Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per
+        action branch
+        """
+        action_idx = [0] + list(np.cumsum(self.act_size))
+        branches_logits = [
+            concat_logits[:, action_idx[i] : action_idx[i + 1]]
+            for i in range(len(self.act_size))
+        ]
+        return branches_logits
+
+    def create_sac_optimizers(self):
+        """
+        Creates the Adam optimizers and update ops for SAC, including
+        the policy, value, and entropy updates, as well as the target network update.
+        """
+        policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+        value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+
+        self.target_update_op = [
+            tf.assign(target, (1 - self.tau) * target + self.tau * source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+        LOGGER.debug("value_vars")
+        self.print_all_vars(self.policy_network.value_vars)
+        LOGGER.debug("targvalue_vars")
+        self.print_all_vars(self.target_network.value_vars)
+        LOGGER.debug("critic_vars")
+        self.print_all_vars(self.policy_network.critic_vars)
+        LOGGER.debug("q_vars")
+        self.print_all_vars(self.policy_network.q_vars)
+        LOGGER.debug("policy_vars")
+        policy_vars = tf.get_collection(
+            tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
+        )
+        self.print_all_vars(policy_vars)
+
+        self.target_init_op = [
+            tf.assign(target, source)
+            for target, source in zip(
+                self.target_network.value_vars, self.policy_network.value_vars
+            )
+        ]
+
+        self.update_batch_policy = policy_optimizer.minimize(
+            self.policy_loss, var_list=policy_vars
+        )
+
+        # Make sure policy is updated first, then value, then entropy.
+        with tf.control_dependencies([self.update_batch_policy]):
+            self.update_batch_value = value_optimizer.minimize(
+                self.total_value_loss, var_list=self.policy_network.critic_vars
+            )
+            # Add entropy coefficient optimization operation
+            with tf.control_dependencies([self.update_batch_value]):
+                self.update_batch_entropy = entropy_optimizer.minimize(
+                    self.entropy_loss, var_list=self.log_ent_coef
+                )
+
+    def print_all_vars(self, variables):
+        for _var in variables:
+            LOGGER.debug(_var)
+
+    @timed
+    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+        """
+        Updates model using buffer.
+        :param num_sequences: Number of trajectories in batch.
+        :param batch: Experience mini-batch.
+        :param update_target: Whether or not to update target value network
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        :return: Output from update process.
+        """
+        feed_dict = self.construct_feed_dict(self.policy, batch, num_sequences)
+        stats_needed = self.stats_name_to_update_name
+        update_stats: Dict[str, float] = {}
+        update_vals = self._execute_model(feed_dict, self.update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        # Update target network. By default, target update happens at every policy update.
+        self.sess.run(self.target_update_op)
+        return update_stats
+
+    def update_reward_signals(
+        self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
+    ) -> Dict[str, float]:
+        """
+        Only update the reward signals.
+        :param reward_signal_batches: Minibatches to use for updating the reward signals,
+            indexed by name. If none, don't update the reward signals.
+        """
+        # Collect feed dicts for all reward signals.
+        feed_dict: Dict[tf.Tensor, Any] = {}
+        update_dict: Dict[str, tf.Tensor] = {}
+        update_stats: Dict[str, float] = {}
+        stats_needed: Dict[str, str] = {}
+        if reward_signal_minibatches:
+            self.add_reward_signal_dicts(
+                feed_dict,
+                update_dict,
+                stats_needed,
+                reward_signal_minibatches,
+                num_sequences,
+            )
+        update_vals = self._execute_model(feed_dict, update_dict)
+        for stat_name, update_name in stats_needed.items():
+            update_stats[stat_name] = update_vals[update_name]
+        return update_stats
+
+    def add_reward_signal_dicts(
+        self,
+        feed_dict: Dict[tf.Tensor, Any],
+        update_dict: Dict[str, tf.Tensor],
+        stats_needed: Dict[str, str],
+        reward_signal_minibatches: Mapping[str, Dict],
+        num_sequences: int,
+    ) -> None:
+        """
+        Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
+        :param feed_dict: Feed dict needed update
+        :param update_dit: Update dict that needs update
+        :param stats_needed: Stats needed to get from the update.
+        :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
+            indexed by name.
+        """
+        for name, r_batch in reward_signal_minibatches.items():
+            feed_dict.update(
+                self.reward_signals[name].prepare_update(
+                    self.policy, r_batch, num_sequences
+                )
+            )
+            update_dict.update(self.reward_signals[name].update_dict)
+            stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
+
+    def construct_feed_dict(
+        self, policy: TFPolicy, batch: Dict[str, Any], num_sequences: int
+    ) -> Dict[tf.Tensor, Any]:
+        """
+        Builds the feed dict for updating the SAC model.
+        :param model: The model to update. May be different when, e.g. using multi-GPU.
+        :param batch: Mini-batch to use to update.
+        :param num_sequences: Number of LSTM sequences in batch.
+        """
+        feed_dict = {
+            policy.batch_size_ph: num_sequences,
+            policy.sequence_length_ph: self.policy.sequence_length,
+            self.next_sequence_length_ph: self.policy.sequence_length,
+            self.policy.mask_input: batch["masks"],
+        }
+        for name in self.reward_signals:
+            feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]
+
+        if self.policy.use_continuous_act:
+            feed_dict[policy.action_holder] = batch["actions"]
+        else:
+            feed_dict[policy.action_holder] = batch["actions"]
+            if self.policy.use_recurrent:
+                feed_dict[policy.prev_action] = batch["prev_action"]
+            feed_dict[policy.action_masks] = batch["action_mask"]
+        if self.policy.use_vec_obs:
+            feed_dict[policy.vector_in] = batch["vector_obs"]
+            feed_dict[self.next_vector_in] = batch["next_vector_in"]
+        if self.policy.vis_obs_size > 0:
+            for i, _ in enumerate(policy.visual_in):
+                _obs = batch["visual_obs%d" % i]
+                feed_dict[policy.visual_in[i]] = _obs
+            for i, _ in enumerate(self.next_visual_in):
+                _obs = batch["next_visual_obs%d" % i]
+                feed_dict[self.next_visual_in[i]] = _obs
+        if self.policy.use_recurrent:
+            mem_in = [
+                batch["memory"][i]
+                for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+            ]
+            # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
+            offset = 1 if self.policy.sequence_length > 1 else 0
+            next_mem_in = [
+                batch["memory"][i][
+                    : self.policy.m_size // 4
+                ]  # only pass value part of memory to target network
+                for i in range(
+                    offset, len(batch["memory"]), self.policy.sequence_length
+                )
+            ]
+            feed_dict[policy.memory_in] = mem_in
+            feed_dict[self.next_memory_in] = next_mem_in
+        feed_dict[self.dones_holder] = batch["done"]
+        return feed_dict