import logging import numpy as np from typing import Dict, List, Optional from mlagents.tf_utils import tf, tf_variance_scaling from mlagents.trainers.models import LearningModel, LearningRateSchedule, EncoderType LOG_STD_MAX = 2 LOG_STD_MIN = -20 EPSILON = 1e-6 # Small value to avoid divide by zero DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05 CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam. LOGGER = logging.getLogger("mlagents.trainers") POLICY_SCOPE = "" TARGET_SCOPE = "target_network" class SACNetwork(LearningModel): """ Base class for an SAC network. Implements methods for creating the actor and critic heads. """ def __init__( self, brain, m_size=None, h_size=128, normalize=False, use_recurrent=False, num_layers=2, stream_names=None, seed=0, vis_encode_type=EncoderType.SIMPLE, ): LearningModel.__init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names ) self.normalize = normalize self.use_recurrent = use_recurrent self.num_layers = num_layers self.stream_names = stream_names self.h_size = h_size self.activ_fn = self.swish self.policy_memory_in: Optional[tf.Tensor] = None self.policy_memory_out: Optional[tf.Tensor] = None self.value_memory_in: Optional[tf.Tensor] = None self.value_memory_out: Optional[tf.Tensor] = None self.q1: Optional[tf.Tensor] = None self.q2: Optional[tf.Tensor] = None self.q1_p: Optional[tf.Tensor] = None self.q2_p: Optional[tf.Tensor] = None self.q1_memory_in: Optional[tf.Tensor] = None self.q2_memory_in: Optional[tf.Tensor] = None self.q1_memory_out: Optional[tf.Tensor] = None self.q2_memory_out: Optional[tf.Tensor] = None self.action_holder: Optional[tf.Tensor] = None self.prev_action: Optional[tf.Tensor] = None self.action_masks: Optional[tf.Tensor] = None self.external_action_in: Optional[tf.Tensor] = None self.log_sigma_sq: Optional[tf.Tensor] = None self.entropy: Optional[tf.Tensor] = None self.deterministic_output: Optional[tf.Tensor] = None self.all_log_probs: Optional[tf.Tensor] = None self.normalized_logprobs: Optional[tf.Tensor] = None self.action_probs: Optional[tf.Tensor] = None self.selected_actions: Optional[tf.Tensor] = None self.output: Optional[tf.Tensor] = None self.output_oh: Optional[tf.Tensor] = None self.output_pre: Optional[tf.Tensor] = None self.value_vars = None self.q_vars = None self.critic_vars = None self.policy_vars = None self.q1_heads: Optional[Dict[str, tf.Tensor]] = None self.q2_heads: Optional[Dict[str, tf.Tensor]] = None self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None def get_vars(self, scope): return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) def join_scopes(self, scope_1, scope_2): """ Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't exist, don't add any backslashes) """ if not scope_1: return scope_2 if not scope_2: return scope_1 else: return "/".join(filter(None, [scope_1, scope_2])) def create_cc_critic(self, hidden_value, scope, create_qs=True): """ Creates just the critic network """ scope = self.join_scopes(scope, "critic") self.create_sac_value_head( self.stream_names, hidden_value, self.num_layers, self.h_size, self.join_scopes(scope, "value"), ) self.value_vars = self.get_vars(self.join_scopes(scope, "value")) if create_qs: hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1) hidden_qp = tf.concat([hidden_value, self.output], axis=-1) self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads( self.stream_names, hidden_q, self.num_layers, self.h_size, self.join_scopes(scope, "q"), ) self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads( self.stream_names, hidden_qp, self.num_layers, self.h_size, self.join_scopes(scope, "q"), reuse=True, ) self.q_vars = self.get_vars(self.join_scopes(scope, "q")) self.critic_vars = self.get_vars(scope) def create_dc_critic(self, hidden_value, scope, create_qs=True): """ Creates just the critic network """ scope = self.join_scopes(scope, "critic") self.create_sac_value_head( self.stream_names, hidden_value, self.num_layers, self.h_size, self.join_scopes(scope, "value"), ) self.value_vars = self.get_vars("/".join([scope, "value"])) if create_qs: self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads( self.stream_names, hidden_value, self.num_layers, self.h_size, self.join_scopes(scope, "q"), num_outputs=sum(self.act_size), ) self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads( self.stream_names, hidden_value, self.num_layers, self.h_size, self.join_scopes(scope, "q"), reuse=True, num_outputs=sum(self.act_size), ) self.q_vars = self.get_vars(scope) self.critic_vars = self.get_vars(scope) def create_cc_actor(self, hidden_policy, scope): """ Creates Continuous control actor for SAC. :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs). :param num_layers: TF scope to assign whatever is created in this block. """ # Create action input (continuous) self.action_holder = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder" ) self.external_action_in = self.action_holder scope = self.join_scopes(scope, "policy") with tf.variable_scope(scope): hidden_policy = self.create_vector_observation_encoder( hidden_policy, self.h_size, self.activ_fn, self.num_layers, "encoder", False, ) if self.use_recurrent: hidden_policy, memory_out = self.create_recurrent_encoder( hidden_policy, self.policy_memory_in, self.sequence_length, name="lstm_policy", ) self.policy_memory_out = memory_out with tf.variable_scope(scope): mu = tf.layers.dense( hidden_policy, self.act_size[0], activation=None, name="mu", kernel_initializer=LearningModel.scaled_init(0.01), ) # Policy-dependent log_sigma_sq log_sigma_sq = tf.layers.dense( hidden_policy, self.act_size[0], activation=None, name="log_std", kernel_initializer=LearningModel.scaled_init(0.01), ) self.log_sigma_sq = tf.clip_by_value(log_sigma_sq, LOG_STD_MIN, LOG_STD_MAX) sigma_sq = tf.exp(self.log_sigma_sq) # Do the reparameterization trick policy_ = mu + tf.random_normal(tf.shape(mu)) * sigma_sq _gauss_pre = -0.5 * ( ((policy_ - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)) ** 2 + 2 * self.log_sigma_sq + np.log(2 * np.pi) ) all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True) self.entropy = tf.reduce_sum( self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1 ) # Squash probabilities # Keep deterministic around in case we want to use it. self.deterministic_output = tf.tanh(mu) # Note that this is just for symmetry with PPO. self.output_pre = tf.tanh(policy_) # Squash correction all_probs -= tf.reduce_sum( tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True ) self.all_log_probs = all_probs self.selected_actions = tf.stop_gradient(self.output_pre) self.action_probs = all_probs # Extract output for Barracuda self.output = tf.identity(self.output_pre, name="action") # Get all policy vars self.policy_vars = self.get_vars(scope) def create_dc_actor(self, hidden_policy, scope): """ Creates Discrete control actor for SAC. :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs). :param num_layers: TF scope to assign whatever is created in this block. """ scope = self.join_scopes(scope, "policy") # Create inputs outside of the scope self.action_masks = tf.placeholder( shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks" ) if self.use_recurrent: self.prev_action = tf.placeholder( shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action" ) with tf.variable_scope(scope): hidden_policy = self.create_vector_observation_encoder( hidden_policy, self.h_size, self.activ_fn, self.num_layers, "encoder", False, ) if self.use_recurrent: prev_action_oh = tf.concat( [ tf.one_hot(self.prev_action[:, i], self.act_size[i]) for i in range(len(self.act_size)) ], axis=1, ) hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1) hidden_policy, memory_out = self.create_recurrent_encoder( hidden_policy, self.policy_memory_in, self.sequence_length, name="lstm_policy", ) self.policy_memory_out = memory_out with tf.variable_scope(scope): policy_branches = [] for size in self.act_size: policy_branches.append( tf.layers.dense( hidden_policy, size, activation=None, use_bias=False, kernel_initializer=tf_variance_scaling(0.01), ) ) all_logits = tf.concat( [branch for branch in policy_branches], axis=1, name="action_probs" ) output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer( all_logits, self.action_masks, self.act_size ) self.action_probs = normalized_probs # Really, this is entropy, but it has an analogous purpose to the log probs in the # continuous case. self.all_log_probs = self.action_probs * normalized_logprobs self.output = output # Create action input (discrete) self.action_holder = tf.placeholder( shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder" ) self.output_oh = tf.concat( [ tf.one_hot(self.action_holder[:, i], self.act_size[i]) for i in range(len(self.act_size)) ], axis=1, ) # For Curiosity and GAIL to retrieve selected actions. We don't # need the mask at this point because it's already stored in the buffer. self.selected_actions = tf.stop_gradient(self.output_oh) self.external_action_in = tf.concat( [ tf.one_hot(self.action_holder[:, i], self.act_size[i]) for i in range(len(self.act_size)) ], axis=1, ) # This is total entropy over all branches self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1) # Extract the normalized logprobs for Barracuda self.normalized_logprobs = tf.identity(normalized_logprobs, name="action") # We kept the LSTMs at a different scope than the rest, so add them if they exist. self.policy_vars = self.get_vars(scope) if self.use_recurrent: self.policy_vars += self.get_vars("lstm") def create_sac_value_head( self, stream_names, hidden_input, num_layers, h_size, scope ): """ Creates one value estimator head for each reward signal in stream_names. Also creates the node corresponding to the mean of all the value heads in self.value. self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. :param stream_names: The list of reward signal names :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top of the hidden input. :param num_layers: Number of hidden layers for value network :param h_size: size of hidden layers for value network :param scope: TF scope for value network. """ with tf.variable_scope(scope): value_hidden = self.create_vector_observation_encoder( hidden_input, h_size, self.activ_fn, num_layers, "encoder", False ) if self.use_recurrent: value_hidden, memory_out = self.create_recurrent_encoder( value_hidden, self.value_memory_in, self.sequence_length, name="lstm_value", ) self.value_memory_out = memory_out self.create_value_heads(stream_names, value_hidden) def create_q_heads( self, stream_names, hidden_input, num_layers, h_size, scope, reuse=False, num_outputs=1, ): """ Creates two q heads for each reward signal in stream_names. Also creates the node corresponding to the mean of all the value heads in self.value. self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. :param stream_names: The list of reward signal names :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top of the hidden input. :param num_layers: Number of hidden layers for Q network :param h_size: size of hidden layers for Q network :param scope: TF scope for Q network. :param reuse: Whether or not to reuse variables. Useful for creating Q of policy. :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions. """ with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse): q1_hidden = self.create_vector_observation_encoder( hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse ) if self.use_recurrent: q1_hidden, memory_out = self.create_recurrent_encoder( q1_hidden, self.q1_memory_in, self.sequence_length, name="lstm_q1" ) self.q1_memory_out = memory_out q1_heads = {} for name in stream_names: _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name)) q1_heads[name] = _q1 q1 = tf.reduce_mean(list(q1_heads.values()), axis=0) with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse): q2_hidden = self.create_vector_observation_encoder( hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse ) if self.use_recurrent: q2_hidden, memory_out = self.create_recurrent_encoder( q2_hidden, self.q2_memory_in, self.sequence_length, name="lstm_q2" ) self.q2_memory_out = memory_out q2_heads = {} for name in stream_names: _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name)) q2_heads[name] = _q2 q2 = tf.reduce_mean(list(q2_heads.values()), axis=0) return q1_heads, q2_heads, q1, q2 def copy_normalization(self, mean, variance, steps): """ Copies the mean, variance, and steps into the normalizers of the input of this SACNetwork. Used to copy the normalizer from the policy network to the target network. param mean: Tensor containing the mean. param variance: Tensor containing the variance param steps: Tensor containing the number of steps. """ update_mean = tf.assign(self.running_mean, mean) update_variance = tf.assign(self.running_variance, variance) update_norm_step = tf.assign(self.normalization_steps, steps) return tf.group([update_mean, update_variance, update_norm_step]) class SACTargetNetwork(SACNetwork): """ Instantiation for the SAC target network. Only contains a single value estimator and is updated from the Policy Network. """ def __init__( self, brain, m_size=None, h_size=128, normalize=False, use_recurrent=False, num_layers=2, stream_names=None, seed=0, vis_encode_type=EncoderType.SIMPLE, ): super().__init__( brain, m_size, h_size, normalize, use_recurrent, num_layers, stream_names, seed, vis_encode_type, ) if self.use_recurrent: self.memory_in = tf.placeholder( shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in" ) self.value_memory_in = self.memory_in with tf.variable_scope(TARGET_SCOPE): hidden_streams = self.create_observation_streams( 1, self.h_size, 0, vis_encode_type=vis_encode_type, stream_scopes=["critic/value/"], ) if brain.vector_action_space_type == "continuous": self.create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False) else: self.create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False) if self.use_recurrent: self.memory_out = tf.concat( self.value_memory_out, axis=1 ) # Needed for Barracuda to work class SACPolicyNetwork(SACNetwork): """ Instantiation for SAC policy network. Contains a dual Q estimator, a value estimator, and the actual policy network. """ def __init__( self, brain, m_size=None, h_size=128, normalize=False, use_recurrent=False, num_layers=2, stream_names=None, seed=0, vis_encode_type=EncoderType.SIMPLE, ): super().__init__( brain, m_size, h_size, normalize, use_recurrent, num_layers, stream_names, seed, vis_encode_type, ) self.share_ac_cnn = False if self.use_recurrent: self.create_memory_ins(self.m_size) hidden_policy, hidden_critic = self.create_observation_ins( vis_encode_type, self.share_ac_cnn ) if brain.vector_action_space_type == "continuous": self.create_cc_actor(hidden_policy, POLICY_SCOPE) self.create_cc_critic(hidden_critic, POLICY_SCOPE) else: self.create_dc_actor(hidden_policy, POLICY_SCOPE) self.create_dc_critic(hidden_critic, POLICY_SCOPE) if self.share_ac_cnn: # Make sure that the policy also contains the CNN self.policy_vars += self.get_vars( self.join_scopes(POLICY_SCOPE, "critic/value/main_graph_0_encoder0") ) if self.use_recurrent: mem_outs = [ self.value_memory_out, self.q1_memory_out, self.q2_memory_out, self.policy_memory_out, ] self.memory_out = tf.concat(mem_outs, axis=1) def create_memory_ins(self, m_size): """ Creates the memory input placeholders for LSTM. :param m_size: the total size of the memory. """ # Create the Policy input separate from the rest # This is so in inference we only have to run the Policy network. # Barracuda will grab the recurrent_in and recurrent_out named tensors. self.inference_memory_in = tf.placeholder( shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in" ) # We assume m_size is divisible by 4 # Create the non-Policy inputs # Use a default placeholder here so nothing has to be provided during # Barracuda inference. Note that the default value is just the tiled input # for the policy, which is thrown away. three_fourths_m_size = m_size * 3 // 4 self.other_memory_in = tf.placeholder_with_default( input=tf.tile(self.inference_memory_in, [1, 3]), shape=[None, three_fourths_m_size], name="other_recurrent_in", ) # Concat and use this as the "placeholder" # for training self.memory_in = tf.concat( [self.other_memory_in, self.inference_memory_in], axis=1 ) # Re-break-up for each network num_mems = 4 mem_ins = [] for i in range(num_mems): _start = m_size // num_mems * i _end = m_size // num_mems * (i + 1) mem_ins.append(self.memory_in[:, _start:_end]) self.value_memory_in = mem_ins[0] self.q1_memory_in = mem_ins[1] self.q2_memory_in = mem_ins[2] self.policy_memory_in = mem_ins[3] def create_observation_ins(self, vis_encode_type, share_ac_cnn): """ Creates the observation inputs, and a CNN if needed, :param vis_encode_type: Type of CNN encoder. :param share_ac_cnn: Whether or not to share the actor and critic CNNs. :return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used once and thrown away. """ if share_ac_cnn: with tf.variable_scope(POLICY_SCOPE): hidden_streams = self.create_observation_streams( 1, self.h_size, 0, vis_encode_type=vis_encode_type, stream_scopes=["critic/value/"], ) hidden_policy = hidden_streams[0] hidden_critic = hidden_streams[0] else: with tf.variable_scope(POLICY_SCOPE): hidden_streams = self.create_observation_streams( 2, self.h_size, 0, vis_encode_type=vis_encode_type, stream_scopes=["policy/", "critic/value/"], ) hidden_policy = hidden_streams[0] hidden_critic = hidden_streams[1] return hidden_policy, hidden_critic class SACModel(LearningModel): def __init__( self, brain, lr=1e-4, lr_schedule=LearningRateSchedule.CONSTANT, h_size=128, init_entcoef=0.1, max_step=5e6, normalize=False, use_recurrent=False, num_layers=2, m_size=None, seed=0, stream_names=None, tau=0.005, gammas=None, vis_encode_type=EncoderType.SIMPLE, ): """ Takes a Unity environment and model-specific hyper-parameters and returns the appropriate PPO agent model for the environment. :param brain: BrainInfo used to generate specific network graph. :param lr: Learning rate. :param lr_schedule: Learning rate decay schedule. :param h_size: Size of hidden layers :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster, set higher to explore more. :return: a sub-class of PPOAgent tailored to the environment. :param max_step: Total number of training steps. :param normalize: Whether to normalize vector observation input. :param use_recurrent: Whether to use an LSTM layer in the network. :param num_layers: Number of hidden layers between encoded input and policy & value layers :param tau: Strength of soft-Q update. :param m_size: Size of brain memory. """ self.tau = tau self.gammas = gammas self.brain = brain self.init_entcoef = init_entcoef if stream_names is None: stream_names = [] # Use to reduce "survivor bonus" when using Curiosity or GAIL. self.use_dones_in_backup = {name: tf.Variable(1.0) for name in stream_names} self.disable_use_dones = { name: self.use_dones_in_backup[name].assign(0.0) for name in stream_names } LearningModel.__init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names ) if num_layers < 1: num_layers = 1 self.target_init_op: List[tf.Tensor] = [] self.target_update_op: List[tf.Tensor] = [] self.update_batch_policy: Optional[tf.Operation] = None self.update_batch_value: Optional[tf.Operation] = None self.update_batch_entropy: Optional[tf.Operation] = None self.policy_network = SACPolicyNetwork( brain=brain, m_size=m_size, h_size=h_size, normalize=normalize, use_recurrent=use_recurrent, num_layers=num_layers, seed=seed, stream_names=stream_names, vis_encode_type=vis_encode_type, ) self.target_network = SACTargetNetwork( brain=brain, m_size=m_size // 4 if m_size else None, h_size=h_size, normalize=normalize, use_recurrent=use_recurrent, num_layers=num_layers, seed=seed, stream_names=stream_names, vis_encode_type=vis_encode_type, ) self.create_inputs_and_outputs() self.learning_rate = self.create_learning_rate( lr_schedule, lr, self.global_step, max_step ) self.create_losses( self.policy_network.q1_heads, self.policy_network.q2_heads, lr, max_step, stream_names, discrete=self.brain.vector_action_space_type == "discrete", ) self.selected_actions = ( self.policy_network.selected_actions ) # For GAIL and other reward signals if normalize: target_update_norm = self.target_network.copy_normalization( self.policy_network.running_mean, self.policy_network.running_variance, self.policy_network.normalization_steps, ) self.update_normalization = tf.group( [self.policy_network.update_normalization, target_update_norm] ) self.running_mean = self.policy_network.running_mean self.running_variance = self.policy_network.running_variance self.normalization_steps = self.policy_network.normalization_steps def create_inputs_and_outputs(self): """ Assign the higher-level SACModel's inputs and outputs to those of its policy or target network. """ self.vector_in = self.policy_network.vector_in self.visual_in = self.policy_network.visual_in self.next_vector_in = self.target_network.vector_in self.next_visual_in = self.target_network.visual_in self.action_holder = self.policy_network.action_holder self.sequence_length = self.policy_network.sequence_length self.next_sequence_length = self.target_network.sequence_length if self.brain.vector_action_space_type == "discrete": self.action_masks = self.policy_network.action_masks else: self.output_pre = self.policy_network.output_pre self.output = self.policy_network.output # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference. self.value = tf.identity( self.policy_network.value, name="value_estimate_unused" ) self.value_heads = self.policy_network.value_heads self.all_log_probs = self.policy_network.all_log_probs self.dones_holder = tf.placeholder( shape=[None], dtype=tf.float32, name="dones_holder" ) # This is just a dummy to get pretraining to work. PPO has this but SAC doesn't. # TODO: Proper input and output specs for models self.epsilon = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon" ) if self.use_recurrent: self.memory_in = self.policy_network.memory_in self.memory_out = self.policy_network.memory_out # For Barracuda self.inference_memory_out = tf.identity( self.policy_network.policy_memory_out, name="recurrent_out" ) if self.brain.vector_action_space_type == "discrete": self.prev_action = self.policy_network.prev_action self.next_memory_in = self.target_network.memory_in def create_losses( self, q1_streams, q2_streams, lr, max_step, stream_names, discrete=False ): """ Creates training-specific Tensorflow ops for SAC models. :param q1_streams: Q1 streams from policy network :param q1_streams: Q2 streams from policy network :param lr: Learning rate :param max_step: Total number of training steps. :param stream_names: List of reward stream names. :param discrete: Whether or not to use discrete action losses. """ if discrete: self.target_entropy = [ DISCRETE_TARGET_ENTROPY_SCALE * np.log(i).astype(np.float32) for i in self.act_size ] else: self.target_entropy = ( -1 * CONTINUOUS_TARGET_ENTROPY_SCALE * np.prod(self.act_size[0]).astype(np.float32) ) self.rewards_holders = {} self.min_policy_qs = {} for i, name in enumerate(stream_names): if discrete: _branched_mpq1 = self.apply_as_branches( self.policy_network.q1_pheads[name] * self.policy_network.action_probs ) branched_mpq1 = tf.stack( [ tf.reduce_sum(_br, axis=1, keep_dims=True) for _br in _branched_mpq1 ] ) _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0) _branched_mpq2 = self.apply_as_branches( self.policy_network.q2_pheads[name] * self.policy_network.action_probs ) branched_mpq2 = tf.stack( [ tf.reduce_sum(_br, axis=1, keep_dims=True) for _br in _branched_mpq2 ] ) _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0) self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean) else: self.min_policy_qs[name] = tf.minimum( self.policy_network.q1_pheads[name], self.policy_network.q2_pheads[name], ) rewards_holder = tf.placeholder( shape=[None], dtype=tf.float32, name="{}_rewards".format(name) ) rewards_holder = tf.placeholder( shape=[None], dtype=tf.float32, name="{}_rewards".format(name) ) self.rewards_holders[name] = rewards_holder q1_losses = [] q2_losses = [] # Multiple q losses per stream expanded_dones = tf.expand_dims(self.dones_holder, axis=-1) for i, name in enumerate(stream_names): _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1) q_backup = tf.stop_gradient( _expanded_rewards + (1.0 - self.use_dones_in_backup[name] * expanded_dones) * self.gammas[i] * self.target_network.value_heads[name] ) if discrete: # We need to break up the Q functions by branch, and update them individually. branched_q1_stream = self.apply_as_branches( self.policy_network.external_action_in * q1_streams[name] ) branched_q2_stream = self.apply_as_branches( self.policy_network.external_action_in * q2_streams[name] ) # Reduce each branch into scalar branched_q1_stream = [ tf.reduce_sum(_branch, axis=1, keep_dims=True) for _branch in branched_q1_stream ] branched_q2_stream = [ tf.reduce_sum(_branch, axis=1, keep_dims=True) for _branch in branched_q2_stream ] q1_stream = tf.reduce_mean(branched_q1_stream, axis=0) q2_stream = tf.reduce_mean(branched_q2_stream, axis=0) else: q1_stream = q1_streams[name] q2_stream = q2_streams[name] _q1_loss = 0.5 * tf.reduce_mean( tf.to_float(self.mask) * tf.squared_difference(q_backup, q1_stream) ) _q2_loss = 0.5 * tf.reduce_mean( tf.to_float(self.mask) * tf.squared_difference(q_backup, q2_stream) ) q1_losses.append(_q1_loss) q2_losses.append(_q2_loss) self.q1_loss = tf.reduce_mean(q1_losses) self.q2_loss = tf.reduce_mean(q2_losses) # Learn entropy coefficient if discrete: # Create a log_ent_coef for each branch self.log_ent_coef = tf.get_variable( "log_ent_coef", dtype=tf.float32, initializer=np.log([self.init_entcoef] * len(self.act_size)).astype( np.float32 ), trainable=True, ) else: self.log_ent_coef = tf.get_variable( "log_ent_coef", dtype=tf.float32, initializer=np.log(self.init_entcoef).astype(np.float32), trainable=True, ) self.ent_coef = tf.exp(self.log_ent_coef) if discrete: # We also have to do a different entropy and target_entropy per branch. branched_log_probs = self.apply_as_branches( self.policy_network.all_log_probs ) branched_ent_sums = tf.stack( [ tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te for _lp, _te in zip(branched_log_probs, self.target_entropy) ], axis=1, ) self.entropy_loss = -tf.reduce_mean( tf.to_float(self.mask) * tf.reduce_mean( self.log_ent_coef * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2), axis=1, ) ) # Same with policy loss, we have to do the loss per branch and average them, # so that larger branches don't get more weight. # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q branched_q_term = self.apply_as_branches( self.policy_network.action_probs * self.policy_network.q1_p ) branched_policy_loss = tf.stack( [ tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True) for i, (_lp, _qt) in enumerate( zip(branched_log_probs, branched_q_term) ) ] ) self.policy_loss = tf.reduce_mean( tf.to_float(self.mask) * tf.squeeze(branched_policy_loss) ) # Do vbackup entropy bonus per branch as well. branched_ent_bonus = tf.stack( [ tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True) for i, _lp in enumerate(branched_log_probs) ] ) value_losses = [] for name in stream_names: v_backup = tf.stop_gradient( self.min_policy_qs[name] - tf.reduce_mean(branched_ent_bonus, axis=0) ) value_losses.append( 0.5 * tf.reduce_mean( tf.to_float(self.mask) * tf.squared_difference( self.policy_network.value_heads[name], v_backup ) ) ) else: self.entropy_loss = -tf.reduce_mean( self.log_ent_coef * tf.to_float(self.mask) * tf.stop_gradient( tf.reduce_sum( self.policy_network.all_log_probs + self.target_entropy, axis=1, keep_dims=True, ) ) ) batch_policy_loss = tf.reduce_mean( self.ent_coef * self.policy_network.all_log_probs - self.policy_network.q1_p, axis=1, ) self.policy_loss = tf.reduce_mean( tf.to_float(self.mask) * batch_policy_loss ) value_losses = [] for name in stream_names: v_backup = tf.stop_gradient( self.min_policy_qs[name] - tf.reduce_sum( self.ent_coef * self.policy_network.all_log_probs, axis=1 ) ) value_losses.append( 0.5 * tf.reduce_mean( tf.to_float(self.mask) * tf.squared_difference( self.policy_network.value_heads[name], v_backup ) ) ) self.value_loss = tf.reduce_mean(value_losses) self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss self.entropy = self.policy_network.entropy def apply_as_branches(self, concat_logits): """ Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per action branch """ action_idx = [0] + list(np.cumsum(self.act_size)) branches_logits = [ concat_logits[:, action_idx[i] : action_idx[i + 1]] for i in range(len(self.act_size)) ] return branches_logits def create_sac_optimizers(self): """ Creates the Adam optimizers and update ops for SAC, including the policy, value, and entropy updates, as well as the target network update. """ policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip( self.target_network.value_vars, self.policy_network.value_vars ) ] LOGGER.debug("value_vars") self.print_all_vars(self.policy_network.value_vars) LOGGER.debug("targvalue_vars") self.print_all_vars(self.target_network.value_vars) LOGGER.debug("critic_vars") self.print_all_vars(self.policy_network.critic_vars) LOGGER.debug("q_vars") self.print_all_vars(self.policy_network.q_vars) LOGGER.debug("policy_vars") self.print_all_vars(self.policy_network.policy_vars) self.target_init_op = [ tf.assign(target, source) for target, source in zip( self.target_network.value_vars, self.policy_network.value_vars ) ] self.update_batch_policy = policy_optimizer.minimize( self.policy_loss, var_list=self.policy_network.policy_vars ) # Make sure policy is updated first, then value, then entropy. with tf.control_dependencies([self.update_batch_policy]): self.update_batch_value = value_optimizer.minimize( self.total_value_loss, var_list=self.policy_network.critic_vars ) # Add entropy coefficient optimization operation with tf.control_dependencies([self.update_batch_value]): self.update_batch_entropy = entropy_optimizer.minimize( self.entropy_loss, var_list=self.log_ent_coef ) def print_all_vars(self, variables): for _var in variables: LOGGER.debug(_var)