ml-agents/ml-agents/mlagents/trainers/sac/network.py


								import logging

								from typing import Dict, Optional


								from mlagents.tf_utils import tf


								from mlagents.trainers.models import LearningModel, EncoderType


								LOG_STD_MAX = 2

								LOG_STD_MIN = -20

								EPSILON = 1e-6  # Small value to avoid divide by zero

								DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # Roughly equal to e-greedy 0.05

								CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0  # TODO: Make these an optional hyperparam.


								LOGGER = logging.getLogger("mlagents.trainers")


								POLICY_SCOPE = ""

								TARGET_SCOPE = "target_network"


								class SACNetwork:

								    """

								    Base class for an SAC network. Implements methods for creating the actor and critic heads.

								    """


								    def __init__(

								        self,

								        policy=None,

								        m_size=None,

								        h_size=128,

								        normalize=False,

								        use_recurrent=False,

								        num_layers=2,

								        stream_names=None,

								        vis_encode_type=EncoderType.SIMPLE,

								    ):

								        self.normalize = normalize

								        self.use_recurrent = use_recurrent

								        self.num_layers = num_layers

								        self.stream_names = stream_names

								        self.h_size = h_size

								        self.activ_fn = LearningModel.swish


								        self.sequence_length_ph = tf.placeholder(

								            shape=None, dtype=tf.int32, name="sac_sequence_length"

								        )


								        self.policy_memory_in: Optional[tf.Tensor] = None

								        self.policy_memory_out: Optional[tf.Tensor] = None

								        self.value_memory_in: Optional[tf.Tensor] = None

								        self.value_memory_out: Optional[tf.Tensor] = None

								        self.q1: Optional[tf.Tensor] = None

								        self.q2: Optional[tf.Tensor] = None

								        self.q1_p: Optional[tf.Tensor] = None

								        self.q2_p: Optional[tf.Tensor] = None

								        self.q1_memory_in: Optional[tf.Tensor] = None

								        self.q2_memory_in: Optional[tf.Tensor] = None

								        self.q1_memory_out: Optional[tf.Tensor] = None

								        self.q2_memory_out: Optional[tf.Tensor] = None

								        self.prev_action: Optional[tf.Tensor] = None

								        self.action_masks: Optional[tf.Tensor] = None

								        self.external_action_in: Optional[tf.Tensor] = None

								        self.log_sigma_sq: Optional[tf.Tensor] = None

								        self.entropy: Optional[tf.Tensor] = None

								        self.deterministic_output: Optional[tf.Tensor] = None

								        self.normalized_logprobs: Optional[tf.Tensor] = None

								        self.action_probs: Optional[tf.Tensor] = None

								        self.output_oh: Optional[tf.Tensor] = None

								        self.output_pre: Optional[tf.Tensor] = None


								        self.value_vars = None

								        self.q_vars = None

								        self.critic_vars = None

								        self.policy_vars = None


								        self.q1_heads: Dict[str, tf.Tensor] = None

								        self.q2_heads: Dict[str, tf.Tensor] = None

								        self.q1_pheads: Dict[str, tf.Tensor] = None

								        self.q2_pheads: Dict[str, tf.Tensor] = None


								        self.policy = policy


								    def get_vars(self, scope):

								        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)


								    def join_scopes(self, scope_1, scope_2):

								        """

								        Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't

								        exist, don't add any backslashes)

								        """

								        if not scope_1:

								            return scope_2

								        if not scope_2:

								            return scope_1

								        else:

								            return "/".join(filter(None, [scope_1, scope_2]))


								    def create_value_heads(self, stream_names, hidden_input):

								        """

								        Creates one value estimator head for each reward signal in stream_names.

								        Also creates the node corresponding to the mean of all the value heads in self.value.

								        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.

								        :param stream_names: The list of reward signal names

								        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top

								        of the hidden input.

								        """

								        self.value_heads = {}

								        for name in stream_names:

								            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))

								            self.value_heads[name] = value

								        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)


								    def create_cc_critic(self, hidden_value, scope, create_qs=True):

								        """

								        Creates just the critic network

								        """

								        scope = self.join_scopes(scope, "critic")

								        self.create_sac_value_head(

								            self.stream_names,

								            hidden_value,

								            self.num_layers,

								            self.h_size,

								            self.join_scopes(scope, "value"),

								        )


								        self.value_vars = self.get_vars(self.join_scopes(scope, "value"))

								        if create_qs:

								            hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)

								            hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)

								            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(

								                self.stream_names,

								                hidden_q,

								                self.num_layers,

								                self.h_size,

								                self.join_scopes(scope, "q"),

								            )

								            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(

								                self.stream_names,

								                hidden_qp,

								                self.num_layers,

								                self.h_size,

								                self.join_scopes(scope, "q"),

								                reuse=True,

								            )

								            self.q_vars = self.get_vars(self.join_scopes(scope, "q"))

								        self.critic_vars = self.get_vars(scope)


								    def create_dc_critic(self, hidden_value, scope, create_qs=True):

								        """

								        Creates just the critic network

								        """

								        scope = self.join_scopes(scope, "critic")

								        self.create_sac_value_head(

								            self.stream_names,

								            hidden_value,

								            self.num_layers,

								            self.h_size,

								            self.join_scopes(scope, "value"),

								        )


								        self.value_vars = self.get_vars("/".join([scope, "value"]))


								        if create_qs:

								            self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(

								                self.stream_names,

								                hidden_value,

								                self.num_layers,

								                self.h_size,

								                self.join_scopes(scope, "q"),

								                num_outputs=sum(self.policy.act_size),

								            )

								            self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(

								                self.stream_names,

								                hidden_value,

								                self.num_layers,

								                self.h_size,

								                self.join_scopes(scope, "q"),

								                reuse=True,

								                num_outputs=sum(self.policy.act_size),

								            )

								            self.q_vars = self.get_vars(scope)

								        self.critic_vars = self.get_vars(scope)


								    def create_sac_value_head(

								        self, stream_names, hidden_input, num_layers, h_size, scope

								    ):

								        """

								        Creates one value estimator head for each reward signal in stream_names.

								        Also creates the node corresponding to the mean of all the value heads in self.value.

								        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.

								        :param stream_names: The list of reward signal names

								        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top

								        of the hidden input.

								        :param num_layers: Number of hidden layers for value network

								        :param h_size: size of hidden layers for value network

								        :param scope: TF scope for value network.

								        """

								        with tf.variable_scope(scope):

								            value_hidden = LearningModel.create_vector_observation_encoder(

								                hidden_input, h_size, self.activ_fn, num_layers, "encoder", False

								            )

								            if self.use_recurrent:

								                value_hidden, memory_out = LearningModel.create_recurrent_encoder(

								                    value_hidden,

								                    self.value_memory_in,

								                    self.sequence_length_ph,

								                    name="lstm_value",

								                )

								                self.value_memory_out = memory_out

								            self.create_value_heads(stream_names, value_hidden)


								    def create_q_heads(

								        self,

								        stream_names,

								        hidden_input,

								        num_layers,

								        h_size,

								        scope,

								        reuse=False,

								        num_outputs=1,

								    ):

								        """

								        Creates two q heads for each reward signal in stream_names.

								        Also creates the node corresponding to the mean of all the value heads in self.value.

								        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.

								        :param stream_names: The list of reward signal names

								        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top

								        of the hidden input.

								        :param num_layers: Number of hidden layers for Q network

								        :param h_size: size of hidden layers for Q network

								        :param scope: TF scope for Q network.

								        :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.

								        :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.

								        """

								        with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):

								            q1_hidden = LearningModel.create_vector_observation_encoder(

								                hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse

								            )

								            if self.use_recurrent:

								                q1_hidden, memory_out = LearningModel.create_recurrent_encoder(

								                    q1_hidden,

								                    self.q1_memory_in,

								                    self.sequence_length_ph,

								                    name="lstm_q1",

								                )

								                self.q1_memory_out = memory_out


								            q1_heads = {}

								            for name in stream_names:

								                _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))

								                q1_heads[name] = _q1


								            q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)

								        with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):

								            q2_hidden = LearningModel.create_vector_observation_encoder(

								                hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse

								            )

								            if self.use_recurrent:

								                q2_hidden, memory_out = LearningModel.create_recurrent_encoder(

								                    q2_hidden,

								                    self.q2_memory_in,

								                    self.sequence_length_ph,

								                    name="lstm_q2",

								                )

								                self.q2_memory_out = memory_out


								            q2_heads = {}

								            for name in stream_names:

								                _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))

								                q2_heads[name] = _q2


								            q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)


								        return q1_heads, q2_heads, q1, q2


								class SACTargetNetwork(SACNetwork):

								    """

								    Instantiation for the SAC target network. Only contains a single

								    value estimator and is updated from the Policy Network.

								    """


								    def __init__(

								        self,

								        policy,

								        m_size=None,

								        h_size=128,

								        normalize=False,

								        use_recurrent=False,

								        num_layers=2,

								        stream_names=None,

								        vis_encode_type=EncoderType.SIMPLE,

								    ):

								        super().__init__(

								            policy,

								            m_size,

								            h_size,

								            normalize,

								            use_recurrent,

								            num_layers,

								            stream_names,

								            vis_encode_type,

								        )

								        with tf.variable_scope(TARGET_SCOPE):

								            self.visual_in = LearningModel.create_visual_input_placeholders(

								                policy.brain.camera_resolutions

								            )

								            self.vector_in = LearningModel.create_vector_input(policy.vec_obs_size)

								            if self.policy.normalize:

								                normalization_tensors = LearningModel.create_normalizer(self.vector_in)

								                self.update_normalization_op = normalization_tensors.update_op

								                self.normalization_steps = normalization_tensors.steps

								                self.running_mean = normalization_tensors.running_mean

								                self.running_variance = normalization_tensors.running_variance

								                self.processed_vector_in = LearningModel.normalize_vector_obs(

								                    self.vector_in,

								                    self.running_mean,

								                    self.running_variance,

								                    self.normalization_steps,

								                )

								            else:

								                self.processed_vector_in = self.vector_in

								                self.update_normalization_op = None


								            if self.policy.use_recurrent:

								                self.memory_in = tf.placeholder(

								                    shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"

								                )

								                self.value_memory_in = self.memory_in

								            hidden_streams = LearningModel.create_observation_streams(

								                self.visual_in,

								                self.processed_vector_in,

								                1,

								                self.h_size,

								                0,

								                vis_encode_type=vis_encode_type,

								                stream_scopes=["critic/value/"],

								            )

								        if self.policy.use_continuous_act:

								            self.create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)

								        else:

								            self.create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)

								        if self.use_recurrent:

								            self.memory_out = tf.concat(

								                self.value_memory_out, axis=1

								            )  # Needed for Barracuda to work


								    def copy_normalization(self, mean, variance, steps):

								        """

								        Copies the mean, variance, and steps into the normalizers of the

								        input of this SACNetwork. Used to copy the normalizer from the policy network

								        to the target network.

								        param mean: Tensor containing the mean.

								        param variance: Tensor containing the variance

								        param steps: Tensor containing the number of steps.

								        """

								        update_mean = tf.assign(self.running_mean, mean)

								        update_variance = tf.assign(self.running_variance, variance)

								        update_norm_step = tf.assign(self.normalization_steps, steps)

								        return tf.group([update_mean, update_variance, update_norm_step])


								class SACPolicyNetwork(SACNetwork):

								    """

								    Instantiation for SAC policy network. Contains a dual Q estimator,

								    a value estimator, and a reference to the actual policy network.

								    """


								    def __init__(

								        self,

								        policy,

								        m_size=None,

								        h_size=128,

								        normalize=False,

								        use_recurrent=False,

								        num_layers=2,

								        stream_names=None,

								        vis_encode_type=EncoderType.SIMPLE,

								    ):

								        super().__init__(

								            policy,

								            m_size,

								            h_size,

								            normalize,

								            use_recurrent,

								            num_layers,

								            stream_names,

								            vis_encode_type,

								        )

								        if self.policy.use_recurrent:

								            self.create_memory_ins(m_size)


								        hidden_critic = self.create_observation_in(vis_encode_type)

								        self.policy.output = self.policy.output

								        # Use the sequence length of the policy

								        self.sequence_length_ph = self.policy.sequence_length_ph


								        if self.policy.use_continuous_act:

								            self.create_cc_critic(hidden_critic, POLICY_SCOPE)


								        else:

								            self.create_dc_critic(hidden_critic, POLICY_SCOPE)


								        if self.use_recurrent:

								            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]

								            self.memory_out = tf.concat(mem_outs, axis=1)


								    def create_memory_ins(self, m_size):

								        """

								        Creates the memory input placeholders for LSTM.

								        :param m_size: the total size of the memory.

								        """

								        self.memory_in = tf.placeholder(

								            shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"

								        )


								        # Re-break-up for each network

								        num_mems = 3

								        input_size = self.memory_in.get_shape().as_list()[1]

								        mem_ins = []

								        for i in range(num_mems):

								            _start = input_size // num_mems * i

								            _end = input_size // num_mems * (i + 1)

								            mem_ins.append(self.memory_in[:, _start:_end])

								        self.value_memory_in = mem_ins[0]

								        self.q1_memory_in = mem_ins[1]

								        self.q2_memory_in = mem_ins[2]


								    def create_observation_in(self, vis_encode_type):

								        """

								        Creates the observation inputs, and a CNN if needed,

								        :param vis_encode_type: Type of CNN encoder.

								        :param share_ac_cnn: Whether or not to share the actor and critic CNNs.

								        :return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used

								        once and thrown away.

								        """

								        with tf.variable_scope(POLICY_SCOPE):

								            hidden_streams = LearningModel.create_observation_streams(

								                self.policy.visual_in,

								                self.policy.processed_vector_in,

								                1,

								                self.h_size,

								                0,

								                vis_encode_type=vis_encode_type,

								                stream_scopes=["critic/value/"],

								            )

								        hidden_critic = hidden_streams[0]

								        return hidden_critic