import logging from enum import Enum from typing import Callable, Dict, List, Optional import numpy as np from mlagents.tf_utils import tf from mlagents.trainers.trainer import UnityTrainerException from mlagents.trainers.brain import CameraResolution logger = logging.getLogger("mlagents.trainers") ActivationFunction = Callable[[tf.Tensor], tf.Tensor] EPSILON = 1e-7 class EncoderType(Enum): SIMPLE = "simple" NATURE_CNN = "nature_cnn" RESNET = "resnet" class LearningRateSchedule(Enum): CONSTANT = "constant" LINEAR = "linear" class LearningModel(object): _version_number_ = 2 def __init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names=None ): tf.set_random_seed(seed) self.brain = brain self.vector_in = None self.global_step, self.increment_step, self.steps_to_increment = ( self.create_global_steps() ) self.visual_in = [] self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size") self.sequence_length = tf.placeholder( shape=None, dtype=tf.int32, name="sequence_length" ) self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks") self.mask = tf.cast(self.mask_input, tf.int32) self.stream_names = stream_names or [] self.use_recurrent = use_recurrent if self.use_recurrent: self.m_size = m_size else: self.m_size = 0 self.normalize = normalize self.act_size = brain.vector_action_space_size self.vec_obs_size = brain.vector_observation_space_size self.vis_obs_size = brain.number_visual_observations tf.Variable( int(brain.vector_action_space_type == "continuous"), name="is_continuous_control", trainable=False, dtype=tf.int32, ) tf.Variable( self._version_number_, name="version_number", trainable=False, dtype=tf.int32, ) tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32) if brain.vector_action_space_type == "continuous": tf.Variable( self.act_size[0], name="action_output_shape", trainable=False, dtype=tf.int32, ) else: tf.Variable( sum(self.act_size), name="action_output_shape", trainable=False, dtype=tf.int32, ) self.value_heads: Dict[str, tf.Tensor] = {} self.normalization_steps: Optional[tf.Variable] = None self.running_mean: Optional[tf.Variable] = None self.running_variance: Optional[tf.Variable] = None self.update_normalization: Optional[tf.Operation] = None self.value: Optional[tf.Tensor] = None self.all_log_probs: Optional[tf.Tensor] = None self.output: Optional[tf.Tensor] = None self.selected_actions: Optional[tf.Tensor] = None self.action_holder: Optional[tf.Tensor] = None @staticmethod def create_global_steps(): """Creates TF ops to track and increment global training step.""" global_step = tf.Variable( 0, name="global_step", trainable=False, dtype=tf.int32 ) steps_to_increment = tf.placeholder( shape=[], dtype=tf.int32, name="steps_to_increment" ) increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment)) return global_step, increment_step, steps_to_increment @staticmethod def create_learning_rate( lr_schedule: LearningRateSchedule, lr: float, global_step: tf.Tensor, max_step: int, ) -> tf.Tensor: if lr_schedule == LearningRateSchedule.CONSTANT: learning_rate = tf.Variable(lr) elif lr_schedule == LearningRateSchedule.LINEAR: learning_rate = tf.train.polynomial_decay( lr, global_step, max_step, 1e-10, power=1.0 ) else: raise UnityTrainerException( "The learning rate schedule {} is invalid.".format(lr_schedule) ) return learning_rate @staticmethod def scaled_init(scale): return tf.initializers.variance_scaling(scale) @staticmethod def swish(input_activation: tf.Tensor) -> tf.Tensor: """Swish activation function. For more info: https://arxiv.org/abs/1710.05941""" return tf.multiply(input_activation, tf.nn.sigmoid(input_activation)) @staticmethod def create_visual_input( camera_parameters: CameraResolution, name: str ) -> tf.Tensor: """ Creates image input op. :param camera_parameters: Parameters for visual observation from BrainInfo. :param name: Desired name of input op. :return: input op. """ o_size_h = camera_parameters.height o_size_w = camera_parameters.width c_channels = camera_parameters.num_channels visual_in = tf.placeholder( shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32, name=name ) return visual_in def create_vector_input(self, name="vector_observation"): """ Creates ops for vector observation input. :param name: Name of the placeholder op. :param vec_obs_size: Size of stacked vector observation. :return: """ self.vector_in = tf.placeholder( shape=[None, self.vec_obs_size], dtype=tf.float32, name=name ) if self.normalize: self.create_normalizer(self.vector_in) return self.normalize_vector_obs(self.vector_in) else: return self.vector_in def normalize_vector_obs(self, vector_obs): normalized_state = tf.clip_by_value( (vector_obs - self.running_mean) / tf.sqrt( self.running_variance / (tf.cast(self.normalization_steps, tf.float32) + 1) ), -5, 5, name="normalized_state", ) return normalized_state def create_normalizer(self, vector_obs): self.normalization_steps = tf.get_variable( "normalization_steps", [], trainable=False, dtype=tf.int32, initializer=tf.ones_initializer(), ) self.running_mean = tf.get_variable( "running_mean", [self.vec_obs_size], trainable=False, dtype=tf.float32, initializer=tf.zeros_initializer(), ) self.running_variance = tf.get_variable( "running_variance", [self.vec_obs_size], trainable=False, dtype=tf.float32, initializer=tf.ones_initializer(), ) self.update_normalization = self.create_normalizer_update(vector_obs) def create_normalizer_update(self, vector_input): mean_current_observation = tf.reduce_mean(vector_input, axis=0) new_mean = self.running_mean + ( mean_current_observation - self.running_mean ) / tf.cast(tf.add(self.normalization_steps, 1), tf.float32) new_variance = self.running_variance + (mean_current_observation - new_mean) * ( mean_current_observation - self.running_mean ) update_mean = tf.assign(self.running_mean, new_mean) update_variance = tf.assign(self.running_variance, new_variance) update_norm_step = tf.assign( self.normalization_steps, self.normalization_steps + 1 ) return tf.group([update_mean, update_variance, update_norm_step]) @staticmethod def create_vector_observation_encoder( observation_input: tf.Tensor, h_size: int, activation: ActivationFunction, num_layers: int, scope: str, reuse: bool, ) -> tf.Tensor: """ Builds a set of hidden state encoders. :param reuse: Whether to re-use the weights within the same scope. :param scope: Graph scope for the encoder ops. :param observation_input: Input vector. :param h_size: Hidden layer size. :param activation: What type of activation function to use for layers. :param num_layers: number of hidden layers to create. :return: List of hidden layer tensors. """ with tf.variable_scope(scope): hidden = observation_input for i in range(num_layers): hidden = tf.layers.dense( hidden, h_size, activation=activation, reuse=reuse, name="hidden_{}".format(i), kernel_initializer=tf.initializers.variance_scaling(1.0), ) return hidden @staticmethod def create_visual_observation_encoder( image_input: tf.Tensor, h_size: int, activation: ActivationFunction, num_layers: int, scope: str, reuse: bool, ) -> tf.Tensor: """ Builds a set of resnet visual encoders. :param image_input: The placeholder for the image input to use. :param h_size: Hidden layer size. :param activation: What type of activation function to use for layers. :param num_layers: number of hidden layers to create. :param scope: The scope of the graph within which to create the ops. :param reuse: Whether to re-use the weights within the same scope. :return: List of hidden layer tensors. """ with tf.variable_scope(scope): conv1 = tf.layers.conv2d( image_input, 16, kernel_size=[8, 8], strides=[4, 4], activation=tf.nn.elu, reuse=reuse, name="conv_1", ) conv2 = tf.layers.conv2d( conv1, 32, kernel_size=[4, 4], strides=[2, 2], activation=tf.nn.elu, reuse=reuse, name="conv_2", ) hidden = tf.layers.flatten(conv2) with tf.variable_scope(scope + "/" + "flat_encoding"): hidden_flat = LearningModel.create_vector_observation_encoder( hidden, h_size, activation, num_layers, scope, reuse ) return hidden_flat @staticmethod def create_nature_cnn_visual_observation_encoder( image_input: tf.Tensor, h_size: int, activation: ActivationFunction, num_layers: int, scope: str, reuse: bool, ) -> tf.Tensor: """ Builds a set of resnet visual encoders. :param image_input: The placeholder for the image input to use. :param h_size: Hidden layer size. :param activation: What type of activation function to use for layers. :param num_layers: number of hidden layers to create. :param scope: The scope of the graph within which to create the ops. :param reuse: Whether to re-use the weights within the same scope. :return: List of hidden layer tensors. """ with tf.variable_scope(scope): conv1 = tf.layers.conv2d( image_input, 32, kernel_size=[8, 8], strides=[4, 4], activation=tf.nn.elu, reuse=reuse, name="conv_1", ) conv2 = tf.layers.conv2d( conv1, 64, kernel_size=[4, 4], strides=[2, 2], activation=tf.nn.elu, reuse=reuse, name="conv_2", ) conv3 = tf.layers.conv2d( conv2, 64, kernel_size=[3, 3], strides=[1, 1], activation=tf.nn.elu, reuse=reuse, name="conv_3", ) hidden = tf.layers.flatten(conv3) with tf.variable_scope(scope + "/" + "flat_encoding"): hidden_flat = LearningModel.create_vector_observation_encoder( hidden, h_size, activation, num_layers, scope, reuse ) return hidden_flat @staticmethod def create_resnet_visual_observation_encoder( image_input: tf.Tensor, h_size: int, activation: ActivationFunction, num_layers: int, scope: str, reuse: bool, ) -> tf.Tensor: """ Builds a set of resnet visual encoders. :param image_input: The placeholder for the image input to use. :param h_size: Hidden layer size. :param activation: What type of activation function to use for layers. :param num_layers: number of hidden layers to create. :param scope: The scope of the graph within which to create the ops. :param reuse: Whether to re-use the weights within the same scope. :return: List of hidden layer tensors. """ n_channels = [16, 32, 32] # channel for each stack n_blocks = 2 # number of residual blocks with tf.variable_scope(scope): hidden = image_input for i, ch in enumerate(n_channels): hidden = tf.layers.conv2d( hidden, ch, kernel_size=[3, 3], strides=[1, 1], reuse=reuse, name="layer%dconv_1" % i, ) hidden = tf.layers.max_pooling2d( hidden, pool_size=[3, 3], strides=[2, 2], padding="same" ) # create residual blocks for j in range(n_blocks): block_input = hidden hidden = tf.nn.relu(hidden) hidden = tf.layers.conv2d( hidden, ch, kernel_size=[3, 3], strides=[1, 1], padding="same", reuse=reuse, name="layer%d_%d_conv1" % (i, j), ) hidden = tf.nn.relu(hidden) hidden = tf.layers.conv2d( hidden, ch, kernel_size=[3, 3], strides=[1, 1], padding="same", reuse=reuse, name="layer%d_%d_conv2" % (i, j), ) hidden = tf.add(block_input, hidden) hidden = tf.nn.relu(hidden) hidden = tf.layers.flatten(hidden) with tf.variable_scope(scope + "/" + "flat_encoding"): hidden_flat = LearningModel.create_vector_observation_encoder( hidden, h_size, activation, num_layers, scope, reuse ) return hidden_flat @staticmethod def create_discrete_action_masking_layer(all_logits, action_masks, action_size): """ Creates a masking layer for the discrete actions :param all_logits: The concatenated unnormalized action probabilities for all branches :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action] :param action_size: A list containing the number of possible actions for each branch :return: The action output dimension [batch_size, num_branches], the concatenated normalized probs (after softmax) and the concatenated normalized log probs """ action_idx = [0] + list(np.cumsum(action_size)) branches_logits = [ all_logits[:, action_idx[i] : action_idx[i + 1]] for i in range(len(action_size)) ] branch_masks = [ action_masks[:, action_idx[i] : action_idx[i + 1]] for i in range(len(action_size)) ] raw_probs = [ tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k]) for k in range(len(action_size)) ] normalized_probs = [ tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True)) for k in range(len(action_size)) ] output = tf.concat( [ tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1) for k in range(len(action_size)) ], axis=1, ) return ( output, tf.concat([normalized_probs[k] for k in range(len(action_size))], axis=1), tf.concat( [ tf.log(normalized_probs[k] + EPSILON) for k in range(len(action_size)) ], axis=1, ), ) def create_observation_streams( self, num_streams: int, h_size: int, num_layers: int, vis_encode_type: EncoderType = EncoderType.SIMPLE, stream_scopes: List[str] = None, ) -> List[tf.Tensor]: """ Creates encoding stream for observations. :param num_streams: Number of streams to create. :param h_size: Size of hidden linear layers in stream. :param num_layers: Number of hidden linear layers in stream. :param stream_scopes: List of strings (length == num_streams), which contains the scopes for each of the streams. None if all under the same TF scope. :return: List of encoded streams. """ brain = self.brain activation_fn = self.swish self.visual_in = [] for i in range(brain.number_visual_observations): visual_input = self.create_visual_input( brain.camera_resolutions[i], name="visual_observation_" + str(i) ) self.visual_in.append(visual_input) vector_observation_input = self.create_vector_input() # Pick the encoder function based on the EncoderType create_encoder_func = LearningModel.create_visual_observation_encoder if vis_encode_type == EncoderType.RESNET: create_encoder_func = LearningModel.create_resnet_visual_observation_encoder elif vis_encode_type == EncoderType.NATURE_CNN: create_encoder_func = ( LearningModel.create_nature_cnn_visual_observation_encoder ) final_hiddens = [] for i in range(num_streams): visual_encoders = [] hidden_state, hidden_visual = None, None _scope_add = stream_scopes[i] if stream_scopes else "" if self.vis_obs_size > 0: for j in range(brain.number_visual_observations): encoded_visual = create_encoder_func( self.visual_in[j], h_size, activation_fn, num_layers, scope=f"{_scope_add}main_graph_{i}_encoder{j}", reuse=False, ) visual_encoders.append(encoded_visual) hidden_visual = tf.concat(visual_encoders, axis=1) if brain.vector_observation_space_size > 0: hidden_state = self.create_vector_observation_encoder( vector_observation_input, h_size, activation_fn, num_layers, scope=f"{_scope_add}main_graph_{i}", reuse=False, ) if hidden_state is not None and hidden_visual is not None: final_hidden = tf.concat([hidden_visual, hidden_state], axis=1) elif hidden_state is None and hidden_visual is not None: final_hidden = hidden_visual elif hidden_state is not None and hidden_visual is None: final_hidden = hidden_state else: raise Exception( "No valid network configuration possible. " "There are no states or observations in this brain" ) final_hiddens.append(final_hidden) return final_hiddens @staticmethod def create_recurrent_encoder(input_state, memory_in, sequence_length, name="lstm"): """ Builds a recurrent encoder for either state or observations (LSTM). :param sequence_length: Length of sequence to unroll. :param input_state: The input tensor to the LSTM cell. :param memory_in: The input memory to the LSTM cell. :param name: The scope of the LSTM cell. """ s_size = input_state.get_shape().as_list()[1] m_size = memory_in.get_shape().as_list()[1] lstm_input_state = tf.reshape(input_state, shape=[-1, sequence_length, s_size]) memory_in = tf.reshape(memory_in[:, :], [-1, m_size]) half_point = int(m_size / 2) with tf.variable_scope(name): rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point) lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple( memory_in[:, :half_point], memory_in[:, half_point:] ) recurrent_output, lstm_state_out = tf.nn.dynamic_rnn( rnn_cell, lstm_input_state, initial_state=lstm_vector_in ) recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point]) return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1) def create_value_heads(self, stream_names, hidden_input): """ Creates one value estimator head for each reward signal in stream_names. Also creates the node corresponding to the mean of all the value heads in self.value. self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. :param stream_names: The list of reward signal names :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top of the hidden input. """ for name in stream_names: value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name)) self.value_heads[name] = value self.value = tf.reduce_mean(list(self.value_heads.values()), 0)