from typing import Optional, Tuple

from mlagents.tf_utils import tf

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.models import ModelUtils

EPSILON = 1e-7


class GAILModel:
    def __init__(
        self,
        policy: TFPolicy,
        h_size: int = 128,
        learning_rate: float = 3e-4,
        encoding_size: int = 64,
        use_actions: bool = False,
        use_vail: bool = False,
        gradient_penalty_weight: float = 10.0,
    ):
        """
        The initializer for the GAIL reward generator.
        https://arxiv.org/abs/1606.03476
        :param policy: The policy of the learning algorithm
        :param h_size: Size of the hidden layer for the discriminator
        :param learning_rate: The learning rate for the discriminator
        :param encoding_size: The encoding size for the encoder
        :param use_actions: Whether or not to use actions to discriminate
        :param use_vail: Whether or not to use a variational bottleneck for the
        discriminator. See https://arxiv.org/abs/1810.00821.
        :param gradient_penalty_weight: Weight of the gradient penalty added to the
        discriminator loss.
        """
        self.h_size = h_size
        self.z_size = 128
        self.alpha = 0.0005
        self.mutual_information = 0.5
        self.policy = policy
        self.encoding_size = encoding_size
        self.gradient_penalty_weight = gradient_penalty_weight
        self.use_vail = use_vail
        self.use_actions = use_actions
        self.noise: Optional[tf.Tensor] = None
        self.z: Optional[tf.Tensor] = None
        self.make_inputs()
        self.create_network()
        self.create_loss(learning_rate)
        if self.use_vail:
            self.make_beta_update()

    def make_beta_update(self) -> None:
        """
        Creates the beta parameter and its updater for GAIL
        """
        new_beta = tf.maximum(
            self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
        )
        with tf.control_dependencies([self.update_batch]):
            self.update_beta = tf.assign(self.beta, new_beta)

    def make_inputs(self) -> None:
        """
        Creates the input layers for the discriminator
        """
        self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
        self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

        if self.policy.behavior_spec.is_action_continuous():
            action_length = self.policy.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], act_size)
                    for i, act_size in enumerate(self.policy.act_size)
                ],
                axis=1,
            )

        encoded_policy_list = []
        encoded_expert_list = []

        (
            self.obs_in_expert,
            self.expert_visual_in,
        ) = ModelUtils.create_input_placeholders(
            self.policy.behavior_spec.observation_shapes, "gail_"
        )

        if self.policy.vec_obs_size > 0:
            if self.policy.normalize:
                encoded_expert_list.append(
                    ModelUtils.normalize_vector_obs(
                        self.obs_in_expert,
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                )
                encoded_policy_list.append(self.policy.processed_vector_in)
            else:
                encoded_expert_list.append(self.obs_in_expert)
                encoded_policy_list.append(self.policy.vector_in)

        if self.expert_visual_in:
            visual_policy_encoders = []
            visual_expert_encoders = []
            for i, (vis_in, exp_vis_in) in enumerate(
                zip(self.policy.visual_in, self.expert_visual_in)
            ):
                encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    False,
                )
                encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                    exp_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"gail_stream_{i}_visual_obs_encoder",
                    True,
                )
                visual_policy_encoders.append(encoded_policy_visual)
                visual_expert_encoders.append(encoded_expert_visual)
            hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
            hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
            encoded_policy_list.append(hidden_policy_visual)
            encoded_expert_list.append(hidden_expert_visual)

        self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
        self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
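
    # A minimal illustration of how a minibatch of expert demonstrations could be fed
    # into the placeholders created above. The array names (`expert_obs`,
    # `expert_actions`, `expert_dones`) are hypothetical stand-ins for data pulled from
    # a demonstration buffer; only the placeholder attributes themselves come from this
    # class:
    #
    #     feed_dict = {
    #         self.obs_in_expert: expert_obs,          # [batch, vec_obs_size]
    #         self.action_in_expert: expert_actions,   # [batch, action_length]
    #         self.done_expert_holder: expert_dones,   # [batch]
    #     }
    #
    # Any visual observations would be fed through the corresponding entries of
    # self.expert_visual_in, while the policy-side tensors (self.policy.vector_in,
    # self.policy.selected_actions, self.done_policy_holder) carry the learner's own
    # samples.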

    def create_encoder(
        self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Creates the encoder for the discriminator
        :param state_in: The encoded observation input
        :param action_in: The action input
        :param done_in: The done flags input
        :param reuse: If true, the weights will be shared with the previous encoder created
        """
        with tf.variable_scope("GAIL_model"):
            if self.use_actions:
                concat_input = tf.concat([state_in, action_in, done_in], axis=1)
            else:
                concat_input = state_in

            hidden_1 = tf.layers.dense(
                concat_input,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_1",
                reuse=reuse,
            )

            hidden_2 = tf.layers.dense(
                hidden_1,
                self.h_size,
                activation=ModelUtils.swish,
                name="gail_d_hidden_2",
                reuse=reuse,
            )

            z_mean = None
            if self.use_vail:
                # Latent representation
                z_mean = tf.layers.dense(
                    hidden_2,
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )

                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

                # Sampled latent code
                self.z = z_mean + self.z_sigma * self.noise * self.use_noise
                estimate_input = self.z
            else:
                estimate_input = hidden_2

            estimate = tf.layers.dense(
                estimate_input,
                1,
                activation=tf.nn.sigmoid,
                name="gail_d_estimate",
                reuse=reuse,
            )
            return estimate, z_mean, concat_input

    def create_network(self) -> None:
        """
        Helper for creating the intrinsic reward nodes
        """
        if self.use_vail:
            self.z_sigma = tf.get_variable(
                "gail_sigma_vail",
                self.z_size,
                dtype=tf.float32,
                initializer=tf.ones_initializer(),
            )
            self.z_sigma_sq = self.z_sigma * self.z_sigma
            self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
            self.use_noise = tf.placeholder(
                shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
            )
        self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
            self.encoded_expert, self.expert_action, self.done_expert, reuse=False
        )
        self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
            self.encoded_policy,
            self.policy.selected_actions,
            self.done_policy,
            reuse=True,
        )
        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
        self.discriminator_score = tf.reshape(
            self.policy_estimate, [-1], name="gail_reward"
        )
        self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
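
    # The encoder above ends in a sigmoid, so the discriminator estimate D(s, a) lies in
    # (0, 1). The intrinsic reward built in create_network is
    # r = -log(1 - D(s, a) + EPSILON): samples the discriminator scores as expert-like
    # (D near 1) earn a large reward, while clearly policy-like samples (D near 0) earn
    # a reward near zero. For example, D = 0.9 gives r ~ -log(0.1) ~ 2.30, and D = 0.5
    # gives r ~ 0.69.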
""" expert = [self.encoded_expert, self.expert_action, self.done_expert] policy = [self.encoded_policy, self.policy.selected_actions, self.done_policy] interp = [] for _expert_in, _policy_in in zip(expert, policy): alpha = tf.random_uniform(tf.shape(_expert_in)) interp.append(alpha * _expert_in + (1 - alpha) * _policy_in) grad_estimate, _, grad_input = self.create_encoder( interp[0], interp[1], interp[2], reuse=True ) grad = tf.gradients(grad_estimate, [grad_input])[0] # Norm's gradient could be NaN at 0. Use our own safe_norm safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON) gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2)) return gradient_mag def create_loss(self, learning_rate: float) -> None: """ Creates the loss and update nodes for the GAIL reward generator :param learning_rate: The learning rate for the optimizer """ self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate) self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate) if self.use_vail: self.beta = tf.get_variable( "gail_beta", [], trainable=False, dtype=tf.float32, initializer=tf.ones_initializer(), ) self.discriminator_loss = -tf.reduce_mean( tf.log(self.expert_estimate + EPSILON) + tf.log(1.0 - self.policy_estimate + EPSILON) ) if self.use_vail: # KL divergence loss (encourage latent representation to be normal) self.kl_loss = tf.reduce_mean( -tf.reduce_sum( 1 + self.z_log_sigma_sq - 0.5 * tf.square(self.z_mean_expert) - 0.5 * tf.square(self.z_mean_policy) - tf.exp(self.z_log_sigma_sq), 1, ) ) self.loss = ( self.beta * (self.kl_loss - self.mutual_information) + self.discriminator_loss ) else: self.loss = self.discriminator_loss if self.gradient_penalty_weight > 0.0: self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude() optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) self.update_batch = optimizer.minimize(self.loss)