import os
import numpy as np
from typing import Any, Dict, Optional, List, Tuple
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, NormalizerTensors
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
    GaussianDistribution,
    MultiCategoricalDistribution,
)

# import tf_slim as slim

EPSILON = 1e-6  # Small value to avoid divide by zero


class GaussianEncoderDistribution:
    def __init__(self, encoded: tf.Tensor, feature_size: int, reuse: bool = False):
        self.mu = tf.layers.dense(
            encoded,
            feature_size,
            activation=None,
            name="mu",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=reuse,
        )
        self.log_sigma = tf.layers.dense(
            encoded,
            feature_size,
            activation=None,
            name="log_std",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=reuse,
        )
        self.sigma = tf.exp(self.log_sigma)

    def sample(self):
        epsilon = tf.random_normal(tf.shape(self.mu))
        sampled = self.mu + self.sigma * epsilon
        return sampled

    def kl_standard(self):
        """
        KL divergence with a standard Gaussian
        """
        kl = 0.5 * tf.reduce_sum(
            tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1
        )
        return kl

    def w_distance(self, another):
        return tf.sqrt(
            tf.reduce_sum(tf.squared_difference(self.mu, another.mu), axis=1)
            + tf.reduce_sum(tf.squared_difference(self.sigma, another.sigma), axis=1)
        )


class TransferPolicy(TFPolicy):
    def __init__(
        self,
        seed: int,
        brain: BrainParameters,
        trainer_params: TrainerSettings,
        is_training: bool,
        model_path: str,
        load: bool,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
        create_tf_graph: bool = True,
    ):
        """
        Policy that uses a multilayer perceptron to map the observations to actions. Could
        also use a CNN to encode visual input prior to the MLP. Supports discrete and
        continuous action spaces, as well as recurrent networks.
        :param seed: Random seed.
        :param brain: Assigned BrainParameters object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        :param model_path: Path where the model should be saved and loaded.
        :param tanh_squash: Whether to use a tanh function on the continuous output,
        or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the
        policy in continuous output.
        """
        super().__init__(seed, brain, trainer_params, model_path, load)
        self.grads = None
        self.update_batch: Optional[tf.Operation] = None
        num_layers = self.network_settings.num_layers
        self.h_size = self.network_settings.hidden_units
        if num_layers < 1:
            num_layers = 1
        self.num_layers = num_layers
        self.vis_encode_type = self.network_settings.vis_encode_type
        self.tanh_squash = tanh_squash
        self.reparameterize = reparameterize
        self.condition_sigma_on_obs = condition_sigma_on_obs
        self.trainable_variables: List[tf.Variable] = []

        self.next_visual_in: List[tf.Tensor] = []
        self.encoder = None
        self.encoder_distribution = None
        self.targ_encoder = None

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.
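        # These bounds are presumably intended to clamp the log standard deviation of
        # the Gaussian policy head (-20 / 2 is a commonly used clamp range); they are
        # not referenced elsewhere in this file.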
        self.log_std_min = -20
        self.log_std_max = 2
        if create_tf_graph:
            self.create_tf_graph()

    def get_trainable_variables(
        self,
        train_encoder: bool = True,
        train_action: bool = True,
        train_model: bool = True,
        train_policy: bool = True,
    ) -> List[tf.Variable]:
        """
        Returns a list of the trainable variables in this policy. If create_tf_graph
        hasn't been called, returns an empty list.
        """
        trainable_variables = []
        if train_encoder:
            trainable_variables += self.encoding_variables
        if train_action:
            trainable_variables += self.action_variables
        if train_model:
            trainable_variables += self.model_variables
        if train_policy:
            trainable_variables += self.policy_variables
        return trainable_variables

    def create_tf_graph(
        self,
        encoder_layers=1,
        action_layers=1,
        policy_layers=1,
        forward_layers=1,
        inverse_layers=1,
        feature_size=16,
        action_feature_size=16,
        transfer=False,
        separate_train=False,
        separate_policy_net=False,
        separate_model_train=False,
        var_encoder=False,
        var_predict=True,
        predict_return=True,
        inverse_model=False,
        reuse_encoder=True,
        use_bisim=True,
        tau=0.1,
    ) -> None:
        """
        Builds the TensorFlow graph needed for this policy.
        """
        self.inverse_model = inverse_model
        self.reuse_encoder = reuse_encoder
        self.feature_size = feature_size
        self.action_feature_size = action_feature_size
        self.predict_return = predict_return
        self.use_bisim = use_bisim
        self.transfer = transfer
        self.tau = tau

        with self.graph.as_default():
            tf.set_random_seed(self.seed)
            _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if len(_vars) > 0:
                # We assume the first thing created in the graph is the Policy. If
                # already populated, don't create more tensors.
                return

            self.create_input_placeholders()
            self.create_next_inputs()
            self.current_action = tf.placeholder(
                shape=[None, sum(self.act_size)],
                dtype=tf.float32,
                name="current_action",
            )
            self.current_reward = tf.placeholder(
                shape=[None], dtype=tf.float32, name="current_reward"
            )

            self.encoder = self._create_encoder_general(
                self.visual_in,
                self.processed_vector_in,
                self.h_size,
                self.feature_size,
                encoder_layers,
                self.vis_encode_type,
                scope="encoding",
            )
            self.next_encoder = self._create_encoder_general(
                self.visual_next,
                self.processed_vector_next,
                self.h_size,
                self.feature_size,
                encoder_layers,
                self.vis_encode_type,
                scope="encoding",
                reuse=True,
            )
            self.targ_encoder = self._create_encoder_general(
                self.visual_in,
                self.processed_vector_in,
                self.h_size,
                self.feature_size,
                encoder_layers,
                self.vis_encode_type,
                scope="target_enc",
                stop_gradient=True,
            )
            self.next_targ_encoder = self._create_encoder_general(
                self.visual_next,
                self.processed_vector_next,
                self.h_size,
                self.feature_size,
                encoder_layers,
                self.vis_encode_type,
                scope="target_enc",
                reuse=True,
                stop_gradient=True,
            )
            self._create_hard_copy()
            self._create_soft_copy()

            self.action_encoder = self._create_action_encoder(
                self.current_action,
                self.h_size,
                self.action_feature_size,
                action_layers,
            )

            if self.inverse_model:
                with tf.variable_scope("inverse"):
                    self.create_inverse_model(
                        self.encoder, self.targ_encoder, inverse_layers
                    )

            with tf.variable_scope("predict"):
                self.predict, self.predict_distribution = self.create_forward_model(
                    self.encoder,
                    self.action_encoder,
                    forward_layers,
                    var_predict=var_predict,
                    separate_train=separate_model_train,
                )
                self.targ_predict, self.targ_predict_distribution = self.create_forward_model(
                    self.targ_encoder,
                    self.action_encoder,
                    forward_layers,
                    var_predict=var_predict,
                    reuse=True,
                    separate_train=separate_model_train,
                )
            self.create_forward_loss(self.reuse_encoder, self.transfer)

            if predict_return:
                with tf.variable_scope("reward"):
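                    # Reward head: regress the environment reward from the encoded
                    # state and encoded action; the squared-error loss is built in
                    # create_reward_model below.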
                    self.create_reward_model(
                        self.encoder,
                        self.action_encoder,
                        forward_layers,
                        separate_train=separate_model_train,
                    )

            if self.use_bisim:
                self.create_bisim_model(
                    self.h_size,
                    self.feature_size,
                    encoder_layers,
                    action_layers,
                    self.vis_encode_type,
                    forward_layers,
                    var_predict,
                    predict_return,
                )

            if self.use_continuous_act:
                self._create_cc_actor(
                    self.encoder,
                    self.h_size,
                    policy_layers,
                    self.tanh_squash,
                    self.reparameterize,
                    self.condition_sigma_on_obs,
                    separate_train,
                    separate_policy_net,
                )
            else:
                self._create_dc_actor(
                    self.encoder,
                    self.h_size,
                    policy_layers,
                    separate_train,
                    separate_policy_net,
                )

            self.policy_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
            )
            self.encoding_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="encoding"
            )
            self.action_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="action_enc"
            )
            self.model_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="predict"
            ) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="reward")
            self.encoding_variables += tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
            )  # LSTMs need to be root scope for Barracuda export
            if self.inverse_model:
                self.model_variables += tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse"
                )

            self.inference_dict: Dict[str, tf.Tensor] = {
                "action": self.output,
                "log_probs": self.all_log_probs,
                "entropy": self.entropy,
            }
            if self.use_continuous_act:
                self.inference_dict["pre_action"] = self.output_pre
            if self.use_recurrent:
                self.inference_dict["memory_out"] = self.memory_out

            # We do an initialize to make the Policy usable out of the box. If an
            # optimizer is needed, it will re-load the full graph.
            self._initialize_graph()

            # slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)

    def load_graph_partial(
        self,
        path: str,
        load_model=False,
        load_policy=False,
        load_value=False,
        load_encoder=False,
        load_action=False,
    ):
        load_nets = []
        if load_model:
            load_nets.append("predict")
            if self.predict_return:
                load_nets.append("reward")
        if load_policy:
            load_nets.append("policy")
        if load_value:
            load_nets.append("value")
        if load_encoder:
            load_nets.append("encoding")
        if load_action:
            load_nets.append("action_enc")
        if self.inverse_model:
            load_nets.append("inverse")

        with self.graph.as_default():
            for net in load_nets:
                variables_to_restore = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, net
                )
                if net == "value" and len(variables_to_restore) == 0:
                    variables_to_restore = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, "critic"
                    )
                    net = "critic"
                partial_saver = tf.train.Saver(variables_to_restore)
                partial_model_checkpoint = os.path.join(path, f"{net}.ckpt")
                partial_saver.restore(self.sess, partial_model_checkpoint)
                print("loaded net", net, "from path", path)

    @timed
    def evaluate(
        self, decision_requests: DecisionSteps, global_agent_ids: List[str]
    ) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param decision_requests: DecisionSteps object containing inputs.
        :param global_agent_ids: The global (with worker ID) agent ids of the data in
        the batched_step_result.
        :return: Outputs from network as defined by self.inference_dict.
""" feed_dict = { self.batch_size_ph: len(decision_requests), self.sequence_length_ph: 1, } if self.use_recurrent: if not self.use_continuous_act: feed_dict[self.prev_action] = self.retrieve_previous_action( global_agent_ids ) feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids) feed_dict = self.fill_eval_dict(feed_dict, decision_requests) run_out = self._execute_model(feed_dict, self.inference_dict) return run_out def _create_encoder_general( self, visual_in: List[tf.Tensor], vector_in: tf.Tensor, h_size: int, feature_size: int, num_layers: int, vis_encode_type: EncoderType, scope: str, reuse: bool=False, stop_gradient: bool=False ) -> tf.Tensor: """ Creates an encoder for visual and vector observations. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. :param vis_encode_type: Type of visual encoder to use if visual input. :return: The hidden layer (tf.Tensor) after the encoder. """ with tf.variable_scope(scope): hidden_stream = ModelUtils.create_observation_streams( visual_in, vector_in, 1, h_size, num_layers, vis_encode_type, reuse=reuse )[0] latent = tf.layers.dense( hidden_stream, feature_size, name="latent", activation=tf.tanh, # ModelUtils.swish, kernel_initializer=tf.initializers.variance_scaling(1.0), reuse=reuse ) if stop_gradient: latent = tf.stop_gradient(latent) return latent def _create_action_encoder( self, action: tf.Tensor, h_size: int, action_feature_size: int, num_layers: int, reuse: bool=False ) -> tf.Tensor: if num_layers < 0: return action hidden_stream = ModelUtils.create_vector_observation_encoder( action, h_size, ModelUtils.swish, num_layers, scope="action_enc", reuse=reuse ) with tf.variable_scope("action_enc"): latent = tf.layers.dense( hidden_stream, action_feature_size, name="latent", activation=tf.tanh, kernel_initializer=tf.initializers.variance_scaling(1.0), reuse=reuse ) return latent def _create_hard_copy(self): t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc") e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding") with tf.variable_scope("hard_replacement"): self.target_hardcp_op = [ tf.assign(t, e) for t, e in zip(t_params, e_params) ] def _create_soft_copy(self): t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc") e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding") with tf.variable_scope("soft_replacement"): self.target_softcp_op = [ tf.assign(t, (1-self.tau) * t + self.tau * e) for t, e in zip(t_params, e_params) ] def run_hard_copy(self): self.sess.run(self.target_hardcp_op) def run_soft_copy(self): self.sess.run(self.target_softcp_op) def _create_cc_actor( self, encoded: tf.Tensor, h_size: int, num_layers: int, tanh_squash: bool = False, reparameterize: bool = False, condition_sigma_on_obs: bool = True, separate_train: bool = False, separate_net: bool = False ) -> None: """ Creates Continuous control actor-critic model. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. :param vis_encode_type: Type of visual encoder to use if visual input. :param tanh_squash: Whether to use a tanh function, or a clipped output. :param reparameterize: Whether we are using the resampling trick to update the policy. 
""" with tf.variable_scope("policy"): if separate_net: encoded = self._create_encoder_general( self.visual_in, self.processed_vector_in, h_size, self.feature_size, num_layers, self.vis_encode_type, scope="policy_enc" ) if self.use_recurrent: self.memory_in = tf.placeholder( shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in" ) hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder( encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy" ) self.memory_out = tf.identity(memory_policy_out, name="recurrent_out") else: hidden_policy = encoded if not separate_net: if separate_train: hidden_policy = tf.stop_gradient(hidden_policy) hidden_policy = ModelUtils.create_vector_observation_encoder( hidden_policy, h_size, ModelUtils.swish, num_layers, scope=f"main_graph", reuse=False, ) distribution = GaussianDistribution( hidden_policy, self.act_size, reparameterize=reparameterize, tanh_squash=tanh_squash, condition_sigma=condition_sigma_on_obs, ) if tanh_squash: self.output_pre = distribution.sample self.output = tf.identity(self.output_pre, name="action") else: self.output_pre = distribution.sample # Clip and scale output to ensure actions are always within [-1, 1] range. output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3 self.output = tf.identity(output_post, name="action") self.selected_actions = tf.stop_gradient(self.output) self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs") self.entropy = distribution.entropy # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control. self.total_log_probs = distribution.total_log_probs def _create_dc_actor( self, encoded: tf.Tensor, h_size: int, num_layers: int, separate_train: bool = False, separate_net: bool = False ) -> None: """ Creates Discrete control actor-critic model. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. :param vis_encode_type: Type of visual encoder to use if visual input. """ with tf.variable_scope("policy"): if separate_net: encoded = self._create_encoder_general( self.visual_in, self.processed_vector_in, h_size, self.feature_size, num_layers, self.vis_encode_type, scope="policy_enc" ) if self.use_recurrent: self.prev_action = tf.placeholder( shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action" ) prev_action_oh = tf.concat( [ tf.one_hot(self.prev_action[:, i], self.act_size[i]) for i in range(len(self.act_size)) ], axis=1, ) hidden_policy = tf.concat([encoded, prev_action_oh], axis=1) self.memory_in = tf.placeholder( shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in" ) hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder( hidden_policy, self.memory_in, self.sequence_length_ph, name="lstm_policy", ) self.memory_out = tf.identity(memory_policy_out, "recurrent_out") else: hidden_policy = encoded if not separate_net: if separate_train: hidden_policy = tf.stop_gradient(hidden_policy) hidden_policy = ModelUtils.create_vector_observation_encoder( hidden_policy, h_size, ModelUtils.swish, num_layers, scope=f"main_graph", reuse=False, ) self.action_masks = tf.placeholder( shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks" ) distribution = MultiCategoricalDistribution( hidden_policy, self.act_size, self.action_masks ) # It's important that we are able to feed_dict a value into this tensor to get the # right one-hot encoding, so we can't do identity on it. 
        self.output = distribution.sample
        self.all_log_probs = tf.identity(distribution.log_probs, name="action")
        self.selected_actions = tf.stop_gradient(
            distribution.sample_onehot
        )  # In discrete, these are onehot
        self.entropy = distribution.entropy
        self.total_log_probs = distribution.total_log_probs

    def save_model(self, steps):
        """
        Saves the model.
        :param steps: The number of steps the model was trained for.
        :return:
        """
        # self.get_policy_weights()
        with self.graph.as_default():
            last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
            self.saver.save(self.sess, last_checkpoint)
            tf.train.write_graph(
                self.graph, self.model_path, "raw_graph_def.pb", as_text=False
            )

            # save each net separately
            policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
            policy_saver = tf.train.Saver(policy_vars)
            policy_checkpoint = os.path.join(self.model_path, "policy.ckpt")
            policy_saver.save(self.sess, policy_checkpoint)

            encoding_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
            )
            encoding_saver = tf.train.Saver(encoding_vars)
            encoding_checkpoint = os.path.join(self.model_path, "encoding.ckpt")
            encoding_saver.save(self.sess, encoding_checkpoint)

            action_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc"
            )
            if len(action_vars) > 0:
                action_saver = tf.train.Saver(action_vars)
                action_checkpoint = os.path.join(self.model_path, "action_enc.ckpt")
                action_saver.save(self.sess, action_checkpoint)

            latent_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent"
            )
            latent_saver = tf.train.Saver(latent_vars)
            latent_checkpoint = os.path.join(self.model_path, "latent.ckpt")
            latent_saver.save(self.sess, latent_checkpoint)

            predict_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
            )
            predict_saver = tf.train.Saver(predict_vars)
            predict_checkpoint = os.path.join(self.model_path, "predict.ckpt")
            predict_saver.save(self.sess, predict_checkpoint)

            value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
            if len(value_vars) > 0:
                value_saver = tf.train.Saver(value_vars)
                value_checkpoint = os.path.join(self.model_path, "value.ckpt")
                value_saver.save(self.sess, value_checkpoint)

            critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "critic")
            if len(critic_vars) > 0:
                critic_saver = tf.train.Saver(critic_vars)
                critic_checkpoint = os.path.join(self.model_path, "critic.ckpt")
                critic_saver.save(self.sess, critic_checkpoint)

            if self.inverse_model:
                inverse_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "inverse"
                )
                inverse_saver = tf.train.Saver(inverse_vars)
                inverse_checkpoint = os.path.join(self.model_path, "inverse.ckpt")
                inverse_saver.save(self.sess, inverse_checkpoint)

            if self.predict_return:
                reward_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "reward"
                )
                reward_saver = tf.train.Saver(reward_vars)
                reward_checkpoint = os.path.join(self.model_path, "reward.ckpt")
                reward_saver.save(self.sess, reward_checkpoint)

    def create_target_normalizer(
        self, vector_obs: tf.Tensor, prefix="vn"
    ) -> NormalizerTensors:
        vec_obs_size = vector_obs.shape[1]
        steps = tf.get_variable(
            prefix + "_normalization_steps",
            [],
            trainable=False,
            dtype=tf.int32,
            initializer=tf.zeros_initializer(),
        )
        running_mean = tf.get_variable(
            prefix + "_running_mean",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
        )
        running_variance = tf.get_variable(
            prefix + "_running_variance",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
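        # Build the op that folds a new batch of observations into the running
        # mean/variance estimates tracked by the variables above.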
        update_normalization = ModelUtils.create_normalizer_update(
            vector_obs, steps, running_mean, running_variance
        )
        return NormalizerTensors(
            update_normalization, steps, running_mean, running_variance
        )

    def update_normalization(
        self,
        vector_obs: np.ndarray,
        vector_obs_next: np.ndarray,
        vector_obs_bisim: np.ndarray,
    ) -> None:
        """
        If this policy normalizes vector observations, this will update the norm values
        in the graph.
        :param vector_obs: The vector observations to add to the running estimate of
        the distribution.
        :param vector_obs_next: The next vector observations, fed to the target ("vn")
        normalizer.
        :param vector_obs_bisim: The vector observations fed to the bisimulation ("bi")
        normalizer.
        """
        if self.use_vec_obs and self.normalize:
            self.sess.run(
                self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
            )
            self.sess.run(
                self.vn_update_normalization_op,
                feed_dict={self.vector_next: vector_obs_next},
            )
            if self.use_bisim:
                self.sess.run(
                    self.bi_update_normalization_op,
                    feed_dict={self.vector_bisim: vector_obs_bisim},
                )

    def get_encoder_weights(self):
        with self.graph.as_default():
            enc = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0"
            )
            targ = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/latent/bias:0"
            )
            print("encoding:", self.sess.run(enc))
            print("target:", self.sess.run(targ))

    def get_policy_weights(self):
        with self.graph.as_default():
            # pol = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "policy/mu/bias:0")
            # print("policy:", self.sess.run(pol))
            enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding")
            print("encoding:", self.sess.run(enc))
            pred = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "predict")
            print("predict:", self.sess.run(pred))
            # rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
            # print("reward:", self.sess.run(rew))

    def create_inverse_model(
        self,
        encoded_state: tf.Tensor,
        encoded_next_state: tf.Tensor,
        inverse_layers: int,
    ) -> None:
        """
        Creates inverse model TensorFlow ops for Curiosity module.
        Predicts action taken given current and future encoded states.
        :param encoded_state: Tensor corresponding to encoded current state.
        :param encoded_next_state: Tensor corresponding to encoded next state.
        """
        combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
        # hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
        hidden = combined_input
        for i in range(inverse_layers - 1):
            hidden = tf.layers.dense(
                hidden,
                self.h_size,
                activation=ModelUtils.swish,
                name="hidden_{}".format(i),
                kernel_initializer=tf.initializers.variance_scaling(1.0),
            )
        if self.brain.vector_action_space_type == "continuous":
            pred_action = tf.layers.dense(
                hidden, self.act_size[0], activation=None, name="pred_action"
            )
            squared_difference = tf.reduce_sum(
                tf.squared_difference(pred_action, self.current_action), axis=1
            )
            self.inverse_loss = tf.reduce_mean(
                tf.dynamic_partition(squared_difference, self.mask, 2)[1]
            )
        else:
            pred_action = tf.concat(
                [
                    tf.layers.dense(
                        hidden,
                        self.act_size[i],
                        activation=tf.nn.softmax,
                        name="pred_action",
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
            cross_entropy = tf.reduce_sum(
                -tf.log(pred_action + 1e-10) * self.current_action, axis=1
            )
            self.inverse_loss = tf.reduce_mean(
                tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
            )

    def create_forward_model(
        self,
        encoded_state: tf.Tensor,
        encoded_action: tf.Tensor,
        forward_layers: int,
        var_predict: bool = False,
        reuse: bool = False,
        separate_train: bool = False,
    ) -> Tuple[tf.Tensor, Optional[GaussianEncoderDistribution]]:
        """
        Creates forward model TensorFlow ops for Curiosity module.
        Predicts encoded future state based on encoded current state and given action.
        :param encoded_state: Tensor corresponding to encoded current state.
        :param encoded_action: Tensor corresponding to the encoded action.
        :return: The predicted next encoded state and, if var_predict is set, its
        distribution (otherwise None).
        """
        if separate_train:
            encoded_state = tf.stop_gradient(encoded_state)
            # encoded_action = tf.stop_gradient(encoded_action)
        combined_input = tf.concat([encoded_state, encoded_action], axis=1)
        hidden = combined_input
        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden,
                self.h_size,
                name="hidden_{}".format(i),
                activation=ModelUtils.swish,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),
                reuse=reuse,
            )

        if var_predict:
            predict_distribution = GaussianEncoderDistribution(
                hidden, self.feature_size, reuse=reuse
            )
            predict = predict_distribution.sample()
        else:
            predict = tf.layers.dense(
                hidden,
                self.feature_size,
                name="latent",
                # activation=tf.tanh,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),
                reuse=reuse,
            )
            predict_distribution = None
        return predict, predict_distribution

        # if not self.transfer:
        #     encoded_next_state = tf.stop_gradient(encoded_next_state)
        # squared_difference = 0.5 * tf.reduce_sum(
        #     tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
        # )
        # # self.forward_loss = tf.reduce_mean(squared_difference)
        # self.next_state = encoded_next_state
        # self.forward_loss = tf.reduce_mean(
        #     tf.dynamic_partition(squared_difference, self.mask, 2)[1]
        # )

    def create_forward_loss(self, reuse: bool, transfer: bool):
        if not transfer:
            if reuse:
                encoded_next_state = tf.stop_gradient(self.next_encoder)
            else:
                # gradient of the target encoder is already stopped
                encoded_next_state = self.next_targ_encoder
            squared_difference = 0.5 * tf.reduce_sum(
                tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
            )
            self.forward_loss = tf.reduce_mean(
                tf.dynamic_partition(squared_difference, self.mask, 2)[1]
            )
        else:
            if reuse:
                squared_difference_1 = 0.5 * tf.reduce_sum(
                    tf.squared_difference(
                        tf.tanh(self.predict), tf.stop_gradient(self.next_encoder)
                    ),
                    axis=1,
                )
                squared_difference_2 = 0.5 * tf.reduce_sum(
                    tf.squared_difference(
                        tf.tanh(tf.stop_gradient(self.predict)), self.next_encoder
                    ),
                    axis=1,
                )
            else:
                squared_difference_1 = 0.5 * tf.reduce_sum(
                    tf.squared_difference(
                        tf.tanh(self.predict), self.next_targ_encoder
                    ),
                    axis=1,
                )
                squared_difference_2 = 0.5 * tf.reduce_sum(
                    tf.squared_difference(
                        tf.tanh(self.targ_predict), self.next_encoder
                    ),
                    axis=1,
                )
            self.forward_loss = tf.reduce_mean(
                tf.dynamic_partition(
                    0.5 * squared_difference_1 + 0.5 * squared_difference_2,
                    self.mask,
                    2,
                )[1]
            )

    def create_reward_model(
        self,
        encoded_state: tf.Tensor,
        encoded_action: tf.Tensor,
        forward_layers: int,
        separate_train: bool = False,
    ):
        if separate_train:
            encoded_state = tf.stop_gradient(encoded_state)
            # encoded_action = tf.stop_gradient(encoded_action)
        combined_input = tf.concat([encoded_state, encoded_action], axis=1)

        hidden = combined_input
        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden,
                self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                name="hidden_{}".format(i),
                activation=ModelUtils.swish,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),
            )
        self.pred_reward = tf.layers.dense(
            hidden,
            1,
            name="reward",
            # activation=ModelUtils.swish,
            # kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
        self.reward_loss = tf.reduce_mean(
            tf.squared_difference(self.pred_reward, self.current_reward)
        )
        # self.reward_loss = tf.clip_by_value(
        #     tf.reduce_mean(
        #         tf.squared_difference(self.pred_reward, self.current_reward)
        #     ),
        #     1e-10,
        #     1.0,
        # )

    def create_bisim_model(
        self,
        h_size: int,
        feature_size: int,
        encoder_layers: int,
        action_layers: int,
        vis_encode_type: EncoderType,
        forward_layers: int,
        var_predict: bool,
        predict_return: bool,
    ) -> None:
        with tf.variable_scope("encoding"):
            self.visual_bisim = ModelUtils.create_visual_input_placeholders(
                self.brain.camera_resolutions
            )
            self.vector_bisim = ModelUtils.create_vector_input(self.vec_obs_size)
            if self.normalize:
                bi_normalization_tensors = self.create_target_normalizer(
                    self.vector_bisim, prefix="bi"
                )
                self.bi_update_normalization_op = bi_normalization_tensors.update_op
                self.bi_normalization_steps = bi_normalization_tensors.steps
                self.bi_running_mean = bi_normalization_tensors.running_mean
                self.bi_running_variance = bi_normalization_tensors.running_variance
                self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
                    self.vector_bisim,
                    self.bi_running_mean,
                    self.bi_running_variance,
                    self.bi_normalization_steps,
                )
            else:
                self.processed_vector_bisim = self.vector_bisim
                self.vp_update_normalization_op = None

            hidden_stream = ModelUtils.create_observation_streams(
                self.visual_bisim,
                self.processed_vector_bisim,
                1,
                h_size,
                encoder_layers,
                vis_encode_type,
                reuse=True,
            )[0]
            self.bisim_encoder = tf.layers.dense(
                hidden_stream,
                feature_size,
                name="latent",
                activation=ModelUtils.swish,
                kernel_initializer=tf.initializers.variance_scaling(1.0),
                reuse=True,
            )

        self.bisim_action = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="bisim_action"
        )
        # self.bisim_action_encoder = self._create_action_encoder(
        #     self.bisim_action,
        #     self.h_size,
        #     self.action_feature_size,
        #     action_layers,
        #     reuse=True,
        # )
        combined_input = tf.concat([self.bisim_encoder, self.bisim_action], axis=1)
        combined_input = tf.stop_gradient(combined_input)

        with tf.variable_scope("predict"):
            hidden = combined_input
            for i in range(forward_layers):
                hidden = tf.layers.dense(
                    hidden,
                    self.h_size,
                    name="hidden_{}".format(i),
                    reuse=True,
                    activation=ModelUtils.swish,
                    # kernel_initializer=tf.initializers.variance_scaling(1.0),
                )
            self.bisim_predict_distribution = GaussianEncoderDistribution(
                hidden, self.feature_size, reuse=True
            )
            # Sample from the bisimulation branch's own distribution.
            self.bisim_predict = self.bisim_predict_distribution.sample()

        with tf.variable_scope("reward"):
            hidden = combined_input
            for i in range(forward_layers):
                hidden = tf.layers.dense(
                    hidden,
                    self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                    name="hidden_{}".format(i),
                    reuse=True,
                    activation=ModelUtils.swish,
                    # kernel_initializer=tf.initializers.variance_scaling(1.0),
                )
            self.bisim_pred_reward = tf.layers.dense(
                hidden,
                1,
                name="reward",
                reuse=True,
                # activation=ModelUtils.swish,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),
            )

    def create_next_inputs(self):
        self.visual_next = ModelUtils.create_visual_input_placeholders(
            self.brain.camera_resolutions
        )
        self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
        if self.normalize:
            vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
            self.vn_update_normalization_op = vn_normalization_tensors.update_op
            self.vn_normalization_steps = vn_normalization_tensors.steps
            self.vn_running_mean = vn_normalization_tensors.running_mean
            self.vn_running_variance = vn_normalization_tensors.running_variance
            self.processed_vector_next = ModelUtils.normalize_vector_obs(
                self.vector_next,
                self.vn_running_mean,
                self.vn_running_variance,
                self.vn_normalization_steps,
            )
        else:
            self.processed_vector_next = self.vector_next
            self.vp_update_normalization_op = None
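

# ---------------------------------------------------------------------------
# Minimal NumPy sketch (illustrative only, not used by the trainer): the
# closed-form quantities mirrored by GaussianEncoderDistribution above.
# For diagonal Gaussians N(mu, diag(sigma^2)):
#   KL(N(mu, sigma) || N(0, I)) = 0.5 * sum(mu^2 + sigma^2 - 2*log(sigma) - 1)
#   W2(N(mu1, s1), N(mu2, s2))  = sqrt(||mu1 - mu2||^2 + ||s1 - s2||^2)
# The helper names below are illustrative and do not exist elsewhere in the
# code base.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    def _np_kl_standard(mu: np.ndarray, sigma: np.ndarray) -> np.ndarray:
        # Same formula as GaussianEncoderDistribution.kl_standard, per batch row.
        return 0.5 * np.sum(
            np.square(mu) + np.square(sigma) - 2 * np.log(sigma) - 1, axis=1
        )

    def _np_w_distance(
        mu1: np.ndarray, sigma1: np.ndarray, mu2: np.ndarray, sigma2: np.ndarray
    ) -> np.ndarray:
        # Same formula as GaussianEncoderDistribution.w_distance, per batch row.
        return np.sqrt(
            np.sum(np.square(mu1 - mu2), axis=1)
            + np.sum(np.square(sigma1 - sigma2), axis=1)
        )

    rng = np.random.RandomState(0)
    mu_a, mu_b = rng.randn(4, 16), rng.randn(4, 16)
    sig_a, sig_b = np.exp(rng.randn(4, 16)), np.exp(rng.randn(4, 16))
    print("KL to standard normal:", _np_kl_standard(mu_a, sig_a))
    print("W2 distance:", _np_w_distance(mu_a, sig_a, mu_b, sig_b))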