import abc
from typing import NamedTuple, List, Tuple
import numpy as np

from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils

EPSILON = 1e-6  # Small value to avoid divide by zero


class OutputDistribution(abc.ABC):
    @abc.abstractproperty
    def log_probs(self) -> tf.Tensor:
        """
        Returns a Tensor that when evaluated, produces the per-action log probabilities of this distribution.
        The shape of this Tensor should be equivalent to (batch_size x the number of actions) produced in sample.
        """
        pass

    @abc.abstractproperty
    def total_log_probs(self) -> tf.Tensor:
        """
        Returns a Tensor that when evaluated, produces the total log probability for a single sample.
        The shape of this Tensor should be equivalent to (batch_size x 1) produced in sample.
        """
        pass

    @abc.abstractproperty
    def sample(self) -> tf.Tensor:
        """
        Returns a Tensor that when evaluated, produces a sample of this OutputDistribution.
        """
        pass

    @abc.abstractproperty
    def entropy(self) -> tf.Tensor:
        """
        Returns a Tensor that when evaluated, produces the entropy of this distribution.
        """
        pass


class DiscreteOutputDistribution(OutputDistribution):
    @abc.abstractproperty
    def sample_onehot(self) -> tf.Tensor:
        """
        Returns a one-hot version of the output.
        """


class GaussianDistribution(OutputDistribution):
    """
    A Gaussian output distribution for continuous actions.
    """

    class MuSigmaTensors(NamedTuple):
        mu: tf.Tensor
        log_sigma: tf.Tensor
        sigma: tf.Tensor

    def __init__(
        self,
        logits: tf.Tensor,
        act_size: List[int],
        reparameterize: bool = False,
        tanh_squash: bool = False,
        log_sigma_min: float = -20,
        log_sigma_max: float = 2,
    ):
        """
        A Gaussian output distribution for continuous actions.
        :param logits: Hidden layer to use as the input to the Gaussian distribution.
        :param act_size: List containing the number of continuous actions.
        :param reparameterize: Whether or not to use the reparameterization trick (block gradients through
            log probability calculation.)
        :param tanh_squash: Squash the output using tanh, constraining it between -1 and 1.
            From: Haarnoja et. al, https://arxiv.org/abs/1801.01290
        :param log_sigma_min: Minimum log standard deviation to clip by.
        :param log_sigma_max: Maximum log standard deviation to clip by.
        """
        encoded = self._create_mu_log_sigma(
            logits, act_size, log_sigma_min, log_sigma_max
        )
        self._sampled_policy = self._create_sampled_policy(encoded)
        if not reparameterize:
            _sampled_policy_probs = tf.stop_gradient(self._sampled_policy)
        else:
            _sampled_policy_probs = self._sampled_policy
        self._all_probs = self._create_log_probs(_sampled_policy_probs, encoded)
        if tanh_squash:
            self._sampled_policy = tf.tanh(self._sampled_policy)
            self._all_probs = self._do_squash_correction_for_tanh(
                self._all_probs, self._sampled_policy
            )
        self._total_prob = tf.reduce_sum(self._all_probs, axis=1, keepdims=True)
        self._entropy = self._create_entropy(encoded)

    def _create_mu_log_sigma(
        self,
        logits: tf.Tensor,
        act_size: List[int],
        log_sigma_min: float,
        log_sigma_max: float,
    ) -> "GaussianDistribution.MuSigmaTensors":

        mu = tf.layers.dense(
            logits,
            act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        # Policy-dependent log_sigma_sq
        log_sigma = tf.layers.dense(
            logits,
            act_size[0],
            activation=None,
            name="log_std",
            kernel_initializer=ModelUtils.scaled_init(0.01),
        )
        log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
        sigma = tf.exp(log_sigma)
        return self.MuSigmaTensors(mu, log_sigma, sigma)

    def _create_sampled_policy(
        self, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        epsilon = tf.random_normal(tf.shape(encoded.mu))
        sampled_policy = encoded.mu + encoded.sigma * epsilon

        return sampled_policy

    def _create_log_probs(
        self, sampled_policy: tf.Tensor, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        _gauss_pre = -0.5 * (
            ((sampled_policy - encoded.mu) / (encoded.sigma + EPSILON)) ** 2
            + 2 * encoded.log_sigma
            + np.log(2 * np.pi)
        )
        return _gauss_pre

    def _create_entropy(
        self, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
        )
        # Make entropy the right shape
        return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy

    def _do_squash_correction_for_tanh(self, probs, squashed_policy):
        """
        Adjust probabilities for squashed sample before output
        """
        probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
        return probs

    @property
    def total_log_probs(self) -> tf.Tensor:
        return self._total_prob

    @property
    def log_probs(self) -> tf.Tensor:
        return self._all_probs

    @property
    def sample(self) -> tf.Tensor:
        return self._sampled_policy

    @property
    def entropy(self) -> tf.Tensor:
        return self._entropy


class MultiCategoricalDistribution(DiscreteOutputDistribution):
    """
    A categorical distribution for multi-branched discrete actions. Also supports action masking.
    """

    def __init__(self, logits: tf.Tensor, act_size: List[int], action_masks: tf.Tensor):
        """
        A categorical distribution for multi-branched discrete actions.
        :param logits: Hidden layer to use as the input to the Gaussian distribution.
        :param act_size: List containing the number of discrete actions per branch.
        :param action_masks: Tensor representing action masks. Should be of length sum(act_size), and 0 for masked
            and 1 for unmasked.
        """
        unmasked_log_probs = self._create_policy_branches(logits, act_size)
        (
            self._sampled_policy,
            self._all_probs,
            action_index,
        ) = self._get_masked_actions_probs(unmasked_log_probs, act_size, action_masks)
        self._sampled_onehot = self._action_onehot(self._sampled_policy, act_size)
        self._entropy = self._create_entropy(self._all_probs, action_index, act_size)
        self._total_prob = self._get_log_probs(
            self._sampled_onehot, self._all_probs, action_index, act_size
        )

    def _create_policy_branches(
        self, logits: tf.Tensor, act_size: List[int]
    ) -> List[tf.Tensor]:
        policy_branches = []
        for size in act_size:
            policy_branches.append(
                tf.layers.dense(
                    logits,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            )
        return policy_branches

    def _get_masked_actions_probs(
        self,
        unmasked_log_probs: List[tf.Tensor],
        act_size: List[int],
        action_masks: tf.Tensor,
    ) -> Tuple[tf.Tensor, tf.Tensor, np.ndarray]:
        output, _, all_log_probs = ModelUtils.create_discrete_action_masking_layer(
            unmasked_log_probs, action_masks, act_size
        )

        action_idx = [0] + list(np.cumsum(act_size))
        return output, all_log_probs, action_idx

    def _action_onehot(self, sample: tf.Tensor, act_size: List[int]) -> tf.Tensor:
        action_oh = tf.concat(
            [tf.one_hot(sample[:, i], act_size[i]) for i in range(len(act_size))],
            axis=1,
        )
        return action_oh

    def _get_log_probs(
        self,
        sample_onehot: tf.Tensor,
        all_log_probs: tf.Tensor,
        action_idx: List[int],
        act_size: List[int],
    ) -> tf.Tensor:
        log_probs = tf.reduce_sum(
            (
                tf.stack(
                    [
                        -tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=sample_onehot[:, action_idx[i] : action_idx[i + 1]],
                            logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
                        )
                        for i in range(len(act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
        )
        return log_probs

    def _create_entropy(
        self, all_log_probs: tf.Tensor, action_idx: List[int], act_size: List[int]
    ) -> tf.Tensor:
        entropy = tf.reduce_sum(
            (
                tf.stack(
                    [
                        tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=tf.nn.softmax(
                                all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                            ),
                            logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
                        )
                        for i in range(len(act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
        )

        return entropy

    @property
    def log_probs(self) -> tf.Tensor:
        return self._all_probs

    @property
    def total_log_probs(self) -> tf.Tensor:
        return self._total_prob

    @property
    def sample(self) -> tf.Tensor:
        return self._sampled_policy

    @property
    def sample_onehot(self) -> tf.Tensor:
        return self._sampled_onehot

    @property
    def entropy(self) -> tf.Tensor:
        return self._entropy