The Unity Machine Learning Agents Toolkit (ML-Agents) is an open-source project that enables games and simulations to serve as environments for training intelligent agents.

import torch
from torch import nn
import numpy as np
import math

EPSILON = 1e-7  # Small value to avoid divide by zero


class GaussianDistInstance(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = mean
        self.std = std

    def sample(self):
        sample = self.mean + torch.randn_like(self.mean) * self.std
        return sample

    def log_prob(self, value):
        var = self.std ** 2
        log_scale = torch.log(self.std + EPSILON)
        return (
            -((value - self.mean) ** 2) / (2 * var + EPSILON)
            - log_scale
            - math.log(math.sqrt(2 * math.pi))
        )

    def pdf(self, value):
        log_prob = self.log_prob(value)
        return torch.exp(log_prob)

    def entropy(self):
        # Differential entropy of a diagonal Gaussian: 0.5 * log(2 * pi * e * sigma^2)
        return 0.5 * torch.log(2 * math.pi * math.e * self.std ** 2 + EPSILON)


class TanhGaussianDistInstance(GaussianDistInstance):
    def __init__(self, mean, std):
        super().__init__(mean, std)
        self.transform = torch.distributions.transforms.TanhTransform(cache_size=1)

    def sample(self):
        unsquashed_sample = super().sample()
        squashed = self.transform(unsquashed_sample)
        return squashed

    def _inverse_tanh(self, value):
        capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
        return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON)

    def log_prob(self, value):
        unsquashed = self.transform.inv(value)
        return super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian(
            unsquashed, value
        )


class CategoricalDistInstance(nn.Module):
    def __init__(self, logits):
        super().__init__()
        self.logits = logits
        self.probs = torch.softmax(self.logits, dim=-1)

    def sample(self):
        return torch.multinomial(self.probs, 1)

    def pdf(self, value):
        return torch.diag(self.probs.T[value.flatten().long()])

    def log_prob(self, value):
        return torch.log(self.pdf(value))

    def all_log_prob(self):
        return torch.log(self.probs)

    def entropy(self):
        # Entropy of a categorical distribution: -sum(p * log(p))
        return -torch.sum(self.probs * torch.log(self.probs + EPSILON), dim=-1)


class GaussianDistribution(nn.Module):
    def __init__(
        self,
        hidden_size,
        num_outputs,
        conditional_sigma=False,
        tanh_squash=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.conditional_sigma = conditional_sigma
        self.mu = nn.Linear(hidden_size, num_outputs)
        self.tanh_squash = tanh_squash
        nn.init.xavier_uniform_(self.mu.weight, gain=0.01)
        if conditional_sigma:
            self.log_sigma = nn.Linear(hidden_size, num_outputs)
            nn.init.xavier_uniform_(self.log_sigma.weight, gain=0.01)
        else:
            self.log_sigma = nn.Parameter(
                torch.zeros(1, num_outputs, requires_grad=True)
            )

    def forward(self, inputs):
        mu = self.mu(inputs)
        if self.conditional_sigma:
            log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
        else:
            log_sigma = self.log_sigma
        if self.tanh_squash:
            return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
        else:
            return [GaussianDistInstance(mu, torch.exp(log_sigma))]


class MultiCategoricalDistribution(nn.Module):
    def __init__(self, hidden_size, act_sizes):
        super().__init__()
        self.act_sizes = act_sizes
        self.branches = self.create_policy_branches(hidden_size)

    def create_policy_branches(self, hidden_size):
        branches = []
        for size in self.act_sizes:
            branch_output_layer = nn.Linear(hidden_size, size)
            nn.init.xavier_uniform_(branch_output_layer.weight, gain=0.01)
            branches.append(branch_output_layer)
        return nn.ModuleList(branches)

    def mask_branch(self, logits, mask):
        raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask
        normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1)
        normalized_logits = torch.log(normalized_probs + EPSILON)
        return normalized_logits

    def split_masks(self, masks):
        split_masks = []
        for idx, _ in enumerate(self.act_sizes):
            start = int(np.sum(self.act_sizes[:idx]))
            end = int(np.sum(self.act_sizes[: idx + 1]))
            split_masks.append(masks[:, start:end])
        return split_masks

    def forward(self, inputs, masks):
        # Todo - Support multiple branches in mask code
        branch_distributions = []
        masks = self.split_masks(masks)
        for idx, branch in enumerate(self.branches):
            logits = branch(inputs)
            norm_logits = self.mask_branch(logits, masks[idx])
            distribution = CategoricalDistInstance(norm_logits)
            branch_distributions.append(distribution)
        return branch_distributions
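

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal, illustrative
# example of how the heads above can be driven. The hidden size, batch size,
# and action branch sizes below are arbitrary assumptions for demonstration.
if __name__ == "__main__":
    batch_size, hidden_size = 4, 32
    hidden = torch.rand(batch_size, hidden_size)  # stand-in for an encoder output

    # Continuous actions: a Gaussian head over 2 action dimensions,
    # squashed into (-1, 1) by the tanh transform.
    gaussian_head = GaussianDistribution(hidden_size, num_outputs=2, tanh_squash=True)
    dist = gaussian_head(hidden)[0]  # forward() returns a list of distribution instances
    actions = dist.sample()
    print(actions.shape, dist.log_prob(actions).shape)

    # Discrete actions: two branches of sizes 3 and 2; an all-ones mask
    # leaves every action available.
    multi_head = MultiCategoricalDistribution(hidden_size, act_sizes=[3, 2])
    masks = torch.ones(batch_size, 3 + 2)
    for branch_dist in multi_head(hidden, masks):
        branch_actions = branch_dist.sample()
        print(branch_actions.shape, branch_dist.log_prob(branch_actions).shape)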