import numpy as np
from abc import ABC, abstractmethod
from typing import Dict

from mlagents.torch_utils import torch
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import RewardSignalSettings
from mlagents_envs.base_env import BehaviorSpec


class BaseRewardProvider(ABC):
    def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None:
        self._policy_specs = specs
        self._gamma = settings.gamma
        self._strength = settings.strength
        self._ignore_done = False

    @property
    def gamma(self) -> float:
        """
        The discount factor for the reward signal.
        """
        return self._gamma

    @property
    def strength(self) -> float:
        """
        The strength multiplier of the reward provider.
        """
        return self._strength

    @property
    def name(self) -> str:
        """
        The name of the reward provider. It is used for reporting and
        identification.
        """
        class_name = self.__class__.__name__
        return class_name.replace("RewardProvider", "")

    @property
    def ignore_done(self) -> bool:
        """
        If true, the rewards of the next episode must be used to calculate the
        return of the current episode when the agent is done. This is used to
        mitigate the positive bias in rewards with no natural end.
        """
        return self._ignore_done

    @abstractmethod
    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        """
        Evaluates the reward for the data present in the Dict mini_batch. Use
        this when evaluating a reward function drawn straight from a Buffer.
        :param mini_batch: A Dict of numpy arrays (the format used by our
            Buffer) when drawing from the update buffer.
        :return: a np.ndarray of rewards generated by the reward provider
        """
        raise NotImplementedError(
            "The reward provider's evaluate method has not been implemented"
        )

    @abstractmethod
    def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
        """
        Updates the reward provider on the data present in the Dict mini_batch.
        Use this when updating a reward function drawn straight from a Buffer.
        :param mini_batch: A Dict of numpy arrays (the format used by our
            Buffer) when drawing from the update buffer.
        :return: A dictionary from string to stats values
        """
        raise NotImplementedError(
            "The reward provider's update method has not been implemented"
        )

    def get_modules(self) -> Dict[str, torch.nn.Module]:
        """
        Returns a dictionary of string identifiers to the torch.nn.Modules used
        by the reward provider. This method is used for loading and saving the
        weights of the reward provider.
        """
        return {}
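
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original file: a minimal concrete
# subclass showing how evaluate/update are meant to be filled in. The class
# name "ConstantRewardProvider" and its constant-reward behavior are
# hypothetical; real providers (curiosity, GAIL, etc.) hold torch modules,
# train them in update(), and expose them via get_modules().
# ---------------------------------------------------------------------------


class ConstantRewardProvider(BaseRewardProvider):
    """
    Hypothetical provider that hands every transition the same reward,
    scaled by the configured strength. Its name property resolves to
    "Constant" via BaseRewardProvider.name.
    """

    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        # One reward per transition in the mini batch; num_experiences is the
        # AgentBuffer property giving the number of stored transitions.
        return np.full(mini_batch.num_experiences, self.strength, dtype=np.float32)

    def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
        # Nothing to train, so report an empty stats dictionary.
        return {}


# Hypothetical usage, assuming behavior_spec and settings come from a trainer:
#     provider = ConstantRewardProvider(behavior_spec, settings)
#     rewards = provider.evaluate(mini_batch)  # shape: (num_experiences,)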