
Group reward function

/develop/action-slice
Ervin Teng, 4 years ago
Commit c6904f86
5 changed files with 52 additions and 4 deletions
  1. ml-agents/mlagents/trainers/coma/trainer.py (24 changes)
  2. ml-agents/mlagents/trainers/settings.py (1 change)
  3. ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py (3 changes)
  4. ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py (4 changes)
  5. ml-agents/mlagents/trainers/torch/components/reward_providers/group_extrinsic_reward_provider.py (24 changes)

ml-agents/mlagents/trainers/coma/trainer.py (24 changes)


import numpy as np
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer

        )
        self.seed = seed
        self.policy: Policy = None  # type: ignore
        self.collected_rewards["environment_team"] = defaultdict(lambda: 0)
    def _process_trajectory(self, trajectory: Trajectory) -> None:
        """

self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
)
self.collected_rewards["environment_team"][agent_id] += np.sum(
agent_buffer_trajectory[BufferKey.GROUP_REWARD]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = (
reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength

        tmp_returns = []
        for name in self.optimizer.reward_signals:
            local_rewards = agent_buffer_trajectory[
                RewardSignalUtil.rewards_key(name)
            ].get_batch()
            local_rewards = np.array(
                agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
                dtype=np.float32,
            )
            baseline_estimates = agent_buffer_trajectory[
                RewardSignalUtil.baseline_estimates_key(name)
            ].get_batch()

        return self.policy

    def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
        super()._update_end_episode_stats(agent_id, optimizer)
        if "environment_team" in self.collected_rewards:
            self.stats_reporter.add_stat(
                "Environment/Team Cumulative Reward",
                self.collected_rewards["environment_team"].get(agent_id, 0),
                aggregation=StatsAggregationMethod.HISTOGRAM,
            )
def discount_rewards(r, gamma=0.99, value_next=0.0):
    """

    :param lambd: GAE weighing factor.
    :return: list of advantage estimates for time-steps t to T.
    """
    rewards = np.array(rewards)
    returns_b = lambda_return(
        rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
    )
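The hunk above ends with a call to lambda_return, whose body is not part of this diff. As a point of reference, here is a minimal sketch of a TD(λ)-style return under the assumption that the helper blends discounted rewards with bootstrapped baseline estimates; the names mirror the call site, but the body is illustrative, not the repository's implementation.

import numpy as np


def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Illustrative TD(lambda) return (assumed semantics, not the repo's code):
    # each target mixes the next-step lambda return with a one-step
    # bootstrapped estimate, weighted by lambd.
    returns = np.zeros_like(r, dtype=np.float32)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(len(r) - 1)):
        returns[t] = (
            r[t]
            + gamma * lambd * returns[t + 1]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

With lambd=1.0 this collapses to the plain discounted return and with lambd=0.0 to a one-step bootstrapped target, which is the usual bias/variance trade-off the lambd parameter controls.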

ml-agents/mlagents/trainers/settings.py (1 change)


# INTRINSIC REWARD SIGNALS #############################################################
class RewardSignalType(Enum):
    EXTRINSIC: str = "extrinsic"
    GROUP_EXTRINSIC: str = "group"
    GAIL: str = "gail"
    CURIOSITY: str = "curiosity"
    RND: str = "rnd"
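The enum's string values are what select a reward signal by name, so the new group extrinsic signal is addressed as "group". A quick sanity check, assuming the enum is importable from mlagents.trainers.settings as in this diff:

from mlagents.trainers.settings import RewardSignalType

# The string value is the key a trainer configuration would use to enable
# this signal; it round-trips back to the new enum member.
assert RewardSignalType("group") is RewardSignalType.GROUP_EXTRINSIC
assert RewardSignalType.GROUP_EXTRINSIC.value == "group"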

ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py (3 changes)


from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (  # noqa F401
    ExtrinsicRewardProvider,
)
from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import (  # noqa F401
    GroupExtrinsicRewardProvider,
)
from mlagents.trainers.torch.components.reward_providers.curiosity_reward_provider import (  # noqa F401
    CuriosityRewardProvider,
)

ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py (4 changes)


from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
    GAILRewardProvider,
)
from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import (
    GroupExtrinsicRewardProvider,
)
from mlagents.trainers.torch.components.reward_providers.rnd_reward_provider import (
    RNDRewardProvider,
)

NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = {
    RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider,
    RewardSignalType.GROUP_EXTRINSIC: GroupExtrinsicRewardProvider,
    RewardSignalType.CURIOSITY: CuriosityRewardProvider,
    RewardSignalType.GAIL: GAILRewardProvider,
    RewardSignalType.RND: RNDRewardProvider,
}
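For context, a small sketch of how the mapping is consumed: looking up the configured signal type yields the provider class, so the new "group" type now dispatches to GroupExtrinsicRewardProvider. Importing NAME_TO_CLASS from the factory module is assumed here; constructing the provider (behavior spec, signal settings) is left to the factory itself.

from mlagents.trainers.settings import RewardSignalType
from mlagents.trainers.torch.components.reward_providers.reward_provider_factory import (
    NAME_TO_CLASS,
)

# Resolve the configured signal type to its provider class.
provider_cls = NAME_TO_CLASS[RewardSignalType.GROUP_EXTRINSIC]
print(provider_cls.__name__)  # GroupExtrinsicRewardProvider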

ml-agents/mlagents/trainers/torch/components/reward_providers/group_extrinsic_reward_provider.py (24 changes)


import numpy as np
from typing import Dict
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
    BaseRewardProvider,
)


class GroupExtrinsicRewardProvider(BaseRewardProvider):
    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        indiv_rewards = np.array(
            mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32
        )
        # Per time step, sum the rewards earned by this agent's groupmates
        groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
        groupmate_rewards_sum = np.array(
            [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
        )
        group_rewards = np.array(mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32)
        # Add all the group rewards to the individual rewards
        return indiv_rewards + groupmate_rewards_sum + group_rewards

    def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
        return {}
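To make the summation concrete, here is a standalone sketch with hypothetical per-step rewards (no AgentBuffer involved) that reproduces what evaluate computes: each agent is credited with its own reward, the sum of its groupmates' rewards, and the shared group reward at every time step. update returns an empty dict, presumably because an extrinsic signal has no learned parameters to train.

import numpy as np

# Hypothetical 3-step trajectory for one agent with two groupmates
# (toy values, not pulled from an AgentBuffer).
indiv_rewards = np.array([1.0, 0.0, 0.5], dtype=np.float32)
groupmate_rewards_list = [[0.0, 0.5], [1.0, 0.0], [0.0, 0.0]]
group_rewards = np.array([0.0, 0.0, 10.0], dtype=np.float32)

# Same per-step reduction as evaluate(): sum each step's groupmate rewards.
groupmate_rewards_sum = np.array(
    [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
)
print(indiv_rewards + groupmate_rewards_sum + group_rewards)  # -> [1.5, 1.0, 10.5]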