
[coma2] Make group extrinsic reward part of extrinsic (#5033)

* Make group extrinsic part of extrinsic

* Fix test and init

* Fix tests and bug

* Add baseline loss to TensorBoard
Branch: develop/action-slice
GitHub · 3 years ago
Current commit: ba2af269
10 files changed, 122 additions and 50 deletions
  1. config/ppo/PushBlockCollab.yaml (4 lines changed)
  2. ml-agents/mlagents/trainers/coma/optimizer_torch.py (31 lines changed)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (10 lines changed)
  4. ml-agents/mlagents/trainers/settings.py (12 lines changed)
  5. ml-agents/mlagents/trainers/tests/torch/test_coma.py (24 lines changed)
  6. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py (27 lines changed)
  7. ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py (3 lines changed)
  8. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (33 lines changed)
  9. ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py (4 lines changed)
  10. ml-agents/mlagents/trainers/torch/components/reward_providers/group_extrinsic_reward_provider.py (24 lines changed)

config/ppo/PushBlockCollab.yaml (4 lines changed)

  num_layers: 2
  vis_encode_type: simple
  reward_signals:
-   group:
+   extrinsic:
- max_steps: 20000000 #2000000
+ max_steps: 20000000
  time_horizon: 64
  summary_freq: 60000
  threaded: true
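Once this file is loaded, the `extrinsic` entry above is carried by the new ExtrinsicSettings class added in this commit. A minimal sketch of the equivalent in-code mapping (the gamma/strength values are illustrative and not taken from this diff):

from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType

# Illustrative values only; the YAML excerpt above does not show gamma/strength.
reward_signals = {
    RewardSignalType.EXTRINSIC: ExtrinsicSettings(gamma=0.99, strength=1.0)
}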

ml-agents/mlagents/trainers/coma/optimizer_torch.py (31 lines changed)

  from mlagents_envs.base_env import ObservationSpec, ActionSpec
  from mlagents.trainers.policy.torch_policy import TorchPolicy
  from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
- from mlagents.trainers.settings import TrainerSettings, PPOSettings
+ from mlagents.trainers.settings import (
+     ExtrinsicSettings,
+     RewardSignalSettings,
+     RewardSignalType,
+     TrainerSettings,
+     PPOSettings,
+ )
  from mlagents.trainers.torch.networks import Critic, MultiInputNetworkBody
  from mlagents.trainers.torch.decoders import ValueHeads
  from mlagents.trainers.torch.agent_action import AgentAction

  from mlagents.trainers.settings import NetworkSettings
+ from mlagents_envs.logging_util import get_logger
+ logger = get_logger(__name__)
  class TorchCOMAOptimizer(TorchOptimizer):

          self.stream_names = list(self.reward_signals.keys())
          self.value_memory_dict: Dict[str, torch.Tensor] = {}
          self.baseline_memory_dict: Dict[str, torch.Tensor] = {}
+     def create_reward_signals(
+         self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
+     ) -> None:
+         """
+         Create reward signals. Override default to provide warnings for Curiosity and
+         GAIL, and make sure Extrinsic adds team rewards.
+         :param reward_signal_configs: Reward signal config.
+         """
+         for reward_signal, settings in reward_signal_configs.items():
+             if reward_signal != RewardSignalType.EXTRINSIC:
+                 logger.warning(
+                     f"Reward Signal {reward_signal.value} is not supported with the COMA2 trainer; \
+ results may be unexpected."
+                 )
+             elif isinstance(settings, ExtrinsicSettings):
+                 settings.add_groupmate_rewards = True
+         super().create_reward_signals(reward_signal_configs)
      @property
      def critic(self):

            # TODO: After PyTorch is default, change to something more correct.
            "Losses/Policy Loss": torch.abs(policy_loss).item(),
            "Losses/Value Loss": value_loss.item(),
+           "Losses/Baseline Loss": baseline_loss.item(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (10 lines changed)

  from mlagents.trainers.policy.torch_policy import TorchPolicy
  from mlagents.trainers.optimizer import Optimizer
- from mlagents.trainers.settings import TrainerSettings
+ from mlagents.trainers.settings import (
+     TrainerSettings,
+     RewardSignalSettings,
+     RewardSignalType,
+ )
  from mlagents.trainers.torch.utils import ModelUtils

      def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
          pass
-     def create_reward_signals(self, reward_signal_configs):
+     def create_reward_signals(
+         self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
+     ) -> None:
          """
          Create reward signals
          :param reward_signal_configs: Reward signal config.

ml-agents/mlagents/trainers/settings.py (12 lines changed)

  # INTRINSIC REWARD SIGNALS #############################################################
  class RewardSignalType(Enum):
      EXTRINSIC: str = "extrinsic"
-     GROUP_EXTRINSIC: str = "group"
      GAIL: str = "gail"
      CURIOSITY: str = "curiosity"
      RND: str = "rnd"

-         RewardSignalType.EXTRINSIC: RewardSignalSettings,
-         RewardSignalType.GROUP_EXTRINSIC: RewardSignalSettings,
+         RewardSignalType.EXTRINSIC: ExtrinsicSettings,
          RewardSignalType.GAIL: GAILSettings,
          RewardSignalType.CURIOSITY: CuriositySettings,
          RewardSignalType.RND: RNDSettings,

              "encoding_size"
          ]
          return d_final
+ @attr.s(auto_attribs=True)
+ class ExtrinsicSettings(RewardSignalSettings):
+     # For use with COMA2. Add groupmate rewards to the final extrinsic reward.
+     add_groupmate_rewards: bool = False
  @attr.s(auto_attribs=True)

      network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
      reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
-         factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
+         factory=lambda: {RewardSignalType.EXTRINSIC: ExtrinsicSettings()}
      )
      init_path: Optional[str] = None
      keep_checkpoints: int = 5
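A minimal sketch of what the changed default factory implies, assuming the attrs defaults shown above (the assertions are illustrative, not part of the diff):

from mlagents.trainers.settings import (
    ExtrinsicSettings,
    RewardSignalType,
    TrainerSettings,
)

settings = TrainerSettings()
extrinsic = settings.reward_signals[RewardSignalType.EXTRINSIC]
assert isinstance(extrinsic, ExtrinsicSettings)
# Off by default; the COMA2 optimizer flips this on in create_reward_signals.
assert extrinsic.add_groupmate_rewards is False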

ml-agents/mlagents/trainers/tests/torch/test_coma.py (24 lines changed)

  import attr
  from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
- from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
+ from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
  from mlagents.trainers.policy.torch_policy import TorchPolicy
  from mlagents.trainers.tests import mock_brain as mb

      trainer_settings = attr.evolve(dummy_config)
      trainer_settings.reward_signals = {
-         RewardSignalType.GROUP_EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
+         RewardSignalType.EXTRINSIC: ExtrinsicSettings(
+             strength=1.0, gamma=0.99, add_groupmate_rewards=True
+         )
      }
      trainer_settings.network_settings.memory = (

          max_step_complete=True,
          num_other_agents_in_group=NUM_AGENTS,
      )
-     value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+     (
+         value_estimates,
+         baseline_estimates,
+         next_value_estimates,
+     ) = optimizer.get_trajectory_and_baseline_value_estimates(
          trajectory.to_agentbuffer(),
          trajectory.next_obs,
          trajectory.next_group_obs,

      # if all_memories is not None:
      #     assert len(all_memories) == 15
-     value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+     (
+         value_estimates,
+         baseline_estimates,
+         next_value_estimates,
+     ) = optimizer.get_trajectory_and_baseline_value_estimates(
          trajectory.to_agentbuffer(),
          trajectory.next_obs,
          trajectory.next_group_obs,

      # Check if we ignore terminal states properly
      optimizer.reward_signals["group"].use_terminal_states = False
-     value_estimates, baseline_estimates, next_value_estimates = optimizer.get_trajectory_and_baseline_value_estimates(
+     (
+         value_estimates,
+         baseline_estimates,
+         next_value_estimates,
+     ) = optimizer.get_trajectory_and_baseline_value_estimates(
          trajectory.to_agentbuffer(),
          trajectory.next_obs,
          trajectory.next_group_obs,

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py (27 lines changed)

+ from mlagents.trainers.buffer import BufferKey
  import numpy as np
- from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
+ from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
  from mlagents.trainers.tests.torch.test_reward_providers.utils import (
      create_agent_buffer,
  )

      ],
  )
  def test_construction(behavior_spec: BehaviorSpec) -> None:
-     settings = RewardSignalSettings()
+     settings = ExtrinsicSettings()
      settings.gamma = 0.2
      extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
      assert extrinsic_rp.gamma == 0.2

      ],
  )
  def test_factory(behavior_spec: BehaviorSpec) -> None:
-     settings = RewardSignalSettings()
+     settings = ExtrinsicSettings()
      extrinsic_rp = create_reward_provider(
          RewardSignalType.EXTRINSIC, behavior_spec, settings
      )

  )
  def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
      buffer = create_agent_buffer(behavior_spec, 1000, reward)
-     settings = RewardSignalSettings()
+     settings = ExtrinsicSettings()
+     # Test group rewards. Rewards should be double of the environment rewards, but shouldn't count
+     # the groupmate rewards.
+     buffer[BufferKey.GROUP_REWARD] = buffer[BufferKey.ENVIRONMENT_REWARDS]
+     # 2 agents with identical rewards
+     buffer[BufferKey.GROUPMATE_REWARDS].set(
+         [np.ones(1, dtype=np.float32) * reward] * 2
+         for _ in range(buffer.num_experiences)
+     )
+     generated_rewards = extrinsic_rp.evaluate(buffer)
+     assert (generated_rewards == 2 * reward).all()
+     # Test groupmate rewards. Total reward should be indiv_reward + 2 * teammate_reward + group_reward
+     settings.add_groupmate_rewards = True
+     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
+     generated_rewards = extrinsic_rp.evaluate(buffer)
+     assert (generated_rewards == 4 * reward).all()
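To spell out the arithmetic behind these assertions: every step carries an environment reward of `reward`, the group reward is set equal to it, and both groupmates also receive `reward`. Without add_groupmate_rewards the provider returns reward + reward = 2 * reward; with the flag enabled it returns reward + 2 * reward + reward = 4 * reward.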

ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py (3 lines changed)

  from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import ( # noqa F401
      ExtrinsicRewardProvider,
  )
- from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import ( # noqa F401
-     GroupExtrinsicRewardProvider,
- )
  from mlagents.trainers.torch.components.reward_providers.curiosity_reward_provider import ( # noqa F401
      CuriosityRewardProvider,
  )

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (33 lines changed)

  from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
      BaseRewardProvider,
  )
+ from mlagents_envs.base_env import BehaviorSpec
+ from mlagents.trainers.settings import ExtrinsicSettings
+     """
+     Evaluates extrinsic reward. For single-agent, this equals the individual reward
+     given to the agent. For the COMA2 algorithm, we want not only the individual reward
+     but also the team and the individual rewards of the other agents.
+     """
+     def __init__(self, specs: BehaviorSpec, settings: ExtrinsicSettings) -> None:
+         super().__init__(specs, settings)
+         self._add_groupmate_rewards = settings.add_groupmate_rewards
-         return np.array(mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32)
+         indiv_rewards = np.array(
+             mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32
+         )
+         total_rewards = indiv_rewards
+         if (
+             BufferKey.GROUPMATE_REWARDS in mini_batch
+             and BufferKey.GROUP_REWARD in mini_batch
+         ):
+             if self._add_groupmate_rewards:
+                 groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
+                 groupmate_rewards_sum = np.array(
+                     [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
+                 )
+                 total_rewards += groupmate_rewards_sum
+             group_rewards = np.array(
+                 mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32
+             )
+             # Add all the group rewards to the individual rewards
+             total_rewards += group_rewards
+         return total_rewards
      def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
          return {}
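As a quick illustration of the composition evaluate now performs (a sketch with made-up numbers, not taken from the diff):

import numpy as np

# Hypothetical per-step values for one agent in a three-agent group.
indiv_reward = np.float32(1.0)                                # the agent's own environment reward
groupmate_rewards = np.array([0.5, 0.25], dtype=np.float32)   # rewards of its two teammates
group_reward = np.float32(2.0)                                # shared group reward

# With add_groupmate_rewards=True the provider returns 1.0 + 0.75 + 2.0 = 3.75;
# with the flag off it returns 1.0 + 2.0 = 3.0 (the group reward is always added when present).
total_with_flag = indiv_reward + groupmate_rewards.sum() + group_reward
total_without_flag = indiv_reward + group_reward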

ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py (4 lines changed)

  from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
      GAILRewardProvider,
  )
- from mlagents.trainers.torch.components.reward_providers.group_extrinsic_reward_provider import (
-     GroupExtrinsicRewardProvider,
- )
  from mlagents.trainers.torch.components.reward_providers.rnd_reward_provider import (
      RNDRewardProvider,
  )

  NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = {
      RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider,
-     RewardSignalType.GROUP_EXTRINSIC: GroupExtrinsicRewardProvider,
      RewardSignalType.CURIOSITY: CuriosityRewardProvider,
      RewardSignalType.GAIL: GAILRewardProvider,
      RewardSignalType.RND: RNDRewardProvider,

ml-agents/mlagents/trainers/torch/components/reward_providers/group_extrinsic_reward_provider.py (24 lines changed)

- import numpy as np
- from typing import Dict
- from mlagents.trainers.buffer import AgentBuffer, BufferKey
- from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
-     BaseRewardProvider,
- )
- class GroupExtrinsicRewardProvider(BaseRewardProvider):
-     def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
-         indiv_rewards = np.array(
-             mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32
-         )
-         groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
-         groupmate_rewards_sum = np.array(
-             [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.ndarray
-         )
-         group_rewards = np.array(mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32)
-         # Add all the group rewards to the individual rewards
-         return indiv_rewards + groupmate_rewards_sum + group_rewards
-     def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
-         return {}