
Fix reporting of group rewards, CLI print of group

/develop/coma2/fixgroup
Ervin Teng, 4 years ago
Commit a9fb37aa
3 files changed, 18 insertions(+), 13 deletions(-)
1. ml-agents/mlagents/trainers/coma/trainer.py (23 changes)
2. ml-agents/mlagents/trainers/stats.py (6 changes)
3. ml-agents/mlagents/trainers/trajectory.py (2 changes)

ml-agents/mlagents/trainers/coma/trainer.py (23 changes)


from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer

    agent_buffer_trajectory,
    trajectory.next_obs,
    trajectory.next_group_obs,
-   trajectory.teammate_dones_reached
+   trajectory.all_group_dones_reached
    and trajectory.done_reached
    and not trajectory.interrupted,
)
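The renamed flag is combined with done_reached and interrupted to form the "true terminal" signal handed to the value estimator, which decides whether the next-state value may be bootstrapped. A minimal sketch of that gating under the usual convention that true terminals bootstrap from zero (the helper below is illustrative, not part of the ml-agents API):

def bootstrap_target(value_next: float, all_group_done: bool, done: bool, interrupted: bool) -> float:
    # A trajectory only counts as a true terminal when this agent and all of
    # its group are done, and the episode was not cut short (interrupted).
    if all_group_done and done and not interrupted:
        return 0.0  # no future value to bootstrap from
    return value_next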

# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
    self._update_end_episode_stats(agent_id, self.optimizer)
+   # Remove dead agents from group reward recording
+   self.collected_group_rewards.pop(agent_id)
+
+# If the whole team is done, average the remaining group rewards.
+if trajectory.all_group_dones_reached:
+   self.stats_reporter.add_stat(
+       "Environment/Group Cumulative Reward",
+       self.collected_group_rewards.get(agent_id, 0),
+       aggregation=StatsAggregationMethod.HISTOGRAM,
+   )
+   self.collected_group_rewards.pop(agent_id)
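For context, collected_group_rewards is a per-agent running sum of the group reward, and the pop calls above clear an agent's entry once its episode ends so finished agents do not leak into later reports. A minimal sketch of that bookkeeping, inferred from the usage here (the accumulation site is not part of this diff and record_group_reward is a hypothetical name):

from collections import defaultdict
from typing import DefaultDict

# Assumed shape: one running group-reward total per agent id.
collected_group_rewards: DefaultDict[str, float] = defaultdict(float)

def record_group_reward(agent_id: str, group_reward_sum: float) -> None:
    # Called once per processed trajectory with that trajectory's summed
    # group reward; totals persist across trajectories until the agent is done.
    collected_group_rewards[agent_id] += group_reward_sum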
def _is_ready_update(self):
    """
    Returns whether or not the trainer has enough elements to run update model
    :return: A boolean corresponding to whether or not update_model() can be run
    """
    return self.update_buffer.num_experiences > self.hyperparameters.buffer_size

def get_policy(self, name_behavior_id: str) -> TorchPolicy:
    return self.policy
-def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
-    super()._update_end_episode_stats(agent_id, optimizer)
-    self.stats_reporter.add_stat(
-        "Environment/Team Cumulative Reward",
-        self.collected_group_rewards.get(agent_id, 0),
-        aggregation=StatsAggregationMethod.HISTOGRAM,
-    )
-    self.collected_group_rewards.pop(agent_id)
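Both the removed per-agent "Team Cumulative Reward" path and the new team-level "Group Cumulative Reward" path report through StatsReporter.add_stat with histogram aggregation, so the writer receives each episode's raw value rather than a pre-averaged number. A standalone usage sketch; the add_stat call and aggregation keyword appear in this diff, but treat the import paths and category name as assumptions:

from mlagents.trainers.stats import StatsReporter
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod

reporter = StatsReporter("SomeBehavior")
# Each call records one episode's group reward; HISTOGRAM keeps the raw
# values so downstream writers can show distributions, not just means.
reporter.add_stat(
    "Environment/Group Cumulative Reward",
    1.5,
    aggregation=StatsAggregationMethod.HISTOGRAM,
)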
def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
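The lambda_return signature above appears as unchanged context, and its body is not shown in this diff. The TD(λ) return it names is G_t = r_t + γ((1 − λ)·V(s_{t+1}) + λ·G_{t+1}), with the final step bootstrapped from value_next. A sketch consistent with that signature, assuming r and value_estimates are numpy arrays of per-step rewards and value estimates:

import numpy as np

def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Backward recursion for TD(lambda) returns over one trajectory.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns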

ml-agents/mlagents/trainers/stats.py (6 changes)


log_info.append(f"Rank: {self.rank}")
log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}")
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
if "Environment/Group Cumulative Reward" in values:
group_stats_summary = values["Environment/Group Cumulative Reward"]
log_info.append(f"Mean Group Reward: {group_stats_summary.mean:0.3f}")
else:
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
log_info.append(is_training)
if self.self_play and "Self-play/ELO" in values:
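The effect on the console line: when a group cumulative reward stat exists (multi-agent runs), its mean is printed in place of the reward standard deviation, while single-agent runs keep the old output. A self-contained sketch of the branch, with a stand-in for ml-agents' StatsSummary (assumed here to expose .mean and .std):

from typing import Dict, List, NamedTuple

class Summary(NamedTuple):  # stand-in for mlagents.trainers.stats.StatsSummary
    mean: float
    std: float

def reward_fields(values: Dict[str, Summary], reward: Summary) -> List[str]:
    fields = [f"Mean Reward: {reward.mean:0.3f}"]
    if "Environment/Group Cumulative Reward" in values:
        group = values["Environment/Group Cumulative Reward"]
        fields.append(f"Mean Group Reward: {group.mean:0.3f}")
    else:
        fields.append(f"Std of Reward: {reward.std:0.3f}")
    return fields

print(". ".join(reward_fields({}, Summary(0.123, 0.045))))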

ml-agents/mlagents/trainers/trajectory.py (2 changes)


    return self.steps[-1].done

@property
-def teammate_dones_reached(self) -> bool:
+def all_group_dones_reached(self) -> bool:
    """
    Returns true if all teammates are done at the end of the trajectory.
    Combine with done_reached to check if the whole team is done.
    """
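The hunk ends inside the docstring, so the property body is not shown. A plausible minimal body, assuming each step stores its teammates' statuses in a group_status list whose entries carry a boolean done flag (these names are inferred, not confirmed by this diff):

from dataclasses import dataclass
from typing import List

@dataclass
class AgentStatusSketch:  # stand-in for ml-agents' AgentStatus
    done: bool

@dataclass
class StepSketch:  # stand-in for AgentExperience
    group_status: List[AgentStatusSketch]

@dataclass
class TrajectorySketch:
    steps: List[StepSketch]

    @property
    def all_group_dones_reached(self) -> bool:
        # True when every teammate recorded on the final step has finished.
        return all(status.done for status in self.steps[-1].group_status)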
