
Fix reporting of group rewards, CLI print of group

/develop/coma2/fixgroup
Ervin Teng, 4 years ago
Commit a9fb37aa
3 files changed, 18 insertions(+), 13 deletions(-)
1. ml-agents/mlagents/trainers/coma/trainer.py (23 changes)
2. ml-agents/mlagents/trainers/stats.py (6 changes)
3. ml-agents/mlagents/trainers/trajectory.py (2 changes)

ml-agents/mlagents/trainers/coma/trainer.py (23 changes)


from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer

    agent_buffer_trajectory,
    trajectory.next_obs,
    trajectory.next_group_obs,
-   trajectory.teammate_dones_reached
+   trajectory.all_group_dones_reached
    and trajectory.done_reached
    and not trajectory.interrupted,
)
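The renamed flag is combined with done_reached and interrupted to form the "true terminal" signal handed to the value estimator, which decides whether the next-state value may be bootstrapped. A minimal sketch of that gating under the usual convention that true terminals bootstrap from zero (the helper below is illustrative, not part of the ml-agents API):

def bootstrap_target(value_next: float, all_group_done: bool, done: bool, interrupted: bool) -> float:
    # A trajectory only counts as a true terminal when this agent and all of
    # its group are done, and the episode was not cut short (interrupted).
    if all_group_done and done and not interrupted:
        return 0.0  # no future value to bootstrap from
    return value_next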

# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
    self._update_end_episode_stats(agent_id, self.optimizer)
+   # Remove dead agents from group reward recording
+   self.collected_group_rewards.pop(agent_id)
+
+# If the whole team is done, average the remaining group rewards.
+if trajectory.all_group_dones_reached:
+   self.stats_reporter.add_stat(
+       "Environment/Group Cumulative Reward",
+       self.collected_group_rewards.get(agent_id, 0),
+       aggregation=StatsAggregationMethod.HISTOGRAM,
+   )
+   self.collected_group_rewards.pop(agent_id)
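For context, collected_group_rewards is a per-agent running sum of the group reward, and the pop calls above clear an agent's entry once its episode ends so finished agents do not leak into later reports. A minimal sketch of that bookkeeping, inferred from the usage here (the accumulation site is not part of this diff and record_group_reward is a hypothetical name):

from collections import defaultdict
from typing import DefaultDict

# Assumed shape: one running group-reward total per agent id.
collected_group_rewards: DefaultDict[str, float] = defaultdict(float)

def record_group_reward(agent_id: str, group_reward_sum: float) -> None:
    # Called once per processed trajectory with that trajectory's summed
    # group reward; totals persist across trajectories until the agent is done.
    collected_group_rewards[agent_id] += group_reward_sum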
def _is_ready_update(self):
    """
    Returns whether or not the trainer has enough elements to run update model
    :return: A boolean corresponding to whether or not update_model() can be run
    """
    return self.update_buffer.num_experiences > self.hyperparameters.buffer_size

def get_policy(self, name_behavior_id: str) -> TorchPolicy:
    return self.policy
-def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
-    super()._update_end_episode_stats(agent_id, optimizer)
-    self.stats_reporter.add_stat(
-        "Environment/Team Cumulative Reward",
-        self.collected_group_rewards.get(agent_id, 0),
-        aggregation=StatsAggregationMethod.HISTOGRAM,
-    )
-    self.collected_group_rewards.pop(agent_id)
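Both the removed per-agent "Team Cumulative Reward" path and the new team-level "Group Cumulative Reward" path report through StatsReporter.add_stat with histogram aggregation, so the writer receives each episode's raw value rather than a pre-averaged number. A standalone usage sketch; the add_stat call and aggregation keyword appear in this diff, but treat the import paths and category name as assumptions:

from mlagents.trainers.stats import StatsReporter
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod

reporter = StatsReporter("SomeBehavior")
# Each call records one episode's group reward; HISTOGRAM keeps the raw
# values so downstream writers can show distributions, not just means.
reporter.add_stat(
    "Environment/Group Cumulative Reward",
    1.5,
    aggregation=StatsAggregationMethod.HISTOGRAM,
)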
def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
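The lambda_return signature above appears as unchanged context, and its body is not shown in this diff. The TD(λ) return it names is G_t = r_t + γ((1 − λ)·V(s_{t+1}) + λ·G_{t+1}), with the final step bootstrapped from value_next. A sketch consistent with that signature, assuming r and value_estimates are numpy arrays of per-step rewards and value estimates:

import numpy as np

def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Backward recursion for TD(lambda) returns over one trajectory.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns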

ml-agents/mlagents/trainers/stats.py (6 changes)


log_info.append(f"Rank: {self.rank}")
log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}")
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
if "Environment/Group Cumulative Reward" in values:
group_stats_summary = values["Environment/Group Cumulative Reward"]
log_info.append(f"Mean Group Reward: {group_stats_summary.mean:0.3f}")
else:
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
log_info.append(is_training)
if self.self_play and "Self-play/ELO" in values:
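The effect on the console line: when a group cumulative reward stat exists (multi-agent runs), its mean is printed in place of the reward standard deviation, while single-agent runs keep the old output. A self-contained sketch of the branch, with a stand-in for ml-agents' StatsSummary (assumed here to expose .mean and .std):

from typing import Dict, List, NamedTuple

class Summary(NamedTuple):  # stand-in for mlagents.trainers.stats.StatsSummary
    mean: float
    std: float

def reward_fields(values: Dict[str, Summary], reward: Summary) -> List[str]:
    fields = [f"Mean Reward: {reward.mean:0.3f}"]
    if "Environment/Group Cumulative Reward" in values:
        group = values["Environment/Group Cumulative Reward"]
        fields.append(f"Mean Group Reward: {group.mean:0.3f}")
    else:
        fields.append(f"Std of Reward: {reward.std:0.3f}")
    return fields

print(". ".join(reward_fields({}, Summary(0.123, 0.045))))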

ml-agents/mlagents/trainers/trajectory.py (2 changes)


    return self.steps[-1].done

@property
-def teammate_dones_reached(self) -> bool:
+def all_group_dones_reached(self) -> bool:
    """
    Returns true if all teammates are done at the end of the trajectory.
    Combine with done_reached to check if the whole team is done.
    """
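The hunk ends inside the docstring, so the property body is not shown. A plausible minimal body, assuming each step stores its teammates' statuses in a group_status list whose entries carry a boolean done flag (these names are inferred, not confirmed by this diff):

from dataclasses import dataclass
from typing import List

@dataclass
class AgentStatusSketch:  # stand-in for ml-agents' AgentStatus
    done: bool

@dataclass
class StepSketch:  # stand-in for AgentExperience
    group_status: List[AgentStatusSketch]

@dataclass
class TrajectorySketch:
    steps: List[StepSketch]

    @property
    def all_group_dones_reached(self) -> bool:
        # True when every teammate recorded on the final step has finished.
        return all(status.done for status in self.steps[-1].group_status)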
