# Imports required by the code below. The module paths follow the ML-Agents
# package layout (mlagents.trainers.*); adjust them if your version differs.
from typing import Any, Dict, List

import numpy as np

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory

class GhostTrainer(Trainer): |
|
|
|
""" |
|
|
|
The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism. |
|
|
|
In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is |
|
|
|
"ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps |
|
|
|
a standard RL trainer which trains the learning team and ensures that only the trajectories collected |
|
|
|
by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used |
|
|
|
as the fixed policies when the team is not learning. The GhostTrainer is 1:1 with brain_names as the other |
|
|
|
trainers, and is responsible for one or more teams. Note, a GhostTrainer can have only one team in |
|
|
|
asymmetric games where there is only one team with a particular behavior i.e. Hide and Seek. |
|
|
|
The GhostController manages high level coordination between multiple ghost trainers. The learning team id |
|
|
|
is cycled throughout a training run. |
|
|
|
""" |
|
|
|
|
|
|
|
    def __init__(
        self,
        trainer,
        brain_name,
        controller,
        reward_buff_cap,
        trainer_parameters,
        training,
        run_id,
    ):
        """
        Creates a GhostTrainer. Responsible for collecting experiences and training the
        trainer model via self_play.
        :param trainer: The wrapped trainer of the policy/policies being trained with self_play.
        :param brain_name: The name of the brain associated with the trainer config.
        :param controller: GhostController that coordinates all ghost trainers and calculates ELO.
        :param reward_buff_cap: Max reward history to track in the reward buffer.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param run_id: The identifier of the current run.
        """
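        # What follows is a condensed sketch of the initializer, not the full shipped
        # implementation. The base-class call, the self_play hyperparameter names and
        # defaults, and the internal bookkeeping attributes are assumptions that the
        # rest of this file relies on.
        super().__init__(brain_name, trainer_parameters, training, run_id, reward_buff_cap)
        self.trainer = trainer
        self.controller = controller

        # Self-play hyperparameters (names and defaults assumed).
        self_play_parameters = trainer_parameters.get("self_play", {})
        self.window = self_play_parameters.get("window", 10)
        self.play_against_current_self_ratio = self_play_parameters.get(
            "play_against_current_self_ratio", 0.5
        )
        self.steps_between_save = self_play_parameters.get("save_steps", 20000)
        self.steps_between_swap = self_play_parameters.get("swap_steps", 20000)

        # Per-behavior bookkeeping. Policies are keyed by fully qualified behavior name;
        # the internal queues used to talk to the wrapped trainer are keyed by brain name.
        self.policies: Dict[str, TFPolicy] = {}
        self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}
        self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
        self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}

        # Snapshot bookkeeping: a rolling window of past weights plus the latest
        # ("current") weights of the learning policy.
        self.policy_snapshots: List[Any] = []
        self.current_policy_snapshot: Any = None
        self.snapshot_counter = 0

        # ELO bookkeeping: the learning policy's rating is kept last in policy_elos;
        # current_opponent indexes the snapshot the ghosted team is currently playing.
        self.initial_elo = self_play_parameters.get("initial_elo", 1200.0)
        self.policy_elos: List[float] = [self.initial_elo] * (self.window + 1)
        self.current_opponent = 0

        # Team/step bookkeeping used by advance() and _swap_snapshots().
        self.wrapped_trainer_team = None
        self._learning_team = None
        self.ghost_step = 0
        self.last_swap = 0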
|
|
|
    @property
    def current_elo(self) -> float:
        """
        Gets ELO of current policy which is always last in the list.
        :return: ELO of current policy.
        """
|
|
|
""" |
|
|
|
Changes elo of current policy which is always last in the list |
|
|
|
:param change: Amount to change current elo by |
|
|
|
""" |
|
|
|
""" |
|
|
|
Get elo of current opponent policy |
|
|
|
:return: ELO of current opponent policy |
|
|
|
""" |
|
|
|
""" |
|
|
|
Changes elo of current opponent policy |
|
|
|
:param change: Amount to change current opponent elo by |
|
|
|
""" |
|
|
|
""" |
|
|
|
Determines the final result of an episode and asks the GhostController |
|
|
|
to calculate the ELO change. The GhostController changes the ELO |
|
|
|
of the opponent policy since this may be in a different GhostTrainer |
|
|
|
i.e. in asymmetric games. We assume the last reward determines the winner. |
|
|
|
:param trajectory: Trajectory. |
|
|
|
""" |
|
|
|
if trajectory.done_reached and not trajectory.max_step_reached: |
|
|
|
# Assumption is that final reward is 1/.5/0 for win/draw/loss |
|
|
|
final_reward = trajectory.steps[-1].reward |
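            # The remainder of this handler is a sketch: map the final reward onto a game
            # result in [0, 1] and let the GhostController produce the rating change
            # (compute_elo_rating_changes is the assumed controller interface).
            result = 0.5
            if final_reward > 0:
                result = 1.0
            elif final_reward < 0:
                result = 0.0
            change = self.controller.compute_elo_rating_changes(self.current_elo, result)
            self.change_current_elo(change)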
|
|
|
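    # advance() is only partially recoverable from this file, so the body below is a
    # minimal sketch. It assumes the internal queues created in publish_policy_queue /
    # subscribe_trajectory_queue, the steps_between_swap hyperparameter from __init__,
    # and a GhostController exposing the current learning team via get_learning_team.
    def advance(self) -> None:
        self._learning_team = self.controller.get_learning_team
        # Route trajectories from the learning team's external queues to the internal
        # queue the wrapped trainer subscribes to, scoring episodes for ELO on the way.
        for trajectory_queue in self.trajectory_queues:
            parsed_behavior_id = self._name_to_parsed_behavior_id[trajectory_queue.behavior_id]
            if parsed_behavior_id.team_id != self._learning_team:
                continue
            internal_queue = self._internal_trajectory_queues[parsed_behavior_id.brain_name]
            try:
                trajectory = trajectory_queue.get_nowait()
                self._process_trajectory(trajectory)
                internal_queue.put(trajectory)
            except AgentManagerQueue.Empty:
                pass
        # Step the wrapped trainer on the learning team's data. The shipped trainer also
        # drains the wrapped trainer's policy queue here, refreshes
        # current_policy_snapshot, and saves snapshots every steps_between_save steps.
        self.trainer.advance()
        self.ghost_step += 1
        # Periodically swap the ghosted behaviors onto a (possibly older) snapshot.
        if self.ghost_step - self.last_swap > self.steps_between_swap:
            self._swap_snapshots()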
            self.last_swap = self.ghost_step
|
|
|
|
|
|
|
    def end_episode(self):
        """
        Forwarding call to the wrapped trainer's end_episode.
        """
|
|
|
""" |
|
|
|
Forwarding call to wrapped trainers save_model |
|
|
|
""" |
|
|
|
""" |
|
|
|
Forwarding call to wrapped trainers export_model |
|
|
|
""" |
|
|
|
""" |
|
|
|
Creates policy with the wrapped trainer's create_policy function |
|
|
|
""" |
|
|
|
return self.trainer.create_policy(brain_parameters) |
|
|
|
|
|
|
|
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        """
        Adds policy to trainer. The first policy encountered sets the wrapped
        trainer team. This is to ensure that all agents from the same multi-agent
        team are grouped. All policies associated with this team are added to the
        wrapped trainer to be trained.
        :param name_behavior_id: Behavior ID that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        policy.init_load_weights()
|
|
|
|
|
|
|
    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        """
        Gets policy associated with name_behavior_id.
        :param name_behavior_id: Fully qualified behavior name.
        :return: Policy associated with name_behavior_id.
        """
|
|
|
""" |
|
|
|
Saves a snapshot of the weights of the policy and maintains the policy_snapshots |
|
|
|
according to the window size |
|
|
|
:param policy: The policy to be snapshotted |
|
|
|
""" |
|
|
|
weights = policy.get_weights() |
|
|
|
try: |
|
|
|
self.policy_snapshots[self.snapshot_counter] = weights |
|
|
|
|
|
|
self.snapshot_counter = (self.snapshot_counter + 1) % self.window |
|
|
|
|
|
|
|
    def _swap_snapshots(self) -> None:
        """
        Swaps the appropriate weight to the policy and pushes it to respective policy queues.
        """
|
|
|
|
|
|
|
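        # The loop header is not recoverable here; this sketch walks the environment-facing
        # policy queues registered via publish_policy_queue and looks up each behavior's
        # team through the assumed _name_to_parsed_behavior_id mapping.
        for policy_queue in self.policy_queues:
            parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]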
            # Here is the place for a sampling protocol. If the learning team switches
            # immediately before swapping, this first check ensures that the new learning
            # team gets the current policy snapshot. Otherwise, it redundantly swaps
            # the current policy.
            if parsed_behavior_id.team_id != self._learning_team and np.random.uniform() < (
                1 - self.play_against_current_self_ratio
            ):
                x = np.random.randint(len(self.policy_snapshots))
                snapshot = self.policy_snapshots[x]
            else:
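                # Play against the learning team's own latest weights ("current").
                # current_policy_snapshot is assumed to track those weights.
                snapshot = self.current_policy_snapshot
                x = "current"
            # Load the chosen snapshot into this behavior's fixed policy and push it to
            # the environment. current_opponent feeds get_opponent_elo/change_opponent_elo;
            # -1 points at the learning policy's own rating.
            self.current_opponent = -1 if x == "current" else x
            policy = self.get_policy(policy_queue.behavior_id)
            policy.load_weights(snapshot)
            policy_queue.put(policy)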
|
|
|
|
|
|
|
|
|
|
    def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
        """
        Adds a policy queue for every member of the team to the list of queues to publish to
        when this Trainer makes a policy update. Creates an internal policy queue for the
        wrapped trainer to push to. The GhostTrainer pushes all policies to the env.
        :param policy_queue: Policy queue to publish to.
        """
        super().publish_policy_queue(policy_queue)
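        # For the team the wrapped trainer is training, also create the internal queue it
        # publishes updated policies to; keyed by brain name as assumed in __init__.
        parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
        if parsed_behavior_id.team_id == self.wrapped_trainer_team:
            internal_policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue(
                parsed_behavior_id.brain_name
            )
            self._internal_policy_queues[parsed_behavior_id.brain_name] = internal_policy_queue
            self.trainer.publish_policy_queue(internal_policy_queue)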
|
|
|
|
|
|
    def subscribe_trajectory_queue(
        self, trajectory_queue: AgentManagerQueue[Trajectory]
    ) -> None:
        """
        Adds a trajectory queue for every member of the team to the list of queues for the
        trainer to ingest Trajectories from. Creates an internal trajectory queue to push
        trajectories from the learning team. The wrapped trainer subscribes to this queue.
        :param trajectory_queue: Trajectory queue to subscribe to.
        """
        super().subscribe_trajectory_queue(trajectory_queue)
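        # Mirror publish_policy_queue: for the wrapped trainer's team, create the internal
        # trajectory queue that advance() fills with learning-team trajectories, and have
        # the wrapped trainer subscribe to it.
        parsed_behavior_id = self._name_to_parsed_behavior_id[trajectory_queue.behavior_id]
        if parsed_behavior_id.team_id == self.wrapped_trainer_team:
            internal_trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
                parsed_behavior_id.brain_name
            )
            self._internal_trajectory_queues[
                parsed_behavior_id.brain_name
            ] = internal_trajectory_queue
            self.trainer.subscribe_trajectory_queue(internal_trajectory_queue)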
|
|
|