
Don't block one policy queue

Only put policies when policy is actually updated
/develop/sac-apex
Ervin Teng, 4 years ago
Current commit
f29b17a9
6 files changed, 47 insertions and 30 deletions
1. ml-agents/mlagents/trainers/agent_processor.py (6 changes)
2. ml-agents/mlagents/trainers/env_manager.py (14 changes)
3. ml-agents/mlagents/trainers/ppo/trainer.py (1 change)
4. ml-agents/mlagents/trainers/sac/trainer.py (43 changes)
5. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)
6. ml-agents/mlagents/trainers/trainer/rl_trainer.py (11 changes)

ml-agents/mlagents/trainers/agent_processor.py (6 changes)


 pass
-def __init__(self, behavior_id: str, maxlen: int = 1):
+def __init__(self, behavior_id: str, maxlen: int = 20):
     """
     Initializes an AgentManagerQueue. Note that we can give it a behavior_id so that it can be identified
     separately from an AgentManager.

 self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
     self.behavior_id
 )
+# NOTE: we make policy queues of infinite length to avoid lockups of the trainers.
+# In the environment manager, we make sure to empty the policy queue before continuing to produce steps.
-    self.behavior_id
+    self.behavior_id, maxlen=0
 )
 self.publish_trajectory_queue(self.trajectory_queue)
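
The reason maxlen=0 avoids trainer lockups: Python's standard queue.Queue treats maxsize <= 0 as unbounded, so put() never blocks. A minimal stand-alone sketch of that behavior (a toy class for illustration, assuming AgentManagerQueue wraps queue.Queue; this is not the repository's implementation):

import queue


class SimplePolicyQueue:
    # Toy stand-in for AgentManagerQueue; assumes the real class wraps queue.Queue.
    class Empty(Exception):
        pass

    def __init__(self, behavior_id: str, maxlen: int = 20):
        # queue.Queue treats maxsize <= 0 as unbounded, so put() can never block.
        self.behavior_id = behavior_id
        self._queue: queue.Queue = queue.Queue(maxsize=maxlen)

    def put(self, item) -> None:
        self._queue.put(item)

    def get_nowait(self):
        try:
            return self._queue.get_nowait()
        except queue.Empty:
            raise self.Empty()


# With maxlen=0 the trainer can enqueue policies faster than the environment loop
# drains them without ever blocking; a bounded queue could stall the second put().
pq = SimplePolicyQueue("SomeBehavior", maxlen=0)
pq.put("policy-v1")
pq.put("policy-v2")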

ml-agents/mlagents/trainers/env_manager.py (14 changes)


 if self.first_step_infos is not None:
     self._process_step_infos(self.first_step_infos)
     self.first_step_infos = None
-# Get new policies if found
+# Get new policies if found. Always get the latest policy.
+_policy = None
-    _policy = self.agent_managers[brain_name].policy_queue.get(block=False)
-    self.set_policy(brain_name, _policy)
+    # We make sure to empty the policy queue before continuing to produce steps.
+    # This halts the trainers until the policy queue is empty.
+    while True:
+        _policy = self.agent_managers[brain_name].policy_queue.get(
+            block=False
+        )
     pass
+if _policy is not None:
+    self.set_policy(brain_name, _policy)
 # Step the environment
 new_step_infos = self._step()
 # Add to AgentProcessor
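
The drain loop above relies on get(block=False) raising an Empty exception once the queue is exhausted; looping until that exception keeps only the most recent policy and discards stale ones. A standalone sketch of the same pattern with a plain queue.Queue (illustrative only, not the repository's EnvManager code):

import queue


def drain_latest(policy_queue: queue.Queue):
    # Return the newest item in the queue, or None if it was already empty.
    latest = None
    try:
        # Keep pulling until get() raises Empty; the last successful get() wins,
        # so any stale policies that piled up are simply discarded.
        while True:
            latest = policy_queue.get(block=False)
    except queue.Empty:
        pass
    return latest


q: queue.Queue = queue.Queue()
q.put("policy-v1")
q.put("policy-v2")
assert drain_latest(q) == "policy-v2"  # only the most recent policy is applied
assert drain_latest(q) is None  # an empty queue leaves the current policy in place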

ml-agents/mlagents/trainers/ppo/trainer.py (1 change)


 for stat, val in update_stats.items():
     self._stats_reporter.add_stat(stat, val)
 self._clear_update_buffer()
+return True
 def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
     """

ml-agents/mlagents/trainers/sac/trainer.py (43 changes)


 )
 @timed
-def _update_policy(self) -> None:
+def _update_policy(self) -> bool:
+    :return: Whether or not the policy was updated.
-    self.update_sac_policy()
-    self.update_reward_signals()
+    policy_was_updated = self._update_sac_policy()
+    self._update_reward_signals()
+    return policy_was_updated
 def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
     policy = NNPolicy(

     return policy
-def update_sac_policy(self) -> None:
+def _update_sac_policy(self) -> bool:
+    has_updated = False
     self.cumulative_returns_since_policy_update.clear()
     n_sequences = max(
         int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1

     for stat_name, value in update_stats.items():
         batch_update_stats[stat_name].append(value)
-    self.update_steps += 1
+    self.update_steps += 1
+    for stat, stat_list in batch_update_stats.items():
+        self._stats_reporter.add_stat(stat, np.mean(stat_list))
+    has_updated = True
+    if self.optimizer.bc_module:
+        update_stats = self.optimizer.bc_module.update()
+        for stat, val in update_stats.items():
+            self._stats_reporter.add_stat(stat, val)
     # Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating
     # a large buffer at each update.

     )
+    return has_updated
-    for stat, stat_list in batch_update_stats.items():
-        self._stats_reporter.add_stat(stat, np.mean(stat_list))
-    if self.optimizer.bc_module:
-        update_stats = self.optimizer.bc_module.update()
-        for stat, val in update_stats.items():
-            self._stats_reporter.add_stat(stat, val)
-def update_reward_signals(self) -> None:
+def _update_reward_signals(self) -> None:
     """
     Iterate through the reward signals and update them. Unlike in PPO,
     do it separate from the policy so that it can be done at a different

         int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
     )
     batch_update_stats: Dict[str, list] = defaultdict(list)
-    while self.step / self.reward_signal_update_steps > self.steps_per_update:
+    while (
+        self.step / self.reward_signal_update_steps
+        > self.reward_signal_steps_per_update
+    ):
         # Get minibatches for reward signal update if needed
         reward_signal_minibatches = {}
         for name, signal in self.optimizer.reward_signals.items():

             batch_update_stats[stat_name].append(value)
         self.reward_signal_update_steps += 1
-    for stat, stat_list in batch_update_stats.items():
-        self._stats_reporter.add_stat(stat, np.mean(stat_list))
+    for stat, stat_list in batch_update_stats.items():
+        self._stats_reporter.add_stat(stat, np.mean(stat_list))
 def add_policy(
     self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
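
In both update methods a steps-per-update ratio decides how many update passes to run, and in _update_sac_policy the has_updated flag only flips to True if at least one pass actually runs, which is what later prevents a redundant policy from being queued. A compact, self-contained sketch of that control flow (hypothetical function and parameter names, not the trainer itself):

def run_updates(step: int, update_steps: int, steps_per_update: float) -> bool:
    # Return True only if at least one update pass actually ran.
    has_updated = False
    while step / update_steps > steps_per_update:
        # ...sample a replay-buffer minibatch and update the networks here...
        update_steps += 1
        has_updated = True
    return has_updated


# Plenty of new environment steps since the last update: several passes run.
assert run_updates(step=100, update_steps=5, steps_per_update=10) is True
# Too few new steps: nothing runs, and the caller skips the policy-queue put.
assert run_updates(step=100, update_steps=20, steps_per_update=10) is False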

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)


     return True
 def _update_policy(self):
-    pass
+    return True
 def add_policy(self):
     pass

ml-agents/mlagents/trainers/trainer/rl_trainer.py (11 changes)


 return False
 @abc.abstractmethod
-def _update_policy(self):
+def _update_policy(self) -> bool:
+    :return: Whether or not the policy was updated.
     """
     pass

 if self.should_still_train:
     if self._is_ready_update():
         with hierarchical_timer("_update_policy"):
-            self._update_policy()
-            for q in self.policy_queues:
-                # Get policies that correspond to the policy queue in question
-                q.put(self.get_policy(q.behavior_id))
+            if self._update_policy():
+                for q in self.policy_queues:
+                    # Get policies that correspond to the policy queue in question
+                    q.put(self.get_policy(q.behavior_id))
 else:
     self._clear_update_buffer()
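
Combined with the env-manager change, the effect is that a policy object only crosses the queue when _update_policy() reports that real work happened; otherwise the environment keeps acting with the policy it already has. A small sketch of that gating with stand-in callables (illustrative, not the RLTrainer class):

import queue


def advance_once(update_policy, get_policy, policy_queues) -> None:
    # Sketch of the new gating: publish a policy only after a real update.
    if update_policy():
        for q in policy_queues:
            q.put(get_policy())


policy_queue: queue.Queue = queue.Queue()

# An update happened: the fresh policy is published for the env manager to pick up.
advance_once(lambda: True, lambda: "policy-v2", [policy_queue])
assert policy_queue.get_nowait() == "policy-v2"

# No update: nothing is enqueued, so the env manager keeps its current policy.
advance_once(lambda: False, lambda: "policy-v2", [policy_queue])
assert policy_queue.empty()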