    BootstrapExperience,
)
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.tf_policy import TFPolicy

-    AgentProcessor contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
+    AgentProcessor contains a dictionary of per-agent trajectory buffers. The buffers are indexed by agent_id.
    One AgentProcessor should be created per agent group.

-    def __init__(self, trainer: Trainer):
+    def __init__(self, trainer: Trainer, policy: TFPolicy, time_horizon: int):
""" |
|
|
|
Create an AgentProcessor. |
|
|
|
:param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory |
|
|
|
when it is finished. |
|
|
|
:param policy: Policy instance associated with this AgentProcessor. |
|
|
|
:param time_horizon: Maximum length of a trajectory before it is added to the trainer. |
|
|
|
""" |
|
|
|
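        # Per-agent bookkeeping: experience_buffers collects each agent's steps
        # (indexed by agent_id) until a trajectory is cut, while the last_* dicts
        # cache the previous step's BrainInfo and raw action outputs.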
        self.experience_buffers: Dict[str, List] = defaultdict(list)
        self.last_brain_info: Dict[str, BrainInfo] = {}
        self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = defaultdict(
            ActionInfoOutputs
        )
        # Note: this is needed until we switch to AgentExperiences as the data input type.
        # We still need some info from the policy (memories, previous actions)
        # that really should be gathered by the env-manager.
-        self.policy = trainer.policy
+        self.policy = policy
-        self.time_horizon: int = trainer.parameters["time_horizon"]
+        self.time_horizon = time_horizon
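        # Keep a reference to the trainer so finished trajectories can be handed
        # off to it via process_trajectory().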
        self.trainer = trainer

    def __str__(self):
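
                # Package this agent's buffered steps into a Trajectory, hand it to
                # the trainer, then start a fresh buffer for the agent.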
                trajectory = Trajectory(
                    steps=self.experience_buffers[agent_id],
                    bootstrap_step=bootstrap_step,
                )
                # This will eventually be replaced with a queue
                self.trainer.process_trajectory(trajectory)
                self.experience_buffers[agent_id] = []
            elif not next_info.local_done[next_idx]: