
Remove BootstrapExperience

Branch: /develop-newnormalization
Ervin Teng, 5 years ago
Commit 97d66e71
9 changed files with 36 additions and 43 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (23 changes)
  2. ml-agents/mlagents/trainers/ppo/policy.py (6 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (5 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (4 changes)
  5. ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)
  6. ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)
  7. ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)
  8. ml-agents/mlagents/trainers/trainer_controller.py (4 changes)
  9. ml-agents/mlagents/trainers/trajectory.py (22 changes)

ml-agents/mlagents/trainers/agent_processor.py (23 changes)


  import numpy as np
  from mlagents.trainers.trainer import Trainer
- from mlagents.trainers.trajectory import (
-     Trajectory,
-     AgentExperience,
-     BootstrapExperience,
- )
+ from mlagents.trainers.trajectory import Trajectory, AgentExperience
  from mlagents.envs.brain import BrainInfo
  from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.envs.action_info import ActionInfoOutputs

      One AgentProcessor should be created per agent group.
      """
-     def __init__(self, trainer: Trainer, policy: TFPolicy, time_horizon: int):
+     def __init__(self, trainer: Trainer, policy: TFPolicy, max_trajectory_length: int):
-         :param time_horizon: Maximum length of a trajectory before it is added to the trainer.
+         :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
          """
          self.experience_buffers: Dict[str, List] = defaultdict(list)
          self.last_brain_info: Dict[str, BrainInfo] = {}

          # that really should be gathered by the env-manager.
          self.policy = policy
          self.episode_steps: Dict[str, int] = {}
-         self.time_horizon = time_horizon
+         self.max_trajectory_length = max_trajectory_length
          self.trainer = trainer

      def __str__(self):

          if (
              next_info.local_done[next_idx]
-             or len(self.experience_buffers[agent_id]) >= self.time_horizon
+             or len(self.experience_buffers[agent_id])
+             >= self.max_trajectory_length
          ) and len(self.experience_buffers[agent_id]) > 0:
              # Make next AgentExperience
              next_obs = []

              next_obs.append(next_info.vector_observations[next_idx])
-             bootstrap_step = BootstrapExperience(
-                 obs=next_obs, agent_id=agent_id
-             )
-             trajectory = Trajectory(
-                 steps=self.experience_buffers[agent_id],
-                 bootstrap_step=bootstrap_step,
-             )
+             trajectory = Trajectory(steps=self.experience_buffers[agent_id])
              # This will eventually be replaced with a queue
              self.trainer.process_trajectory(trajectory)
              self.experience_buffers[agent_id] = []

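To see the shape of this change outside the ml-agents codebase, here is a minimal, self-contained sketch of the cutting logic the new AgentProcessor keeps: a per-agent experience buffer is flushed into a Trajectory as soon as the episode ends or the buffer reaches max_trajectory_length, with no separate bootstrap step attached. SimpleExperience, SimpleTrajectory, and SimpleProcessor are stand-ins invented for this example, not classes from the repository.

from collections import defaultdict
from typing import Dict, List, NamedTuple


class SimpleExperience(NamedTuple):
    obs: float
    done: bool


class SimpleTrajectory(NamedTuple):
    steps: List[SimpleExperience]


class SimpleProcessor:
    def __init__(self, max_trajectory_length: int):
        self.max_trajectory_length = max_trajectory_length
        self.experience_buffers: Dict[str, List[SimpleExperience]] = defaultdict(list)
        self.trajectories: List[SimpleTrajectory] = []

    def add_experience(self, agent_id: str, exp: SimpleExperience) -> None:
        # Buffer the new experience, then cut a trajectory when the episode
        # ends or the buffer reaches max_trajectory_length.
        buf = self.experience_buffers[agent_id]
        buf.append(exp)
        if (exp.done or len(buf) >= self.max_trajectory_length) and len(buf) > 0:
            self.trajectories.append(SimpleTrajectory(steps=list(buf)))
            self.experience_buffers[agent_id] = []


processor = SimpleProcessor(max_trajectory_length=3)
for i in range(7):
    processor.add_experience("agent_0", SimpleExperience(obs=float(i), done=(i == 6)))
print([len(t.steps) for t in processor.trajectories])  # -> [3, 3, 1]
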
ml-agents/mlagents/trainers/ppo/policy.py (6 changes)


  from mlagents.envs.timers import timed
  from mlagents.envs.brain import BrainParameters
  from mlagents.trainers.models import EncoderType, LearningRateSchedule
- from mlagents.trainers.trajectory import BootstrapExperience, split_obs
+ from mlagents.trainers.trajectory import split_obs, AgentExperience
  from mlagents.trainers.ppo.models import PPOModel
  from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.trainers.buffer import AgentBuffer

          return value_estimates

      def get_value_estimates(
-         self, experience: BootstrapExperience, done: bool
+         self, experience: AgentExperience, done: bool
-         :param experience: BootstrapExperience to be used for bootstrapping.
+         :param experience: AgentExperience to be used for bootstrapping.
          :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
          :return: The value estimate dictionary with key being the name of the reward signal and the value the
              corresponding value estimate.

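The signature change above means the bootstrap value is now computed from an ordinary AgentExperience rather than a dedicated BootstrapExperience. Below is a hedged sketch of that convention, with a toy value function standing in for the policy network; toy_value_estimate and the "extrinsic" reward-signal name are made up for illustration and are not the library's API.

from typing import Dict, List


def toy_value_estimate(obs: List[float]) -> Dict[str, float]:
    # Stand-in for a forward pass of the value heads: one entry per reward signal.
    return {"extrinsic": 0.1 * sum(obs)}


def get_value_estimates(obs: List[float], done: bool) -> Dict[str, float]:
    value_estimates = toy_value_estimate(obs)
    if done:
        # The episode truly ended (not a max_step interruption), so there is
        # nothing to bootstrap from and the estimate is forced to zero.
        value_estimates = {name: 0.0 for name in value_estimates}
    return value_estimates


print(get_value_estimates([1.0, 3.0], done=False))  # toy estimate from the observation
print(get_value_estimates([1.0, 3.0], done=True))   # every reward signal bootstraps with 0.0
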
ml-agents/mlagents/trainers/ppo/trainer.py (5 changes)


          agent_id = trajectory.steps[
              -1
          ].agent_id  # All the agents should have the same ID
+         # Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
+         # for bootstrapping purposes.
          agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
          # Update the normalization
          if self.is_training:

              agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
          value_next = self.policy.get_value_estimates(
-             trajectory.bootstrap_step,
+             trajectory.steps[-1],
              trajectory.steps[-1].done and not trajectory.steps[-1].max_step,
          )

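The value_next returned here feeds generalized advantage estimation. As a reference for what that bootstrap value does, below is a minimal GAE sketch (scalar rewards, a single reward signal, hypothetical numbers); it is not the trainer's implementation, only an illustration of where value_next enters the recursion and why it should be zero when the episode genuinely terminated rather than being cut off at max_step.

from typing import List


def gae_advantages(
    rewards: List[float],
    value_estimates: List[float],
    value_next: float,
    gamma: float = 0.99,
    lambd: float = 0.95,
) -> List[float]:
    # Standard GAE recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
    # A_t = delta_t + gamma * lambd * A_{t+1}, with V(s_T) = value_next.
    values = value_estimates + [value_next]
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lambd * gae
        advantages[t] = gae
    return advantages


rewards = [0.0, 0.0, 1.0]
value_estimates = [0.5, 0.6, 0.7]
# Interrupted at max_step: bootstrap from the last observation's value estimate.
print(gae_advantages(rewards, value_estimates, value_next=0.8))
# Truly terminal episode: the bootstrap value is zero instead.
print(gae_advantages(rewards, value_estimates, value_next=0.0))
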
ml-agents/mlagents/trainers/sac/trainer.py (4 changes)


"""
last_step = trajectory.steps[-1]
agent_id = last_step.agent_id # All the agents should have the same ID
# Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
# for bootstrapping purposes.
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

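update_normalization consumes the vector observations of the freshly converted buffer. The real normalizer lives inside the TensorFlow graph; the NumPy sketch below only illustrates the kind of streaming mean/variance update this implies, using a Welford-style accumulator (RunningNormalizer is a name invented for this example).

import numpy as np


class RunningNormalizer:
    """Welford-style running mean/variance over vector observations."""

    def __init__(self, size: int):
        self.steps = 0
        self.mean = np.zeros(size, dtype=np.float64)
        self.m2 = np.zeros(size, dtype=np.float64)  # sum of squared deviations

    def update(self, vector_obs: np.ndarray) -> None:
        # vector_obs has shape (batch, size); fold each row into the running stats.
        for obs in vector_obs:
            self.steps += 1
            delta = obs - self.mean
            self.mean += delta / self.steps
            self.m2 += delta * (obs - self.mean)

    def normalize(self, obs: np.ndarray) -> np.ndarray:
        variance = self.m2 / max(self.steps - 1, 1)
        return (obs - self.mean) / np.sqrt(variance + 1e-8)


normalizer = RunningNormalizer(size=2)
normalizer.update(np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]))
print(normalizer.normalize(np.array([2.0, 3.0])))  # roughly zero: it sits at the running mean
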
ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)


  def test_agentprocessor(num_vis_obs):
      policy = create_mock_policy()
      trainer = mock.Mock()
-     processor = AgentProcessor(trainer, policy, time_horizon=5)
+     processor = AgentProcessor(trainer, policy, max_trajectory_length=5)
      fake_action_outputs = {
          "action": [0.1, 0.1],
          "value_heads": {},

ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)


dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
time_horizon = 15
length=15, max_step_complete=True, vec_obs_size=1, num_vis_obs=0, action_space=2
length=time_horizon + 1,
max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)

# Add a terminal trajectory
trajectory = make_fake_trajectory(
length=15,
length=time_horizon + 1,
max_step_complete=False,
vec_obs_size=1,
num_vis_obs=0,

ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)


  from mlagents.trainers.trajectory import (
      AgentExperience,
-     BootstrapExperience,
      Trajectory,
      split_obs,
      trajectory_to_agentbuffer,

          agent_id=agent_id,
      )
      steps_list.append(last_experience)
-     bootstrap_experience = BootstrapExperience(obs=obs, agent_id=agent_id)
-     return Trajectory(steps=steps_list, bootstrap_step=bootstrap_experience)
+     return Trajectory(steps=steps_list)

  @pytest.mark.parametrize("num_visual_obs", [0, 1, 2])

ml-agents/mlagents/trainers/trainer_controller.py (4 changes)


              processor=AgentProcessor(
                  trainer,
                  trainer.policy,
-                 trainer.parameters["time_horizon"],
+                 trainer.parameters["time_horizon"] + 1,
+                 # Note: for a trajectory to be useful for bootstrapping, we need the next state.
+                 # Hence we have the processor give us 1 more than the time horizon.
              )
          self.managers[name] = agent_manager
      last_brain_names = external_brains

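The "+ 1" here is the other half of the trajectory_to_agentbuffer change: the processor hands the trainer one extra experience whose only job is to supply the next observation. A toy check of the arithmetic, with hypothetical numbers:

# With time_horizon = 64, the processor cuts trajectories of 65 experiences;
# trajectory_to_agentbuffer drops the final experience (it only supplies the
# next observation), leaving exactly time_horizon usable entries per trajectory.
time_horizon = 64
max_trajectory_length = time_horizon + 1           # what the AgentProcessor collects
usable_buffer_entries = max_trajectory_length - 1  # what the trainer actually trains on
assert usable_buffer_entries == time_horizon
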
ml-agents/mlagents/trainers/trajectory.py (22 changes)


      agent_id: str

- class BootstrapExperience(NamedTuple):
-     """
-     A partial AgentExperience needed to bootstrap GAE.
-     """
-     obs: List[np.ndarray]
-     agent_id: str

  class SplitObservations(NamedTuple):
      vector_observations: np.ndarray
      visual_observations: List[np.ndarray]

      steps: List[AgentExperience]
-     bootstrap_step: BootstrapExperience  # The next step after the trajectory. Used for GAE.

  class AgentProcessorException(UnityException):

      """
      Converts a Trajectory to an AgentBuffer
      :param trajectory: A Trajectory
-     :returns: AgentBuffer
+     :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
+     less than the trajectory, as the next observation need to be populated from the last
+     step of the trajectory.
-     for step, exp in enumerate(trajectory.steps):
+     for step, exp in enumerate(trajectory.steps[:-1]):
-         if step < len(trajectory.steps) - 1:
-             next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
-         else:
-             next_vec_vis_obs = split_obs(trajectory.bootstrap_step.obs)
+         next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
          for i, _ in enumerate(vec_vis_obs.visual_observations):
              agent_buffer_trajectory["visual_obs%d" % i].append(
                  vec_vis_obs.visual_observations[i]

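Putting the trajectory.py changes together: the conversion now walks every step except the last and reads each step's next observation from its successor, so the resulting buffer is one entry shorter than the trajectory. The condensed sketch below mirrors that convention with a plain dict-of-lists standing in for AgentBuffer and a stripped-down Exp standing in for AgentExperience (both invented for this example, not the library's classes).

from collections import defaultdict
from typing import Dict, List, NamedTuple

import numpy as np


class Exp(NamedTuple):
    obs: np.ndarray
    reward: float
    done: bool


def to_buffer(steps: List[Exp]) -> Dict[str, List]:
    # Walk every step except the last; the next observation always comes from
    # the following step, so the last step exists only to supply next_obs.
    buffer: Dict[str, List] = defaultdict(list)
    for step, exp in enumerate(steps[:-1]):
        buffer["vector_obs"].append(exp.obs)
        buffer["next_vector_obs"].append(steps[step + 1].obs)
        buffer["rewards"].append(exp.reward)
        buffer["done"].append(exp.done)
    return buffer


steps = [Exp(obs=np.array([float(i)]), reward=0.0, done=(i == 4)) for i in range(5)]
buffer = to_buffer(steps)
print(len(buffer["vector_obs"]))  # 4: one entry fewer than the 5-step trajectory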