
Add back next_obs

Branch: develop-newnormalization
Ervin Teng, 5 years ago
Commit 77ff4822
6 files changed, 17 insertions(+), 13 deletions(-)
  1. ml-agents/mlagents/trainers/agent_processor.py (4 changes)
  2. ml-agents/mlagents/trainers/ppo/policy.py (8 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (2 changes)
  5. ml-agents/mlagents/trainers/trainer_controller.py (4 changes)
  6. ml-agents/mlagents/trainers/trajectory.py (10 changes)

ml-agents/mlagents/trainers/agent_processor.py (4 changes)


         if self.policy.use_vec_obs:
             next_obs.append(next_info.vector_observations[next_idx])
         trajectory = Trajectory(
-            steps=self.experience_buffers[agent_id], agent_id=agent_id
+            steps=self.experience_buffers[agent_id],
+            agent_id=agent_id,
+            next_obs=next_obs,
         )
         # This will eventually be replaced with a queue
         self.trainer.process_trajectory(trajectory)
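For orientation, the pattern this hunk implements can be sketched in isolation: the processor gathers the observation that follows the final step and hands it to the Trajectory along with the buffered experiences. The AgentExperience/Trajectory definitions and the finish_trajectory helper below are simplified stand-ins for illustration, not the actual mlagents classes.

from typing import List, NamedTuple
import numpy as np

class AgentExperience(NamedTuple):  # simplified stand-in for mlagents' AgentExperience
    obs: List[np.ndarray]
    reward: float
    done: bool

class Trajectory(NamedTuple):  # simplified stand-in for mlagents' Trajectory
    steps: List[AgentExperience]
    next_obs: List[np.ndarray]  # observation following the last step, for bootstrapping
    agent_id: str

def finish_trajectory(
    steps: List[AgentExperience],
    agent_id: str,
    next_visual_obs: List[np.ndarray],
    next_vector_obs: np.ndarray,
    use_vec_obs: bool = True,
) -> Trajectory:
    # Mirrors the hunk: visual observations go in first, then (if the policy
    # uses one) the vector observation, and the list rides on the Trajectory.
    next_obs: List[np.ndarray] = list(next_visual_obs)
    if use_vec_obs:
        next_obs.append(next_vector_obs)
    return Trajectory(steps=steps, next_obs=next_obs, agent_id=agent_id)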

ml-agents/mlagents/trainers/ppo/policy.py (8 changes)


 import logging
 import numpy as np
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, List
 from mlagents.tf_utils import tf
 
-from mlagents.trainers.trajectory import split_obs, AgentExperience
+from mlagents.trainers.trajectory import split_obs
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.tf_policy import TFPolicy
 from mlagents.trainers.buffer import AgentBuffer

         return value_estimates
 
     def get_value_estimates(
-        self, experience: AgentExperience, agent_id: str, done: bool
+        self, next_obs: List[np.ndarray], agent_id: str, done: bool
     ) -> Dict[str, float]:
         """
         Generates value estimates for bootstrapping.

             self.model.batch_size: 1,
             self.model.sequence_length: 1,
         }
-        vec_vis_obs = split_obs(experience.obs)
+        vec_vis_obs = split_obs(next_obs)
         for i in range(len(vec_vis_obs.visual_observations)):
             feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
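With the new signature, callers pass the raw observation list rather than an AgentExperience, and split_obs separates it into vector and visual parts before the feed dict is filled. The sketch below shows one plausible way such a splitter could behave; it is an assumption for illustration, not the real mlagents.trainers.trajectory.split_obs.

from typing import List, NamedTuple
import numpy as np

class SplitObservations(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]

def split_obs(obs: List[np.ndarray]) -> SplitObservations:
    # Assumption: visual observations are multi-dimensional (H x W x C),
    # while 1-D arrays belong to the concatenated vector observation.
    vis = [o for o in obs if o.ndim > 1]
    vec = [o for o in obs if o.ndim == 1]
    return SplitObservations(
        vector_observations=np.concatenate(vec) if vec else np.zeros(0, dtype=np.float32),
        visual_observations=vis,
    )

# Example: one 84x84x3 visual observation plus an 8-dim vector observation.
example = split_obs([np.zeros((84, 84, 3)), np.zeros(8)])
assert example.vector_observations.shape == (8,)
assert len(example.visual_observations) == 1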

ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)


             agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
         value_next = self.policy.get_value_estimates(
-            trajectory.steps[-1],
+            trajectory.next_obs,
             agent_id,
             trajectory.steps[-1].done and not trajectory.steps[-1].max_step,
         )
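Here value_next is the value estimate of the observation that follows the trajectory; unless the episode genuinely terminated, it stands in for all rewards beyond the cut point. The generic bootstrapped-return recursion it feeds into looks roughly like the sketch below (not the trainer's actual GAE code).

import numpy as np

def bootstrapped_returns(rewards: np.ndarray, value_next: float, gamma: float = 0.99) -> np.ndarray:
    # Work backwards from the end of the trajectory, seeding the recursion
    # with the value estimate of the observation after the last step.
    returns = np.zeros_like(rewards, dtype=np.float32)
    running = value_next
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: three steps of reward 1.0, bootstrapped with value_next = 0.5.
print(bootstrapped_returns(np.array([1.0, 1.0, 1.0]), value_next=0.5))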

ml-agents/mlagents/trainers/sac/trainer.py (2 changes)


         last_step = trajectory.steps[-1]
         agent_id = trajectory.agent_id  # All the agents should have the same ID
-        # Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
-        # for bootstrapping purposes.
         agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
         # Update the normalization

ml-agents/mlagents/trainers/trainer_controller.py (4 changes)


             processor=AgentProcessor(
                 trainer,
                 trainer.policy,
-                trainer.parameters["time_horizon"] + 1,
-                # Note: for a trajectory to be useful for bootstrapping, we need the next state.
-                # Hence we have the processor give us 1 more than the time horizon.
+                trainer.parameters["time_horizon"],
             )
             self.managers[name] = agent_manager
         last_brain_names = external_brains
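The removed comment explains the old + 1: the final collected experience used to double as the bootstrap state, so the processor had to run one step past the horizon. With next_obs carried on the Trajectory, the configured horizon can be used directly. A small illustration of the intent, with 64 as an arbitrary example value:

time_horizon = 64  # example value; in the codebase this comes from trainer.parameters["time_horizon"]

# Before: trajectories carried one extra experience so that steps[-1]
# could serve as the bootstrap state.
old_max_trajectory_length = time_horizon + 1

# After: the bootstrap observation lives in trajectory.next_obs, so the
# processor can cut trajectories at exactly the configured horizon.
new_max_trajectory_length = time_horizon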

ml-agents/mlagents/trainers/trajectory.py (10 changes)


 class Trajectory(NamedTuple):
     steps: List[AgentExperience]
+    next_obs: List[
+        np.ndarray
+    ]  # Observation following the trajectory, for bootstrapping
     agent_id: str
 
     step of the trajectory.
     """
     agent_buffer_trajectory = AgentBuffer()
-    for step, exp in enumerate(trajectory.steps[:-1]):
-        next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
+    for step, exp in enumerate(trajectory.steps):
+        if step < len(trajectory.steps) - 1:
+            next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
+        else:
+            next_vec_vis_obs = split_obs(trajectory.next_obs)
         for i, _ in enumerate(vec_vis_obs.visual_observations):
             agent_buffer_trajectory["visual_obs%d" % i].append(
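The heart of the trajectory_to_agentbuffer change is the branch choosing where each step's next observation comes from: every step but the last looks at the following step, and the last step uses the trajectory-level next_obs, so no experience has to be dropped to enable bootstrapping. Isolated below as a small hypothetical helper (next_obs_for_step is not part of the codebase), with simplified stand-in types:

from typing import List, NamedTuple
import numpy as np

class AgentExperience(NamedTuple):  # simplified stand-in
    obs: List[np.ndarray]

class Trajectory(NamedTuple):  # simplified stand-in
    steps: List[AgentExperience]
    next_obs: List[np.ndarray]
    agent_id: str

def next_obs_for_step(trajectory: Trajectory, step: int) -> List[np.ndarray]:
    # All steps except the last take their "next" observation from the
    # following step; the last step falls back to trajectory.next_obs.
    if step < len(trajectory.steps) - 1:
        return trajectory.steps[step + 1].obs
    return trajectory.next_obs

# Example: a two-step trajectory over a 3-dim vector observation.
traj = Trajectory(
    steps=[AgentExperience(obs=[np.zeros(3)]), AgentExperience(obs=[np.ones(3)])],
    next_obs=[np.full(3, 2.0)],
    agent_id="agent_0",
)
assert next_obs_for_step(traj, 0)[0][0] == 1.0  # next obs of step 0 is step 1's obs
assert next_obs_for_step(traj, 1)[0][0] == 2.0  # last step uses trajectory.next_obs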
