from typing import List, NamedTuple

import numpy as np

from mlagents.trainers.buffer import AgentBuffer


class SplitObservations(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]
|
    @staticmethod
    def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
        """
        Partitions a list of observation arrays by rank: rank-1 arrays are
        concatenated into a single vector observation, and rank-3 arrays
        (height x width x channels) are collected as visual observations.
        :param obs: List of numpy observation arrays for a single step.
        :returns: A SplitObservations tuple.
        """
        vis_obs_indices = []
        vec_obs_indices = []
        for index, observation in enumerate(obs):
            if len(observation.shape) == 1:
                vec_obs_indices.append(index)
            if len(observation.shape) == 3:
                vis_obs_indices.append(index)
        vec_obs = (
            np.concatenate([obs[i] for i in vec_obs_indices], axis=0)
            if len(vec_obs_indices) > 0
            else np.array([], dtype=np.float32)
        )
        vis_obs = [obs[i] for i in vis_obs_indices]
        return SplitObservations(
            vector_observations=vec_obs, visual_observations=vis_obs
        )
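
# Example (illustrative, not from the original source): a 6-dim vector
# observation and an 84x84 RGB frame are split by rank; the rank-1 array is
# concatenated into vector_observations, the rank-3 array is collected in
# visual_observations.
#
#     split = SplitObservations.from_observations(
#         [np.zeros(6, dtype=np.float32), np.zeros((84, 84, 3), dtype=np.float32)]
#     )
#     split.vector_observations.shape  # (6,)
#     len(split.visual_observations)   # 1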


# AgentExperience (a single step's worth of agent data) is assumed to be
# defined earlier in this module.
class Trajectory(NamedTuple):
    steps: List[AgentExperience]
    next_obs: List[np.ndarray]  # observation following the final step; see to_agentbuffer
    agent_id: str
|
|
|
    def to_agentbuffer(self) -> AgentBuffer:
        """
        Converts a Trajectory to an AgentBuffer.
        :returns: AgentBuffer with one entry per step. Note that the "next"
        observation of the final step cannot come from within the trajectory,
        so it is populated from next_obs.
        """
        agent_buffer_trajectory = AgentBuffer()
        for step, exp in enumerate(self.steps):
            vec_vis_obs = SplitObservations.from_observations(exp.obs)
            if step < len(self.steps) - 1:
                next_vec_vis_obs = SplitObservations.from_observations(
                    self.steps[step + 1].obs
                )
            else:
                next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)

            for i, _ in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["visual_obs%d" % i].append(
                    vec_vis_obs.visual_observations[i]
                )
                agent_buffer_trajectory["next_visual_obs%d" % i].append(
                    next_vec_vis_obs.visual_observations[i]
                )
            agent_buffer_trajectory["vector_obs"].append(
                vec_vis_obs.vector_observations
            )
            agent_buffer_trajectory["next_vector_in"].append(
                next_vec_vis_obs.vector_observations
            )
            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            agent_buffer_trajectory["masks"].append(1.0)
            agent_buffer_trajectory["done"].append(exp.done)
            # Add the outputs of the last eval
            if exp.action_pre is not None:
                agent_buffer_trajectory["actions_pre"].append(exp.action_pre)
            agent_buffer_trajectory["actions"].append(exp.action)
            agent_buffer_trajectory["action_probs"].append(exp.action_probs)

            # Store action masks if necessary. Eventually these will be
            # None for continuous actions
            if exp.action_mask is not None:
                agent_buffer_trajectory["action_mask"].append(
                    exp.action_mask, padding_value=1
                )

            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
            agent_buffer_trajectory["environment_rewards"].append(exp.reward)
        return agent_buffer_trajectory
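

# Illustrative smoke test, not part of the original module. AgentExperience's
# constructor is defined elsewhere, so a SimpleNamespace stands in for it here;
# to_agentbuffer only reads attributes, so any object with the right fields works.
if __name__ == "__main__":
    from types import SimpleNamespace

    def _fake_experience() -> SimpleNamespace:
        # Vector-only observation; action_pre, action_mask, and memory are None,
        # so the optional branches in to_agentbuffer are skipped.
        return SimpleNamespace(
            obs=[np.zeros(6, dtype=np.float32)],
            reward=1.0,
            done=False,
            action=np.zeros(2, dtype=np.float32),
            action_probs=np.zeros(2, dtype=np.float32),
            action_pre=None,
            action_mask=None,
            prev_action=np.zeros(2, dtype=np.float32),
            memory=None,
        )

    traj = Trajectory(
        steps=[_fake_experience() for _ in range(3)],
        next_obs=[np.zeros(6, dtype=np.float32)],
        agent_id="agent-0",
    )
    buffer = traj.to_agentbuffer()
    # One entry per step; "next_vector_in" holds the step+1 observations.
    assert len(buffer["vector_obs"]) == 3
    assert len(buffer["next_vector_in"]) == 3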