use sensor types to differentiate obs (#4749)

4 年前 · 22658a40
--- a/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
+++ b/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
-        vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, 1:]]
-        goals = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, :1]]
+        vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
+        goals = [ModelUtils.list_to_tensor(batch["goals"])]
        if self.policy.use_vis_obs:
            visual_obs = []
            for idx, _ in enumerate(

        memory = torch.zeros([1, 1, self.policy.m_size])

-        vec_vis_obs = SplitObservations.from_observations(next_obs)
+        vec_vis_obs = SplitObservations.from_observations(
+            next_obs, self.policy.behavior_spec
+        )
-            ModelUtils.list_to_tensor(vec_vis_obs.vector_observations[1:]).unsqueeze(0)
+            ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
        ]
        next_vis_obs = [
            ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
        # goals don't change but otherwise broken
-        next_goals = [torch.as_tensor(vec_vis_obs.vector_observations[:1])]
+        next_goals = [torch.as_tensor(vec_vis_obs.goals)]
        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            vector_obs, visual_obs, goals, memory, sequence_length=batch.num_experiences
        )
--- a/ml-agents/mlagents/trainers/policy/policy.py
+++ b/ml-agents/mlagents/trainers/policy/policy.py
 from typing import Dict, List, Optional
 import numpy as np

-from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
+from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps, SensorType
 from mlagents_envs.exception import UnityException

 from mlagents.trainers.action_info import ActionInfo
            else [self.behavior_spec.action_spec.continuous_size]
        )
        self.vec_obs_size = sum(
-            shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
+            shape[0]
+            for shape, obs_type in zip(
+                behavior_spec.observation_shapes, behavior_spec.sensor_types
+            )
+            if len(shape) == 1 and obs_type == SensorType.OBSERVATION
        )
        self.vis_obs_size = sum(
            1 for shape in behavior_spec.observation_shapes if len(shape) == 3
--- a/ml-agents/mlagents/trainers/policy/torch_policy.py
+++ b/ml-agents/mlagents/trainers/policy/torch_policy.py
    def _split_decision_step(
        self, decision_requests: DecisionSteps
    ) -> Tuple[SplitObservations, np.ndarray]:
-        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
+        vec_vis_obs = SplitObservations.from_observations(
+            decision_requests.obs, self.behavior_spec
+        )
        mask = None
        if self.behavior_spec.action_spec.discrete_size > 0:
            mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
        If this policy normalizes vector observations, this will update the norm values in the graph.
        :param vector_obs: The vector observations to add to the running estimate of the distribution.
        """
-        vector_obs = [torch.as_tensor(vector_obs)[:, 1:]]
+        vector_obs = [torch.as_tensor(vector_obs)]
        if self.use_vec_obs and self.normalize:
            self.actor_critic.update_normalization(vector_obs)

        :return: Outputs from network as defined by self.inference_dict.
        """
        vec_vis_obs, masks = self._split_decision_step(decision_requests)
-        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations[:, 1:])]
+        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
-        goals = [torch.as_tensor(vec_vis_obs.vector_observations[:, :1])]
+        goals = [torch.as_tensor(vec_vis_obs.goals)]
        memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
            0
        )
--- a/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
+++ b/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
            )
            returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])

-        vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, 1:]]
-        goals = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, :1]]
+        vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
+        goals = [ModelUtils.list_to_tensor(batch["goals"])]
        act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
        actions = AgentAction.from_dict(batch)

--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
        super()._process_trajectory(trajectory)
        agent_id = trajectory.agent_id  # All the agents should have the same ID

-        agent_buffer_trajectory = trajectory.to_agentbuffer()
+        agent_buffer_trajectory = trajectory.to_agentbuffer(self.policy.behavior_spec)
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
        last_step = trajectory.steps[-1]
        agent_id = trajectory.agent_id  # All the agents should have the same ID

-        agent_buffer_trajectory = trajectory.to_agentbuffer()
+        agent_buffer_trajectory = trajectory.to_agentbuffer(self.policy.behavior_spec)

        # Update the normalization
        if self.is_training:
        # Bootstrap using the last step rather than the bootstrap step if max step is reached.
        # Set last element to duplicate obs and remove dones.
        if last_step.interrupted:
-            vec_vis_obs = SplitObservations.from_observations(last_step.obs)
+            vec_vis_obs = SplitObservations.from_observations(
+                last_step.obs, self.policy.behavior_spec
+            )
            for i, obs in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
            if vec_vis_obs.vector_observations.size > 1:
--- a/ml-agents/mlagents/trainers/torch/model_serialization.py
+++ b/ml-agents/mlagents/trainers/torch/model_serialization.py
            for shape in self.policy.behavior_spec.observation_shapes
            if len(shape) == 3
        ]
+        dummy_goals = [torch.zeros(batch_dim + [1])]
        dummy_masks = torch.ones(
            batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
        )

-        self.dummy_input = (dummy_vec_obs, dummy_vis_obs, dummy_masks, dummy_memories)
+        self.dummy_input = (
+            dummy_vec_obs,
+            dummy_vis_obs,
+            dummy_goals,
+            dummy_masks,
+            dummy_memories,
+        )
+            + ["goals"]
            + [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)]
            + ["action_masks", "memories"]
        )
--- a/ml-agents/mlagents/trainers/torch/networks.py
+++ b/ml-agents/mlagents/trainers/torch/networks.py
        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
-        goal: List[torch.tensor],
+        goals: List[torch.tensor],
        actions: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        )
-        output = self.value_heads(encoding, goal)
+        output = self.value_heads(encoding, goals)
        return output, memories


        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
-        goal: List[torch.Tensor],
+        goals: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
+        goals: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
    ) -> Tuple[Union[int, torch.Tensor], ...]:
        At this moment, torch.onnx.export() doesn't accept None as tensor to be exported,
        so the size of return tuple varies with action spec.
        """
-        vec_inputs = [vec_inputs[0][:, 1:]]
-        goal = [vec_inputs[0][:, :1]]
        encoding, memories_out = self.network_body(
            vec_inputs, vis_inputs, memories=memories, sequence_length=1
        )
            disc_action_out,
            action_out_deprecated,
-        ) = self.action_model.get_action_out(encoding, masks, goal)
+        ) = self.action_model.get_action_out(encoding, masks, goals)
        export_out = [
            self.version_number,
            torch.Tensor([self.network_body.memory_size]),
        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
-        goal: List[torch.Tensor],
+        goals: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
+        goals: List[torch.Tensor],
        actions: AgentAction,
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
            vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
        )
        log_probs, entropies = self.action_model.evaluate(
-            encoding, masks, actions, goal
+            encoding, masks, actions, goals
        )
        value_outputs = self.value_heads(encoding)
        return log_probs, entropies, value_outputs
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
+        goals: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        encoding, memories = self.network_body(
            vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
        )
-        action, log_probs, entropies = self.action_model(encoding, masks, goal)
+        action, log_probs, entropies = self.action_model(encoding, masks, goals)
        value_outputs = self.value_heads(encoding)
        return action, log_probs, entropies, value_outputs, memories

        self,
        vec_inputs: List[torch.Tensor],
        vis_inputs: List[torch.Tensor],
-        goal: List[torch.Tensor],
+        goals: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        value_outputs, critic_mem_out = self.critic(
            vec_inputs,
            vis_inputs,
-            goal,
+            goals,
            memories=critic_mem,
            sequence_length=sequence_length,
        )
--- a/ml-agents/mlagents/trainers/trajectory.py
+++ b/ml-agents/mlagents/trainers/trajectory.py
 import numpy as np

 from mlagents.trainers.buffer import AgentBuffer
-from mlagents_envs.base_env import ActionTuple
+from mlagents_envs.base_env import ActionTuple, BehaviorSpec, SensorType
 from mlagents.trainers.torch.action_log_probs import LogProbsTuple


 class SplitObservations(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]
+    goals: np.ndarray
-    def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
+    def from_observations(obs: List[np.ndarray], behavior_spec) -> "SplitObservations":
        """
        Divides a List of numpy arrays into a SplitObservations NamedTuple.
        This allows you to access the vector and visual observations directly,
        """
        vis_obs_list: List[np.ndarray] = []
        vec_obs_list: List[np.ndarray] = []
+        goal_list: List[np.ndarray] = []
-        for observation in obs:
-            # Obs could be batched or single
-            if len(observation.shape) == 1 or len(observation.shape) == 2:
-                vec_obs_list.append(observation)
-            if len(observation.shape) == 3 or len(observation.shape) == 4:
-                vis_obs_list.append(observation)
-            last_obs = observation
+        for observation, sensor_type in zip(obs, behavior_spec.sensor_types):
+            if sensor_type == SensorType.PARAMETERIZATION:
+                goal_list.append(observation)
+            elif sensor_type == SensorType.OBSERVATION:
+                # Obs could be batched or single
+                if len(observation.shape) == 1 or len(observation.shape) == 2:
+                    vec_obs_list.append(observation)
+                if len(observation.shape) == 3 or len(observation.shape) == 4:
+                    vis_obs_list.append(observation)
+                last_obs = observation
        if last_obs is not None:
            is_batched = len(last_obs.shape) == 2 or len(last_obs.shape) == 4
            if is_batched:
                    else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
                )
+                goals = (
+                    np.concatenate(goal_list, axis=1)
+                    if len(goal_list) > 0
+                    else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
+                )
+
            else:
                vec_obs = (
                    np.concatenate(vec_obs_list, axis=0)
+                goals = (
+                    np.concatenate(goal_list, axis=0)
+                    if len(goal_list) > 0
+                    else np.array([], dtype=np.float32)
+                )
+
-            vector_observations=vec_obs, visual_observations=vis_obs_list
+            vector_observations=vec_obs, visual_observations=vis_obs_list, goals=goals
        )


    agent_id: str
    behavior_id: str

-    def to_agentbuffer(self) -> AgentBuffer:
+    def to_agentbuffer(self, behavior_spec: BehaviorSpec) -> AgentBuffer:
        """
        Converts a Trajectory to an AgentBuffer
        :param trajectory: A Trajectory
        """
        agent_buffer_trajectory = AgentBuffer()
-        vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
+        vec_vis_obs = SplitObservations.from_observations(
+            self.steps[0].obs, behavior_spec
+        )
-                    self.steps[step + 1].obs
+                    self.steps[step + 1].obs, behavior_spec
-                next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
+                next_vec_vis_obs = SplitObservations.from_observations(
+                    self.next_obs, behavior_spec
+                )

            for i, _ in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["visual_obs%d" % i].append(
            agent_buffer_trajectory["next_vector_in"].append(
                next_vec_vis_obs.vector_observations
            )
+            agent_buffer_trajectory["goals"].append(vec_vis_obs.goals)
+            # this shouldn't be necessary in an optimized implementation since the goal does not change
+            agent_buffer_trajectory["next_goals"].append(next_vec_vis_obs.goals)

            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)