from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import torch

from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents.trainers.policy import Policy
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec, ActionBuffers
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.torch.networks import GlobalSteps
from mlagents.trainers.torch.utils import ModelUtils


class TorchPolicy(Policy):
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
    ):
        super().__init__(seed, behavior_spec, trainer_settings)
        self.global_step = (
            GlobalSteps()
        )  # could be much simpler if TorchPolicy is nn.Module
        self.grads = None
        # Keyed by agent ID; holds ActionBuffers rather than the flat numpy
        # matrix used by the base Policy, so hybrid (continuous + discrete)
        # actions survive the round trip.
        self.previous_action_dict: Dict[str, ActionBuffers] = {}

        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
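        # Illustrative only: with the default trainer configuration, which
        # defines a single "extrinsic" reward signal, this evaluates to
        #   reward_signal_names == ["extrinsic"]
        # since each RewardSignalType key's enum value is its lowercase name.
        # The actor-critic network, with one value stream per reward signal,
        # is then constructed as self.actor_critic (construction not shown).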

    def evaluate_actions(
        self,
        vec_obs: torch.Tensor,
        vis_obs: torch.Tensor,
        actions: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        seq_len: int = 1,
    ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
        """
        Computes log probabilities, entropy, and value estimates for a batch
        of actions, passed as one tensor per action segment.
        """
        ...

    @timed
    def evaluate(
        self, decision_requests: DecisionSteps, global_agent_ids: List[str]
    ) -> Dict[str, Any]:
        # Split the batched observations and fetch recurrent memories; these
        # helpers live on this class and the base Policy.
        vec_vis_obs, masks = self._split_decision_step(decision_requests)
        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
        vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations]
        memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(0)

        run_out = {}
        with torch.no_grad():
            action, log_probs, entropy, value_heads, memories = self.sample_actions(
                vec_obs, vis_obs, masks=masks, memories=memories
            )
        # TODO: make pre_action different from action; they are identical for now.
        run_out["action"] = ModelUtils.to_action_buffers(action, self.action_spec)
        run_out["pre_action"] = ModelUtils.to_action_buffers(action, self.action_spec)
        run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
        run_out["entropy"] = ModelUtils.to_numpy(entropy)
        run_out["value_heads"] = {
            name: ModelUtils.to_numpy(t) for name, t in value_heads.items()
        }
        return run_out
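
    # Illustrative only: for a behavior with a single "extrinsic" reward
    # signal, evaluate()'s run_out is expected to look roughly like
    #   {
    #       "action": ActionBuffers(...),      # batched continuous/discrete arrays
    #       "pre_action": ActionBuffers(...),  # identical until the TODO above lands
    #       "log_probs": np.ndarray,
    #       "entropy": np.ndarray,
    #       "value_heads": {"extrinsic": np.ndarray},
    #   }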

    def get_modules(self):
        return {"Policy": self.actor_critic, "global_step": self.global_step}
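
    # Illustrative only (hypothetical saver wiring): a checkpointing helper
    # can persist the policy by walking this mapping, e.g.
    #   state = {name: mod.state_dict() for name, mod in policy.get_modules().items()}
    # which is why the GlobalSteps counter is exposed alongside the actor-critic.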

    # Overriding for use of ActionBuffers in torch
    def make_empty_previous_action(self, num_agents: int) -> ActionBuffers:
        """
        Creates an empty previous action for use with RNNs and discrete control.
        :param num_agents: Number of agents.
        :return: ActionBuffers of zeros matching this policy's action spec.
        """
        return self.action_spec.create_empty_action(num_agents)
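
    # Illustrative only, assuming an action spec with two continuous dimensions
    # and one discrete branch: make_empty_previous_action(3) should wrap
    # zero-filled arrays of roughly shape (3, 2) (continuous) and (3, 1)
    # (discrete); the exact layout is owned by ActionSpec.create_empty_action.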

    def save_previous_action(
        self, agent_ids: List[str], action_buffers: Optional[ActionBuffers]
    ) -> None:
        if action_buffers is None:
            return
        # Note: the whole batched ActionBuffers is stored under every agent ID;
        # unlike the per-row numpy version in the base Policy, no per-agent
        # slicing happens here.
        for agent_id in agent_ids:
            self.previous_action_dict[agent_id] = action_buffers

    def retrieve_previous_action(self, agent_ids: List[str]) -> ActionBuffers:
        action_buffers = self.action_spec.create_empty_action(len(agent_ids))
        for agent_id in agent_ids:
            # The last matching entry wins and replaces the empty buffers outright.
            if agent_id in self.previous_action_dict:
                action_buffers = self.previous_action_dict[agent_id]
        return action_buffers
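
    # Illustrative only: retrieve_previous_action never raises for unknown
    # agents. With nothing saved it returns the empty (zero) buffers; if any
    # requested agent has a saved entry, that whole batched buffer is returned,
    # per the caveat noted in save_previous_action:
    #   policy.retrieve_previous_action(["a", "b"])  # zeros if neither has acted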

    def remove_previous_action(self, agent_ids: List[str]) -> None:
        for agent_id in agent_ids:
            if agent_id in self.previous_action_dict:
                self.previous_action_dict.pop(agent_id)
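
    # Illustrative lifecycle (hypothetical trainer wiring): after each decision
    # step the trainer would call
    #   policy.save_previous_action(agent_ids, run_out["action"])
    # feed policy.retrieve_previous_action(agent_ids) into the next recurrent
    # encoding, and call policy.remove_previous_action(agent_ids) once those
    # agents' episodes end.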