Merge branch 'develop-action-buffer' into develop-hybrid-actions-singleton

4 年前 · 95566e44
--- a/Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta
+++ b/Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta
 guid: 7f11f35191533404c9957443a681aaee
 ScriptedImporter:
  fileIDToRecycleName:
-    11400000: Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo
+    11400002: Assets/ML-Agents/Examples/Pushblock/Demos/ExpertPush.demo
  externalObjects: {}
  userData: ' (Unity.MLAgents.Demonstrations.DemonstrationSummary)'
  assetBundleName: 
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
        )


-class ActionBuffers(NamedTuple):
+class ActionTuple:
-    A NamedTuple whose fields correspond to actions of different types.
-    Continuous and discrete actions are numpy arrays.
+    An object whose fields correspond to actions of different types.
+    Continuous and discrete actions are numpy arrays of type float32 and
+    int32, respectively, and are cast to those types on construction.
+    Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
+    respectively.
-    continuous: np.ndarray  # dims (n_agents, cont_size)
-    discrete: np.ndarray  # dims (n_agents, disc_size)
+    def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
+        if continuous.dtype != np.float32:
+            continuous = continuous.astype(np.float32, copy=False)
+        self._continuous = continuous
+        if discrete.dtype != np.int32:
+            discrete = discrete.astype(np.int32, copy=False)
+        self._discrete = discrete
+
+    @property
+    def continuous(self) -> np.ndarray:
+        return self._continuous
+
+    @property
+    def discrete(self) -> np.ndarray:
+        return self._discrete


 class ActionSpec(NamedTuple):
        """
        return len(self.discrete_branches)

-    def empty_action(self, n_agents: int) -> ActionBuffers:
+    def empty_action(self, n_agents: int) -> ActionTuple:
-        Generates ActionBuffers corresponding to an empty action (all zeros)
+        Generates ActionTuple corresponding to an empty action (all zeros)
-        continuous: np.ndarray = None
-        discrete: np.ndarray = None
-        if self.continuous_size > 0:
-            continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
-
-        if self.discrete_size > 0:
-            discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
-        return ActionBuffers(continuous, discrete)
+        continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
+        discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
+        return ActionTuple(continuous, discrete)
-    def random_action(self, n_agents: int) -> ActionBuffers:
+    def random_action(self, n_agents: int) -> ActionTuple:
-        Generates ActionBuffers corresponding to a random action (either discrete
+        Generates ActionTuple corresponding to a random action (either discrete
-        continuous: np.ndarray = None
-        discrete: np.ndarray = None
-        if self.continuous_size > 0:
-            continuous = np.random.uniform(
-                low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
-            ).astype(np.float32)
-
+        continuous = np.random.uniform(
+            low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
+        )
+        discrete = np.array([])
        if self.discrete_size > 0:
            discrete = np.column_stack(
                [
                    for i in range(self.discrete_size)
                ]
            )
-        return ActionBuffers(continuous, discrete)
+        return ActionTuple(continuous, discrete)
-        self, actions: ActionBuffers, n_agents: int, name: str
-    ) -> ActionBuffers:
+        self, actions: ActionTuple, n_agents: int, name: str
+    ) -> ActionTuple:
-        if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
+        if actions.continuous.shape != _expected_shape:
-            if actions.continuous.dtype != np.float32:
-                actions.continuous = actions.continuous.astype(np.float32)
-
-        if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
+        if actions.discrete.shape != _expected_shape:
-            if actions.discrete.dtype != np.int32:
-                actions.discrete = actions.discrete.astype(np.int32)
        return actions

    @staticmethod
        """

    @abstractmethod
-    def set_actions(self, behavior_name: BehaviorName, action: ActionBuffers) -> None:
+    def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
-        :param action: ActionBuffers tuple of continuous and/or discrete action
+        :param action: ActionTuple of continuous and/or discrete actions.
+        Actions are np.arrays with dimensions (n_agents, continuous_size) and
+        (n_agents, discrete_size), respectively.
-        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionBuffers
+        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
    ) -> None:
        """
        Sets the action for one of the agents in the simulation for the next
-        :param action: ActionBuffers tuple of continuous and/or discrete action
+        :param action: ActionTuple of continuous and/or discrete actions.
+        Actions are np.arrays with dimensions (1, continuous_size) and
+        (1, discrete_size), respectively. Note that the leading dimension is 1
+        because this action is meant for a single agent.
        """

    @abstractmethod
--- a/ml-agents-envs/mlagents_envs/environment.py
+++ b/ml-agents-envs/mlagents_envs/environment.py
    DecisionSteps,
    TerminalSteps,
    BehaviorSpec,
-    ActionBuffers,
+    ActionTuple,
    BehaviorName,
    AgentId,
    BehaviorMapping,

        self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
        self._env_specs: Dict[str, BehaviorSpec] = {}
-        self._env_actions: Dict[str, ActionBuffers] = {}
+        self._env_actions: Dict[str, ActionTuple] = {}
        self._is_first_message = True
        self._update_behavior_specs(aca_output)

                f"agent group in the environment"
            )

-    def set_actions(self, behavior_name: BehaviorName, action: ActionBuffers) -> None:
+    def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
        self._assert_behavior_exists(behavior_name)
        if behavior_name not in self._env_state:
            return
        self._env_actions[behavior_name] = action

    def set_action_for_agent(
-        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionBuffers
+        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
    ) -> None:
        self._assert_behavior_exists(behavior_name)
        if behavior_name not in self._env_state:
                    agent_id
                )
            ) from ie
-        self._env_actions[behavior_name][index] = action
+        if action_spec.continuous_size > 0:
+            self._env_actions[behavior_name].continuous[index] = action.continuous[0]
+        if action_spec.discrete_size > 0:
+            self._env_actions[behavior_name].discrete[index] = action.discrete[0]

    def get_steps(
        self, behavior_name: BehaviorName

    @timed
    def _generate_step_input(
-        self, vector_action: Dict[str, ActionBuffers]
+        self, vector_action: Dict[str, ActionTuple]
    ) -> UnityInputProto:
        rl_in = UnityRLInputProto()
        for b in vector_action:
--- a/ml-agents-envs/mlagents_envs/tests/test_steps.py
+++ b/ml-agents-envs/mlagents_envs/tests/test_steps.py
    specs = ActionSpec.create_continuous(action_len)
    zero_action = specs.empty_action(4).continuous
    assert np.array_equal(zero_action, np.zeros((4, action_len), dtype=np.float32))
+    print(specs.random_action(4))
+    print(random_action)
    assert random_action.dtype == np.float32
    assert random_action.shape == (4, action_len)
    assert np.min(random_action) >= -1
--- a/ml-agents/mlagents/trainers/env_manager.py
+++ b/ml-agents/mlagents/trainers/env_manager.py
    TerminalSteps,
    BehaviorSpec,
    BehaviorName,
-    ActionBuffers,
+    ActionTuple,
 )
 from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats

    @staticmethod
    def action_buffers_from_numpy_dict(
        action_dict: Dict[str, np.ndarray]
-    ) -> ActionBuffers:
-        continuous: np.ndarray = None
-        discrete: np.ndarray = None
+    ) -> ActionTuple:
+        continuous: np.ndarray = np.array([], dtype=np.float32)
+        discrete: np.ndarray = np.array([], dtype=np.int32)
-        return ActionBuffers(continuous, discrete)
+        return ActionTuple(continuous, discrete)
--- a/ml-agents/mlagents/trainers/policy/policy.py
+++ b/ml-agents/mlagents/trainers/policy/policy.py
        :return: Dict of action type to np.ndarray
        """
        act_dict: Dict[str, np.ndarray] = {}
-        action_buffer = self.behavior_spec.action_spec.empty_action(num_agents)
-        if action_buffer.continuous is not None:
-            act_dict["continuous_action"] = action_buffer.continuous
-        if action_buffer.discrete is not None:
-            act_dict["discrete_action"] = action_buffer.discrete
+        action_tuple = self.behavior_spec.action_spec.empty_action(num_agents)
+        if self.behavior_spec.action_spec.continuous_size > 0:
+            act_dict["continuous_action"] = action_tuple.continuous
+        if self.behavior_spec.action_spec.discrete_size > 0:
+            act_dict["discrete_action"] = action_tuple.discrete
        return act_dict

    def save_previous_action(
--- a/ml-agents/mlagents/trainers/tests/simple_test_envs.py
+++ b/ml-agents/mlagents/trainers/tests/simple_test_envs.py

 from mlagents_envs.base_env import (
    ActionSpec,
-    ActionBuffers,
+    ActionTuple,
    BaseEnv,
    BehaviorSpec,
    DecisionSteps,
        )  # to set the goals/positions
        self.action_spec = action_spec
        self.behavior_spec = BehaviorSpec(self._make_obs_spec(), action_spec)
+        self.action_spec = action_spec
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}
    def _take_action(self, name: str) -> bool:
        deltas = []
        _act = self.action[name]
-        if _act.discrete is not None:
+        if self.action_spec.discrete_size > 0:
-        if _act.continuous is not None:
+        if self.action_spec.continuous_size > 0:
            for _cont in _act.continuous[0]:
                deltas.append(_cont)
        for i, _delta in enumerate(deltas):
        for _ in range(self.n_demos):
            for name in self.names:
                if self.discrete:
-                    self.action[name] = ActionBuffers(
-                        [[]], np.array([[1]] if self.goal[name] > 0 else [[0]])
+                    self.action[name] = ActionTuple(
+                        np.array([], dtype=np.float32),
+                        np.array(
+                            [[1]] if self.goal[name] > 0 else [[0]], dtype=np.int32
+                        ),
-                    self.action[name] = ActionBuffers(
-                        np.array([[float(self.goal[name])]]), [[]]
+                    self.action[name] = ActionTuple(
+                        np.array([[float(self.goal[name])]], dtype=np.float32),
+                        np.array([], dtype=np.int32),
                    )
            self.step()
--- a/ml-agents/mlagents/trainers/torch/utils.py
+++ b/ml-agents/mlagents/trainers/torch/utils.py
    discrete log probs of individual actions as well as all the log probs for an entire branch.
    Utility functions provide numpy <=> tensor conversions to be used by the optimizers.
    :param continuous_tensor: Torch tensor corresponding to log probs of continuous actions
-    :param discrete_list: List of Torch tensors each corresponding to log probs of discrete actions
+    :param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were
+    sampled.
-    a discrete action branch
+    a discrete action branch, including the discrete actions that were not sampled. all_discrete_list is a list of
+    Tensors, each corresponding to the log probabilities of one discrete branch.
    """

    continuous_tensor: torch.Tensor
--- a/ml-agents/mlagents/trainers/trajectory.py
+++ b/ml-agents/mlagents/trainers/trajectory.py
            action_shape = None
            for act_type, act_array in exp.action.items():
                agent_buffer_trajectory[act_type].append(act_array)
-                action_shape = act_array.shape  # TODO Better way to make mask
            for log_type, log_array in exp.action_probs.items():
                agent_buffer_trajectory[log_type].append(log_array)

            else:
                # This should never be needed unless the environment somehow doesn't supply the
                # action mask in a discrete space.
+
+                if "discrete_action" in exp.action:
+                    action_shape = exp.action["discrete_action"].shape
+                else:
+                    action_shape = exp.action["continuous_action"].shape
                agent_buffer_trajectory["action_mask"].append(
                    np.ones(action_shape, dtype=np.float32), padding_value=1
                )