        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        critic_obs: Optional[List[List[torch.Tensor]]] = None,
        diff: bool = False,
    ) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
        # critic_obs and diff are accepted for interface compatibility with the
        # ActorCritic variants below; they are unused here.
        encoding, memories = self.network_body(
            net_inputs, memories=memories, sequence_length=sequence_length
        )
        action, log_probs, entropies = self.action_model(encoding, masks)
        return action, log_probs, entropies, memories


class SharedActorCritic(SimpleActor, ActorCritic):
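    """
    Actor-critic with a shared network body: the encoding used by the policy's
    action model is also fed to a set of value heads, one per reward stream.
    """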
    def __init__(
        self,
        observation_shapes: List[Tuple[int, ...]],
        network_settings: NetworkSettings,
        action_spec: ActionSpec,
        stream_names: List[str],
        conditional_sigma: bool = False,
        tanh_squash: bool = False,
    ):
        self.use_lstm = network_settings.memory is not None
        super().__init__(
            observation_shapes,
            network_settings,
            action_spec,
            conditional_sigma,
            tanh_squash,
        )
        self.stream_names = stream_names
        self.value_heads = ValueHeads(stream_names, self.encoding_size)

    def critic_pass(
        self,
        net_inputs: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
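        """
        Get value estimates from the shared encoding.

        :return: A dict mapping each reward stream to its value estimate, plus
            the updated memories.
        """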
        encoding, memories_out = self.network_body(
            net_inputs, memories=memories, sequence_length=sequence_length
        )
        return self.value_heads(encoding), memories_out

    def get_stats_and_value(
        self,
        net_inputs: List[torch.Tensor],
        actions: AgentAction,
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        critic_obs: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
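        """
        Evaluate the given actions under the current policy.

        :return: Log-probabilities and entropies of the actions, plus value
            estimates. critic_obs is accepted for interface compatibility with
            SeparateActorCritic and is unused here.
        """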
        encoding, memories = self.network_body(
            net_inputs, memories=memories, sequence_length=sequence_length
        )
        log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
        value_outputs = self.value_heads(encoding)
        return log_probs, entropies, value_outputs

    def get_action_stats_and_value(
        self,
        net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[
        AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
    ]:
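        """
        Sample an action from the current policy.

        :return: The sampled action, its log-probabilities and entropies, value
            estimates for each reward stream, and the updated memories.
        """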
        encoding, memories = self.network_body(
            net_inputs, memories=memories, sequence_length=sequence_length
        )
        action, log_probs, entropies = self.action_model(encoding, masks)
        value_outputs = self.value_heads(encoding)
        return action, log_probs, entropies, value_outputs, memories


class SeparateActorCritic(SimpleActor, ActorCritic):
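    """
    Actor-critic with separate bodies: the actor uses the SimpleActor network
    body while the critic is an independent ValueNetwork. With LSTM enabled,
    each memory vector is split in half: the front half for the actor and the
    back half for the critic.
    """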
    def __init__(
        self,
        observation_shapes: List[Tuple[int, ...]],
        network_settings: NetworkSettings,
        action_spec: ActionSpec,
        stream_names: List[str],
        conditional_sigma: bool = False,
        tanh_squash: bool = False,
    ):
        self.use_lstm = network_settings.memory is not None
        super().__init__(
            observation_shapes,
            network_settings,
            action_spec,
            conditional_sigma,
            tanh_squash,
        )
        self.stream_names = stream_names
        self.critic = ValueNetwork(stream_names, observation_shapes, network_settings)
        # Alternative centralized critic over both agents' observations:
        # self.critic = CentralizedValueNetwork(
        #     stream_names, observation_shapes, network_settings, num_agents=2
        # )

    @property
    def memory_size(self) -> int:
        return self.network_body.memory_size + self.critic.memory_size

    def critic_pass(
        self,
        net_inputs: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        critic_obs: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        actor_mem, critic_mem = None, None
        if self.use_lstm:
            # Use only the back half of memories for critic
            actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
        # Centralized critic input (disabled): extend with other agents' obs.
        # all_net_inputs = [net_inputs]
        # if critic_obs is not None:
        #     all_net_inputs.extend(critic_obs)
        all_net_inputs = net_inputs
        value_outputs, critic_mem_out = self.critic(
            all_net_inputs, memories=critic_mem, sequence_length=sequence_length
        )
        if actor_mem is not None:
            # Leave the actor's half of the memories unchanged
            memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
        else:
            memories_out = None
        return value_outputs, memories_out

    def get_stats_and_value(
        self,
        net_inputs: List[torch.Tensor],
        actions: AgentAction,
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        critic_obs: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
        actor_mem, critic_mem = None, None
        if self.use_lstm:
            # Use only the back half of memories for critic and actor
            actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
        encoding, actor_mem_outs = self.network_body(
            net_inputs, memories=actor_mem, sequence_length=sequence_length
        )
        log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
        # Centralized critic input (disabled): extend with other agents' obs.
        # all_net_inputs = [net_inputs]
        # if critic_obs is not None:
        #     all_net_inputs.extend(critic_obs)
        all_net_inputs = net_inputs
        value_outputs, critic_mem_outs = self.critic(
            all_net_inputs, memories=critic_mem, sequence_length=sequence_length
        )
        return log_probs, entropies, value_outputs

    def get_action_stats_and_value(
        self,
        net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
        critic_obs: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[
        AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
    ]:
        if self.use_lstm:
            # Use only the back half of memories for critic and actor
            actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
        else:
            critic_mem = None
            actor_mem = None
        # Centralized critic input (disabled): extend with other agents' obs.
        # all_net_inputs = [net_inputs]
        # if critic_obs is not None:
        #     all_net_inputs.extend(critic_obs)
        all_net_inputs = net_inputs
        encoding, actor_mem_outs = self.network_body(
            net_inputs, memories=actor_mem, sequence_length=sequence_length
        )
        action, log_probs, entropies = self.action_model(encoding, masks)
        value_outputs, critic_mem_outs = self.critic(
            all_net_inputs, memories=critic_mem, sequence_length=sequence_length
        )
        if self.use_lstm:
            mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)
        else:
            mem_out = None
        return action, log_probs, entropies, value_outputs, mem_out

    def get_comms(
        self,
        net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> torch.Tensor:
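        """
        Get the action model's communication output for these observations.
        Note that the updated memories from the network body are discarded.
        """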
        encoding, memories = self.network_body(
            net_inputs, memories=memories, sequence_length=sequence_length
        )
        comms = self.action_model.get_comms(encoding, masks)
        return comms

    def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
        super().update_normalization(net_inputs)
        self.critic.network_body.update_normalization(net_inputs)
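
# Minimal usage sketch (illustrative only; `policy` and `obs` are assumed to
# be an LSTM-enabled SeparateActorCritic and a list of observation tensors,
# and the zero-initialised memory shape follows the trainer's convention):
#
#   memories = torch.zeros(1, 1, policy.memory_size)
#   action, log_probs, entropies, values, memories = (
#       policy.get_action_stats_and_value(obs, memories=memories)
#   )
#
# The returned memories concatenate the actor's updated front half with the
# critic's updated back half, ready to be fed back in on the next step.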