
MultiAgentGroup Interface (#4923)

* add SimpleMultiAgentGroup

* add group reward field to agent and proto
Branch: /develop/gail-srl-hack
Committed by GitHub 3 years ago
Current commit: ddb01eb2
20 files changed, 618 insertions(+), 28 deletions(-)
  1. com.unity.ml-agents/Runtime/Agent.cs (65 changes)
  2. com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (2 changes)
  3. com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentInfo.cs (67 changes)
  4. gym-unity/gym_unity/tests/test_gym.py (7 changes)
  5. ml-agents-envs/mlagents_envs/base_env.py (22 changes)
  6. ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.py (18 changes)
  7. ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.pyi (8 changes)
  8. ml-agents-envs/mlagents_envs/rpc_utils.py (30 changes)
  9. ml-agents-envs/mlagents_envs/tests/test_steps.py (4 changes)
  10. ml-agents/mlagents/trainers/tests/mock_brain.py (10 changes)
  11. ml-agents/mlagents/trainers/tests/simple_test_envs.py (65 changes)
  12. protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_info.proto (2 changes)
  13. com.unity.ml-agents/Runtime/IMultiAgentGroup.cs (26 changes)
  14. com.unity.ml-agents/Runtime/IMultiAgentGroup.cs.meta (11 changes)
  15. com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs (13 changes)
  16. com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta (11 changes)
  17. com.unity.ml-agents/Runtime/SimpleMultiAgentGroup.cs (143 changes)
  18. com.unity.ml-agents/Runtime/SimpleMultiAgentGroup.cs.meta (11 changes)
  19. com.unity.ml-agents/Tests/Editor/MultiAgentGroupTests.cs (120 changes)
  20. com.unity.ml-agents/Tests/Editor/MultiAgentGroupTests.cs.meta (11 changes)

com.unity.ml-agents/Runtime/Agent.cs (65 changes)


public float reward;
/// <summary>
/// The current group reward received by the agent.
/// </summary>
public float groupReward;
/// <summary>
/// Whether the agent is done or not.
/// </summary>
public bool done;

/// to separate between different agents in the environment.
/// </summary>
public int episodeId;
/// <summary>
/// MultiAgentGroup identifier.
/// </summary>
public int groupId;
public void ClearActions()
{

/// Additionally, the magnitude of the reward should not exceed 1.0
float m_Reward;
/// Represents the group reward the agent accumulated during the current step.
float m_GroupReward;
/// Keeps track of the cumulative reward in this episode.
float m_CumulativeReward;

/// </summary>
float[] m_LegacyHeuristicCache;
/// Current MultiAgentGroup ID. Defaults to 0 (meaning no group).
int m_GroupId;
/// Delegate for the agent to unregister itself from the MultiAgentGroup without a cyclic reference
/// between the agent and the group.
internal event Action<Agent> OnAgentDisabled;
/// <summary>
/// Called when the attached [GameObject] becomes enabled and active.
/// [GameObject]: https://docs.unity3d.com/Manual/GameObjects.html

new int[m_ActuatorManager.NumDiscreteActions]
);
m_Info.groupId = m_GroupId;
// The first time the Academy resets, all Agents in the scene will be
// forced to reset through the <see cref="AgentForceReset"/> event.
// To avoid the Agent resetting twice, the Agents will not begin their

NotifyAgentDone(DoneReason.Disabled);
}
m_Brain?.Dispose();
OnAgentDisabled?.Invoke(this);
m_Initialized = false;
}

}
m_Info.episodeId = m_EpisodeId;
m_Info.reward = m_Reward;
m_Info.groupReward = m_GroupReward;
m_Info.groupId = m_GroupId;
if (collectObservationsSensor != null)
{
// Make sure the latest observations are being passed to training.

}
m_Reward = 0f;
m_GroupReward = 0f;
m_CumulativeReward = 0f;
m_RequestAction = false;
m_RequestDecision = false;

m_CumulativeReward += increment;
}
internal void SetGroupReward(float reward)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(reward, nameof(reward), nameof(SetGroupReward));
#endif
m_GroupReward = reward;
}
internal void AddGroupReward(float increment)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(increment, nameof(increment), nameof(AddGroupReward));
#endif
m_GroupReward += increment;
}
/// <summary>
/// Retrieves the episode reward for the Agent.
/// </summary>

m_Info.discreteActionMasks = m_ActuatorManager.DiscreteActionMask?.GetMask();
m_Info.reward = m_Reward;
m_Info.groupReward = m_GroupReward;
m_Info.groupId = m_GroupId;
using (TimerStack.Instance.Scoped("RequestDecision"))
{

{
SendInfoToBrain();
m_Reward = 0f;
m_GroupReward = 0f;
m_RequestDecision = false;
}
}

var actions = m_Brain?.DecideAction() ?? new ActionBuffers();
m_Info.CopyActions(actions);
m_ActuatorManager.UpdateActions(actions);
}
internal void SetMultiAgentGroup(IMultiAgentGroup multiAgentGroup)
{
if (multiAgentGroup == null)
{
m_GroupId = 0;
}
else
{
var newGroupId = multiAgentGroup.GetId();
if (m_GroupId == 0 || m_GroupId == newGroupId)
{
m_GroupId = newGroupId;
}
else
{
throw new UnityAgentsException("Agent is already registered with a group. Unregister it first.");
}
}
}
}
}
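The SetMultiAgentGroup logic above enforces that an agent belongs to at most one group at a time: a non-zero m_GroupId is only replaced after the agent has been unregistered (which resets it to 0); otherwise a UnityAgentsException is thrown. A minimal sketch of what that means for callers, using the SimpleMultiAgentGroup added later in this change and mirroring the MultiAgentGroupTests below; MyAgent is a hypothetical Agent subclass, and this would need to live where SimpleMultiAgentGroup (internal in this commit) is visible:

using Unity.MLAgents;
using UnityEngine;

static class GroupRegistrationExample
{
    // Illustrative only; MyAgent stands in for any concrete Agent subclass.
    class MyAgent : Agent { }

    static void Demo()
    {
        var agent = new GameObject("Agent").AddComponent<MyAgent>();
        var groupA = new SimpleMultiAgentGroup();
        var groupB = new SimpleMultiAgentGroup();

        groupA.RegisterAgent(agent);     // agent's m_GroupId becomes groupA.GetId()
        // groupB.RegisterAgent(agent);  // would throw UnityAgentsException: agent already has a group
        groupA.UnregisterAgent(agent);   // m_GroupId resets to 0
        groupB.RegisterAgent(agent);     // allowed again after unregistering
    }
}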

com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (2 changes)


var agentInfoProto = new AgentInfoProto
{
Reward = ai.reward,
GroupReward = ai.groupReward,
GroupId = ai.groupId,
};
if (ai.discreteActionMasks != null)

com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentInfo.cs (67 changes)


string.Concat(
"CjNtbGFnZW50c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2FnZW50X2lu",
"Zm8ucHJvdG8SFGNvbW11bmljYXRvcl9vYmplY3RzGjRtbGFnZW50c19lbnZz",
"L2NvbW11bmljYXRvcl9vYmplY3RzL29ic2VydmF0aW9uLnByb3RvItEBCg5B",
"L2NvbW11bmljYXRvcl9vYmplY3RzL29ic2VydmF0aW9uLnByb3RvIvkBCg5B",
"YXRvcl9vYmplY3RzLk9ic2VydmF0aW9uUHJvdG9KBAgBEAJKBAgCEANKBAgD",
"EARKBAgEEAVKBAgFEAZKBAgGEAdKBAgMEA1CJaoCIlVuaXR5Lk1MQWdlbnRz",
"LkNvbW11bmljYXRvck9iamVjdHNiBnByb3RvMw=="));
"YXRvcl9vYmplY3RzLk9ic2VydmF0aW9uUHJvdG8SEAoIZ3JvdXBfaWQYDiAB",
"KAUSFAoMZ3JvdXBfcmV3YXJkGA8gASgCSgQIARACSgQIAhADSgQIAxAESgQI",
"BBAFSgQIBRAGSgQIBhAHSgQIDBANQiWqAiJVbml0eS5NTEFnZW50cy5Db21t",
"dW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto), global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto.Parser, new[]{ "Reward", "Done", "MaxStepReached", "Id", "ActionMask", "Observations" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto), global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto.Parser, new[]{ "Reward", "Done", "MaxStepReached", "Id", "ActionMask", "Observations", "GroupId", "GroupReward" }, null, null, null)
}));
}
#endregion

id_ = other.id_;
actionMask_ = other.actionMask_.Clone();
observations_ = other.observations_.Clone();
groupId_ = other.groupId_;
groupReward_ = other.groupReward_;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

get { return observations_; }
}
/// <summary>Field number for the "group_id" field.</summary>
public const int GroupIdFieldNumber = 14;
private int groupId_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public int GroupId {
get { return groupId_; }
set {
groupId_ = value;
}
}
/// <summary>Field number for the "group_reward" field.</summary>
public const int GroupRewardFieldNumber = 15;
private float groupReward_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public float GroupReward {
get { return groupReward_; }
set {
groupReward_ = value;
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as AgentInfoProto);

if (Id != other.Id) return false;
if(!actionMask_.Equals(other.actionMask_)) return false;
if(!observations_.Equals(other.observations_)) return false;
if (GroupId != other.GroupId) return false;
if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(GroupReward, other.GroupReward)) return false;
return Equals(_unknownFields, other._unknownFields);
}

if (Id != 0) hash ^= Id.GetHashCode();
hash ^= actionMask_.GetHashCode();
hash ^= observations_.GetHashCode();
if (GroupId != 0) hash ^= GroupId.GetHashCode();
if (GroupReward != 0F) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(GroupReward);
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

}
actionMask_.WriteTo(output, _repeated_actionMask_codec);
observations_.WriteTo(output, _repeated_observations_codec);
if (GroupId != 0) {
output.WriteRawTag(112);
output.WriteInt32(GroupId);
}
if (GroupReward != 0F) {
output.WriteRawTag(125);
output.WriteFloat(GroupReward);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

}
size += actionMask_.CalculateSize(_repeated_actionMask_codec);
size += observations_.CalculateSize(_repeated_observations_codec);
if (GroupId != 0) {
size += 1 + pb::CodedOutputStream.ComputeInt32Size(GroupId);
}
if (GroupReward != 0F) {
size += 1 + 4;
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

}
actionMask_.Add(other.actionMask_);
observations_.Add(other.observations_);
if (other.GroupId != 0) {
GroupId = other.GroupId;
}
if (other.GroupReward != 0F) {
GroupReward = other.GroupReward;
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

}
case 106: {
observations_.AddEntriesFrom(input, _repeated_observations_codec);
break;
}
case 112: {
GroupId = input.ReadInt32();
break;
}
case 125: {
GroupReward = input.ReadFloat();
break;
}
}
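The raw tags written for the new fields above (112 for group_id, 125 for group_reward) come straight from the protobuf wire format, where a tag is (field_number << 3) | wire_type. A minimal check of that arithmetic, assuming it runs inside any method body; the Debug.Assert calls are purely illustrative:

// group_id: field 14, varint (wire type 0); group_reward: field 15, fixed 32-bit (wire type 5).
System.Diagnostics.Debug.Assert(((14 << 3) | 0) == 112);  // tag emitted before group_id
System.Diagnostics.Debug.Assert(((15 << 3) | 5) == 125);  // tag emitted before group_reward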

gym-unity/gym_unity/tests/test_gym.py (7 changes)


] * number_visual_observations
rewards = np.array(num_agents * [1.0])
agents = np.array(range(0, num_agents))
return DecisionSteps(obs, rewards, agents, None), TerminalSteps.empty(specs)
group_id = np.array(num_agents * [0])
group_rewards = np.array(num_agents * [0.0])
return (
DecisionSteps(obs, rewards, agents, None, group_id, group_rewards),
TerminalSteps.empty(specs),
)
def setup_mock_unityenvironment(mock_env, mock_spec, mock_decision, mock_termination):

ml-agents-envs/mlagents_envs/base_env.py (22 changes)


reward: float
agent_id: AgentId
action_mask: Optional[List[np.ndarray]]
group_id: int
group_reward: float
class DecisionSteps(Mapping):

this simulation step.
"""
def __init__(self, obs, reward, agent_id, action_mask):
def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward):
self.group_id: np.ndarray = group_id
self.group_reward: np.ndarray = group_reward
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
@property

agent_mask = []
for mask in self.action_mask:
agent_mask.append(mask[agent_index])
group_id = self.group_id[agent_index]
group_id=group_id,
group_reward=self.group_reward[agent_index],
)
def __iter__(self) -> Iterator[Any]:

reward=np.zeros(0, dtype=np.float32),
agent_id=np.zeros(0, dtype=np.int32),
action_mask=None,
group_id=np.zeros(0, dtype=np.int32),
group_reward=np.zeros(0, dtype=np.float32),
)

reward: float
interrupted: bool
agent_id: AgentId
group_id: int
group_reward: float
class TerminalSteps(Mapping):

across simulation steps.
"""
def __init__(self, obs, reward, interrupted, agent_id):
def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward):
self.group_id: np.ndarray = group_id
self.group_reward: np.ndarray = group_reward
self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
@property

agent_obs = []
for batched_obs in self.obs:
agent_obs.append(batched_obs[agent_index])
group_id = self.group_id[agent_index]
group_id=group_id,
group_reward=self.group_reward[agent_index],
)
def __iter__(self) -> Iterator[Any]:

reward=np.zeros(0, dtype=np.float32),
interrupted=np.zeros(0, dtype=np.bool),
agent_id=np.zeros(0, dtype=np.int32),
group_id=np.zeros(0, dtype=np.int32),
group_reward=np.zeros(0, dtype=np.float32),
)

ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.py (18 changes)


name='mlagents_envs/communicator_objects/agent_info.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xd1\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProtoJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xf9\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProto\x12\x10\n\x08group_id\x18\x0e \x01(\x05\x12\x14\n\x0cgroup_reward\x18\x0f \x01(\x02J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
,
dependencies=[mlagents__envs_dot_communicator__objects_dot_observation__pb2.DESCRIPTOR,])

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='group_id', full_name='communicator_objects.AgentInfoProto.group_id', index=6,
number=14, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='group_reward', full_name='communicator_objects.AgentInfoProto.group_reward', index=7,
number=15, type=2, cpp_type=6, label=1,
has_default_value=False, default_value=float(0),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

oneofs=[
],
serialized_start=132,
serialized_end=341,
serialized_end=381,
)
_AGENTINFOPROTO.fields_by_name['observations'].message_type = mlagents__envs_dot_communicator__objects_dot_observation__pb2._OBSERVATIONPROTO

ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.pyi (8 changes)


max_step_reached = ... # type: builtin___bool
id = ... # type: builtin___int
action_mask = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___bool]
group_id = ... # type: builtin___int
group_reward = ... # type: builtin___float
@property
def observations(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents_envs___communicator_objects___observation_pb2___ObservationProto]: ...

id : typing___Optional[builtin___int] = None,
action_mask : typing___Optional[typing___Iterable[builtin___bool]] = None,
observations : typing___Optional[typing___Iterable[mlagents_envs___communicator_objects___observation_pb2___ObservationProto]] = None,
group_id : typing___Optional[builtin___int] = None,
group_reward : typing___Optional[builtin___float] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> AgentInfoProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",u"done",u"id",u"max_step_reached",u"observations",u"reward"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",u"done",u"group_id",u"group_reward",u"id",u"max_step_reached",u"observations",u"reward"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",b"action_mask",u"done",b"done",u"id",b"id",u"max_step_reached",b"max_step_reached",u"observations",b"observations",u"reward",b"reward"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",b"action_mask",u"done",b"done",u"group_id",b"group_id",u"group_reward",b"group_reward",u"id",b"id",u"max_step_reached",b"max_step_reached",u"observations",b"observations",u"reward",b"reward"]) -> None: ...

ml-agents-envs/mlagents_envs/rpc_utils.py (30 changes)


[agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32
)
decision_group_rewards = np.array(
[agent_info.group_reward for agent_info in decision_agent_info_list],
dtype=np.float32,
)
terminal_group_rewards = np.array(
[agent_info.group_reward for agent_info in terminal_agent_info_list],
dtype=np.float32,
)
_raise_on_nan_and_inf(decision_group_rewards, "group_rewards")
_raise_on_nan_and_inf(terminal_group_rewards, "group_rewards")
decision_group_id = [agent_info.group_id for agent_info in decision_agent_info_list]
terminal_group_id = [agent_info.group_id for agent_info in terminal_agent_info_list]
max_step = np.array(
[agent_info.max_step_reached for agent_info in terminal_agent_info_list],

action_mask = np.split(action_mask, indices, axis=1)
return (
DecisionSteps(
decision_obs_list, decision_rewards, decision_agent_id, action_mask
decision_obs_list,
decision_rewards,
decision_agent_id,
action_mask,
decision_group_id,
decision_group_rewards,
TerminalSteps(terminal_obs_list, terminal_rewards, max_step, terminal_agent_id),
TerminalSteps(
terminal_obs_list,
terminal_rewards,
max_step,
terminal_agent_id,
terminal_group_id,
terminal_group_rewards,
),
)

ml-agents-envs/mlagents_envs/tests/test_steps.py (4 changes)


reward=np.array(range(3), dtype=np.float32),
agent_id=np.array(range(10, 13), dtype=np.int32),
action_mask=[np.zeros((3, 4), dtype=np.bool)],
group_id=np.array(range(3), dtype=np.int32),
group_reward=np.array(range(3), dtype=np.float32),
)
assert ds.agent_id_to_index[10] == 0

reward=np.array(range(3), dtype=np.float32),
agent_id=np.array(range(10, 13), dtype=np.int32),
interrupted=np.array([1, 0, 1], dtype=np.bool),
group_id=np.array(range(3), dtype=np.int32),
group_reward=np.array(range(3), dtype=np.float32),
)
assert ts.agent_id_to_index[10] == 0

ml-agents/mlagents/trainers/tests/mock_brain.py (10 changes)


reward = np.array(num_agents * [1.0], dtype=np.float32)
interrupted = np.array(num_agents * [False], dtype=np.bool)
agent_id = np.arange(num_agents, dtype=np.int32)
group_id = np.array(num_agents * [0], dtype=np.int32)
group_reward = np.array(num_agents * [0.0], dtype=np.float32)
TerminalSteps(obs_list, reward, interrupted, agent_id),
TerminalSteps(
obs_list, reward, interrupted, agent_id, group_id, group_reward
),
DecisionSteps(obs_list, reward, agent_id, action_mask),
DecisionSteps(
obs_list, reward, agent_id, action_mask, group_id, group_reward
),
TerminalSteps.empty(behavior_spec),
)

ml-agents/mlagents/trainers/tests/simple_test_envs.py (65 changes)


self.agent_id[name] = self.agent_id[name] + 1
def _make_batched_step(
self, name: str, done: bool, reward: float
self, name: str, done: bool, reward: float, group_reward: float
m_group_id = np.array([0], dtype=np.int32)
m_group_reward = np.array([group_reward], dtype=np.float32)
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
decision_step = DecisionSteps(
m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
)
terminal_step = TerminalSteps.empty(self.behavior_spec)
if done:
self.final_rewards[name].append(self.rewards[name])

new_done,
new_agent_id,
new_action_mask,
new_group_id,
new_group_reward,
new_vector_obs, new_reward, new_agent_id, new_action_mask
new_vector_obs,
new_reward,
new_agent_id,
new_action_mask,
new_group_id,
new_group_reward,
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
m_vector_obs,
m_reward,
np.array([False], dtype=np.bool),
m_agent_id,
m_group_id,
m_group_reward,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
return new_reward, new_done, new_agent_id, new_action_mask
new_group_id = np.array([0], dtype=np.int32)
new_group_reward = np.array([0.0], dtype=np.float32)
return (
new_reward,
new_done,
new_agent_id,
new_action_mask,
new_group_id,
new_group_reward,
)
def step(self) -> None:
assert all(action is not None for action in self.action.values())

reward = self._compute_reward(name, done)
self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
self.step_result[name] = self._make_batched_step(name, done, reward, 0.0)
self.step_result[name] = self._make_batched_step(name, False, 0.0)
self.step_result[name] = self._make_batched_step(name, False, 0.0, 0.0)
@property
def reset_parameters(self) -> Dict[str, str]:

self.num_show_steps = 2
def _make_batched_step(
self, name: str, done: bool, reward: float
self, name: str, done: bool, reward: float, group_reward: float
) -> Tuple[DecisionSteps, TerminalSteps]:
recurrent_obs_val = (
self.goal[name] if self.step_count[name] <= self.num_show_steps else 0

m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
m_group_id = np.array([0], dtype=np.int32)
m_group_reward = np.array([group_reward], dtype=np.float32)
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
decision_step = DecisionSteps(
m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
)
terminal_step = TerminalSteps.empty(self.behavior_spec)
if done:
self.final_rewards[name].append(self.rewards[name])

new_done,
new_agent_id,
new_action_mask,
new_group_id,
new_group_reward,
new_vector_obs, new_reward, new_agent_id, new_action_mask
new_vector_obs,
new_reward,
new_agent_id,
new_action_mask,
new_group_id,
new_group_reward,
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
m_vector_obs,
m_reward,
np.array([False], dtype=np.bool),
m_agent_id,
m_group_id,
m_group_reward,
)
return (decision_step, terminal_step)

protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_info.proto (2 changes)


repeated bool action_mask = 11;
reserved 12; // deprecated CustomObservationProto custom_observation = 12;
repeated ObservationProto observations = 13;
int32 group_id = 14;
float group_reward = 15;
}

com.unity.ml-agents/Runtime/IMultiAgentGroup.cs (26 changes)


namespace Unity.MLAgents
{
/// <summary>
/// MultiAgentGroup interface for grouping agents to support multi-agent training.
/// </summary>
public interface IMultiAgentGroup
{
/// <summary>
/// Get the ID of MultiAgentGroup.
/// </summary>
/// <returns>
/// MultiAgentGroup ID.
/// </returns>
int GetId();
/// <summary>
/// Register agent to the MultiAgentGroup.
/// </summary>
void RegisterAgent(Agent agent);
/// <summary>
/// Unregister agent from the MultiAgentGroup.
/// </summary>
void UnregisterAgent(Agent agent);
}
}
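Code that consumes grouping can depend on this interface alone, since it only exposes GetId, RegisterAgent, and UnregisterAgent. A small sketch of such a consumer; the GroupSetup helper is illustrative and not part of this change:

using System.Collections.Generic;
using Unity.MLAgents;
using UnityEngine;

public static class GroupSetup
{
    // Registers every agent with the given group and reports the group ID.
    public static void RegisterAll(IMultiAgentGroup group, IEnumerable<Agent> agents)
    {
        foreach (var agent in agents)
        {
            group.RegisterAgent(agent);
        }
        Debug.Log($"Registered agents with group {group.GetId()}");
    }
}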

com.unity.ml-agents/Runtime/IMultiAgentGroup.cs.meta (11 changes)


fileFormatVersion: 2
guid: 3744ac27d956e43e1a39c7ba2550ab82
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs (13 changes)


using System.Threading;
namespace Unity.MLAgents
{
internal static class MultiAgentGroupIdCounter
{
static int s_Counter;
public static int GetGroupId()
{
return Interlocked.Increment(ref s_Counter);
}
}
}
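Interlocked.Increment makes the counter safe to call from multiple threads, and because its first result is 1, generated group IDs never collide with the "no group" default of 0 kept in Agent.m_GroupId. A tiny illustrative fragment, assuming it runs inside the com.unity.ml-agents assembly (the class is internal), for example from an editor test:

var first = MultiAgentGroupIdCounter.GetGroupId();   // 1 on a fresh counter
var second = MultiAgentGroupIdCounter.GetGroupId();  // 2; each call returns a new, unique, non-zero value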

com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta (11 changes)


fileFormatVersion: 2
guid: 5661ffdb6c7704e84bc785572dcd5bd1
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

com.unity.ml-agents/Runtime/SimpleMultiAgentGroup.cs (143 changes)


using System;
using System.Linq;
using System.Collections.Generic;
namespace Unity.MLAgents
{
/// <summary>
/// A basic class implementation of MultiAgentGroup.
/// </summary>
internal class SimpleMultiAgentGroup : IMultiAgentGroup, IDisposable
{
readonly int m_Id = MultiAgentGroupIdCounter.GetGroupId();
HashSet<Agent> m_Agents = new HashSet<Agent>();
public virtual void Dispose()
{
while (m_Agents.Count > 0)
{
UnregisterAgent(m_Agents.First());
}
}
/// <inheritdoc />
public virtual void RegisterAgent(Agent agent)
{
if (!m_Agents.Contains(agent))
{
agent.SetMultiAgentGroup(this);
m_Agents.Add(agent);
agent.OnAgentDisabled += UnregisterAgent;
}
}
/// <inheritdoc />
public virtual void UnregisterAgent(Agent agent)
{
if (m_Agents.Contains(agent))
{
agent.SetMultiAgentGroup(null);
m_Agents.Remove(agent);
agent.OnAgentDisabled -= UnregisterAgent;
}
}
/// <inheritdoc />
public int GetId()
{
return m_Id;
}
/// <summary>
/// Get list of all agents currently registered to this MultiAgentGroup.
/// </summary>
/// <returns>
/// List of agents registered to the MultiAgentGroup.
/// </returns>
public IReadOnlyCollection<Agent> GetRegisteredAgents()
{
return (IReadOnlyCollection<Agent>)m_Agents;
}
/// <summary>
/// Increments the group rewards for all agents in this MultiAgentGroup.
/// </summary>
/// <remarks>
/// This function increases or decreases the group rewards by a given amount for all agents
/// in the group. Use <see cref="SetGroupReward(float)"/> to set the group reward assigned
/// to the current step with a specific value rather than increasing or decreasing it.
///
/// A positive group reward indicates the whole group's accomplishments or desired behaviors.
/// Every agent in the group receives the same group reward, whether or not that agent's
/// actions directly led to it. Group rewards are meant to encourage agents to act in the
/// group's best interest rather than their own.
/// Group rewards are treated differently than individual agent rewards during training, so
/// calling AddGroupReward() is not equivalent to calling agent.AddReward() on each agent in the group.
/// </remarks>
/// <param name="reward">Incremental group reward value.</param>
public void AddGroupReward(float reward)
{
foreach (var agent in m_Agents)
{
agent.AddGroupReward(reward);
}
}
/// <summary>
/// Set the group rewards for all agents in this MultiAgentGroup.
/// </summary>
/// <remarks>
/// This function replaces any group rewards given during the current step for all agents in the group.
/// Use <see cref="AddGroupReward(float)"/> to incrementally change the group reward rather than
/// overriding it.
///
/// A positive group reward indicates the whole group's accomplishments or desired behaviors.
/// Every agent in the group receives the same group reward, whether or not that agent's
/// actions directly led to it. Group rewards are meant to encourage agents to act in the
/// group's best interest rather than their own.
/// Group rewards are treated differently than individual agent rewards during training, so
/// calling SetGroupReward() is not equivalent to calling agent.SetReward() on each agent in the group.
/// </remarks>
/// <param name="reward">The new value of the group reward.</param>
public void SetGroupReward(float reward)
{
foreach (var agent in m_Agents)
{
agent.SetGroupReward(reward);
}
}
/// <summary>
/// End episodes for all agents in this MultiAgentGroup.
/// </summary>
/// <remarks>
/// This should be used when the episode can no longer continue, such as when the group
/// reaches the goal or fails at the task.
/// </remarks>
public void EndGroupEpisode()
{
foreach (var agent in m_Agents)
{
agent.EndEpisode();
}
}
/// <summary>
/// Indicate that the episode is over but not due to the "fault" of the group.
/// This has the same end result as calling <see cref="EndGroupEpisode"/>, but has a
/// slightly different effect on training.
/// </summary>
/// <remarks>
/// This should be used when the episode could continue, but has gone on for
/// a sufficient number of steps, such as if the environment hits some maximum number of steps.
/// </remarks>
public void GroupEpisodeInterrupted()
{
foreach (var agent in m_Agents)
{
agent.EpisodeInterrupted();
}
}
}
}
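Putting the pieces together, the intended usage is one SimpleMultiAgentGroup per team: register the team's agents, reward the whole team, and end or interrupt the group episode together. A sketch under the assumption that the calling code can see the class (it is declared internal in this commit, so this would live in the same assembly or behind a later public surface); the TeamController name, fields, and event hooks are illustrative:

using System.Collections.Generic;
using Unity.MLAgents;
using UnityEngine;

public class TeamController : MonoBehaviour
{
    [SerializeField] List<Agent> m_TeamAgents;   // assigned in the Inspector
    SimpleMultiAgentGroup m_Team;

    void Start()
    {
        m_Team = new SimpleMultiAgentGroup();
        foreach (var agent in m_TeamAgents)
        {
            m_Team.RegisterAgent(agent);         // also wires up OnAgentDisabled cleanup
        }
    }

    void OnGoalScored()
    {
        m_Team.AddGroupReward(1.0f);     // every registered agent receives the same group reward
        m_Team.EndGroupEpisode();        // end the episode for the whole group
    }

    void OnTimeout()
    {
        m_Team.GroupEpisodeInterrupted();  // episode ran out of steps; not the group's "fault"
    }
}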

com.unity.ml-agents/Runtime/SimpleMultiAgentGroup.cs.meta (11 changes)


fileFormatVersion: 2
guid: 3454e3c3c70964dca93b63ee4b650095
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

com.unity.ml-agents/Tests/Editor/MultiAgentGroupTests.cs (120 changes)


using Unity.MLAgents;
using System;
using System.Reflection;
using NUnit.Framework;
using UnityEngine;
using Unity;
namespace Unity.MLAgents.Tests
{
public class MultiAgentGroupTests
{
class TestAgent : Agent
{
internal int _GroupId
{
get
{
return (int)typeof(Agent).GetField("m_GroupId", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(this);
}
}
internal float _GroupReward
{
get
{
return (float)typeof(Agent).GetField("m_GroupReward", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(this);
}
}
internal Action<Agent> _OnAgentDisabledActions
{
get
{
return (Action<Agent>)typeof(Agent).GetField("OnAgentDisabled", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(this);
}
}
}
[Test]
public void TestRegisteredAgentGroupId()
{
var agentGo = new GameObject("TestAgent");
agentGo.AddComponent<TestAgent>();
var agent = agentGo.GetComponent<TestAgent>();
// test register
SimpleMultiAgentGroup agentGroup1 = new SimpleMultiAgentGroup();
agentGroup1.RegisterAgent(agent);
Assert.AreEqual(agentGroup1.GetId(), agent._GroupId);
Assert.IsNotNull(agent._OnAgentDisabledActions);
// should not be able to register with multiple groups
SimpleMultiAgentGroup agentGroup2 = new SimpleMultiAgentGroup();
Assert.Throws<UnityAgentsException>(
() => agentGroup2.RegisterAgent(agent));
Assert.AreEqual(agentGroup1.GetId(), agent._GroupId);
// test unregister
agentGroup1.UnregisterAgent(agent);
Assert.AreEqual(0, agent._GroupId);
Assert.IsNull(agent._OnAgentDisabledActions);
// test register to another group after unregister
agentGroup2.RegisterAgent(agent);
Assert.AreEqual(agentGroup2.GetId(), agent._GroupId);
Assert.IsNotNull(agent._OnAgentDisabledActions);
}
[Test]
public void TestRegisterMultipleAgent()
{
var agentGo1 = new GameObject("TestAgent");
agentGo1.AddComponent<TestAgent>();
var agent1 = agentGo1.GetComponent<TestAgent>();
var agentGo2 = new GameObject("TestAgent");
agentGo2.AddComponent<TestAgent>();
var agent2 = agentGo2.GetComponent<TestAgent>();
SimpleMultiAgentGroup agentGroup = new SimpleMultiAgentGroup();
agentGroup.RegisterAgent(agent1); // register
Assert.AreEqual(agentGroup.GetRegisteredAgents().Count, 1);
agentGroup.UnregisterAgent(agent2); // unregister non-member agent
Assert.AreEqual(agentGroup.GetRegisteredAgents().Count, 1);
agentGroup.UnregisterAgent(agent1); // unregister
Assert.AreEqual(agentGroup.GetRegisteredAgents().Count, 0);
agentGroup.RegisterAgent(agent1);
agentGroup.RegisterAgent(agent1); // duplicated register
Assert.AreEqual(agentGroup.GetRegisteredAgents().Count, 1);
agentGroup.RegisterAgent(agent2); // register another
Assert.AreEqual(agentGroup.GetRegisteredAgents().Count, 2);
// test add/set group rewards
agentGroup.AddGroupReward(0.1f);
Assert.AreEqual(0.1f, agent1._GroupReward);
agentGroup.AddGroupReward(0.5f);
Assert.AreEqual(0.6f, agent1._GroupReward);
agentGroup.SetGroupReward(0.3f);
Assert.AreEqual(0.3f, agent1._GroupReward);
// unregistered agent should not receive group reward
agentGroup.UnregisterAgent(agent1);
agentGroup.AddGroupReward(0.2f);
Assert.AreEqual(0.3f, agent1._GroupReward);
Assert.AreEqual(0.5f, agent2._GroupReward);
// dispose group should automatically unregister all
agentGroup.Dispose();
Assert.AreEqual(0, agent1._GroupId);
Assert.AreEqual(0, agent2._GroupId);
}
[Test]
public void TestGroupIdCounter()
{
SimpleMultiAgentGroup group1 = new SimpleMultiAgentGroup();
SimpleMultiAgentGroup group2 = new SimpleMultiAgentGroup();
// id should be unique
Assert.AreNotEqual(group1.GetId(), group2.GetId());
}
}
}

com.unity.ml-agents/Tests/Editor/MultiAgentGroupTests.cs.meta (11 changes)


fileFormatVersion: 2
guid: ef0158fde748d478ca5ee3bbe22a4c9e
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant: