
add team reward field to agent and proto

/develop/teammanager
Ruo-Ping Dong, 4 years ago
Current commit: 90c9280e
5 changed files with 71 additions and 31 deletions
  1. com.unity.ml-agents/Runtime/Agent.cs (46 changed lines)
  2. com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentInfo.cs (38 changed lines)
  3. ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.py (11 changed lines)
  4. ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.pyi (6 changed lines)
  5. protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_info.proto (1 changed line)

com.unity.ml-agents/Runtime/Agent.cs (46 changed lines)


public float reward;
/// <summary>
/// The current team reward received by the agent.
/// </summary>
public float teamReward;
/// <summary>
/// Whether the agent is done or not.
/// </summary>
public bool done;

/// Additionally, the magnitude of the reward should not exceed 1.0
float m_Reward;
/// Represents the team reward the agent accumulated during the current step.
float m_TeamReward;
/// Keeps track of the cumulative reward in this episode.
float m_CumulativeReward;

/// OnActionReceived method.
/// </summary>
float[] m_LegacyActionCache;
ITeamManager m_TeamManager;

}
}
// Request the last decision with no callbacks
if (m_TeamManager != null)
{
// Send final observations to TeamManager if it exists.
// The TeamManager is responsible for keeping track of the Agent after it's
// done, including propagating any "posthumous" rewards.
m_TeamManager.OnAgentDone(this, doneReason, sensors);
}
else
{
SendDoneToTrainer();
}
// We request a decision so Python knows the Agent is done immediately
m_Brain?.RequestDecision(m_Info, sensors);
ResetSensors();
// We also have to write to any DemonstrationWriters so that they get the "done" flag.
foreach (var demoWriter in DemonstrationWriters)

m_RequestAction = false;
m_RequestDecision = false;
m_Info.storedActions.Clear();
}
public void SendDoneToTrainer()
{
// We request a decision so Python knows the Agent is done immediately
m_Brain?.RequestDecision(m_Info, sensors);
ResetSensors();
}
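For context, a minimal sketch of what a team manager consuming this new hook might look like. The ITeamManager interface itself is not part of this diff, so everything below beyond the OnAgentDone call shown above (the class name, the doneReason parameter type, the buffering strategy) is an assumption rather than the actual API.

using System.Collections.Generic;
using Unity.MLAgents;
using Unity.MLAgents.Sensors;

// Hypothetical sketch only; the real ITeamManager interface is not shown in this diff.
public class SimpleTeamManager
{
    // Agents whose episodes have ended but whose final data has not been flushed yet.
    readonly List<Agent> m_DoneAgents = new List<Agent>();

    // Called from Agent.NotifyAgentDone (see the branch above). The manager takes over
    // bookkeeping for the finished Agent so "posthumous" team rewards can still be
    // attributed to it before the final observations go to the trainer.
    public void OnAgentDone(Agent agent, object doneReason, List<ISensor> sensors)
    {
        m_DoneAgents.Add(agent);
        // A real implementation would buffer the final sensor data here and later
        // call something equivalent to agent.SendDoneToTrainer().
    }
}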
/// <summary>

m_CumulativeReward += increment;
}
public void AddRewardAfterDeath(float increment)
{
m_Info.reward += increment;
}
public void SetTeamReward(float reward)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(reward, nameof(reward), nameof(SetTeamReward));
#endif
m_TeamReward = reward;
}
public void AddTeamReward(float increment)
{
#if DEBUG
Utilities.DebugCheckNanAndInfinity(increment, nameof(increment), nameof(AddTeamReward));
#endif
m_TeamReward += increment;
}
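A hedged usage sketch of the two new methods from inside an Agent subclass. Only SetTeamReward and AddTeamReward come from this diff; the subclass, GoalScored, and the reward values are hypothetical illustration.

using Unity.MLAgents;

public class StrikerAgent : Agent
{
    const float k_TeamBonus = 0.5f;

    // Called by game logic (not by ML-Agents) when this agent's team scores.
    public void GoalScored()
    {
        AddReward(1.0f);             // individual credit, accumulated in m_Reward
        AddTeamReward(k_TeamBonus);  // shared credit, accumulated in m_TeamReward
    }

    // Overwrite (rather than accumulate) the team signal for the current step.
    public void ClearTeamSignal()
    {
        SetTeamReward(0f);
    }
}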
/// <summary>

com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentInfo.cs (38 changed lines)


string.Concat(
"CjNtbGFnZW50c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2FnZW50X2lu",
"Zm8ucHJvdG8SFGNvbW11bmljYXRvcl9vYmplY3RzGjRtbGFnZW50c19lbnZz",
"L2NvbW11bmljYXRvcl9vYmplY3RzL29ic2VydmF0aW9uLnByb3RvIuoBCg5B",
"L2NvbW11bmljYXRvcl9vYmplY3RzL29ic2VydmF0aW9uLnByb3RvIv8BCg5B",
"X2lkGA4gASgFSgQIARACSgQIAhADSgQIAxAESgQIBBAFSgQIBRAGSgQIBhAH",
"SgQIDBANQiWqAiJVbml0eS5NTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3Rz",
"YgZwcm90bzM="));
"X2lkGA4gASgFEhMKC3RlYW1fcmV3YXJkGA8gASgCSgQIARACSgQIAhADSgQI",
"AxAESgQIBBAFSgQIBRAGSgQIBhAHSgQIDBANQiWqAiJVbml0eS5NTEFnZW50",
"cy5Db21tdW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto), global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto.Parser, new[]{ "Reward", "Done", "MaxStepReached", "Id", "ActionMask", "Observations", "TeamManagerId" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto), global::Unity.MLAgents.CommunicatorObjects.AgentInfoProto.Parser, new[]{ "Reward", "Done", "MaxStepReached", "Id", "ActionMask", "Observations", "TeamManagerId", "TeamReward" }, null, null, null)
}));
}
#endregion

actionMask_ = other.actionMask_.Clone();
observations_ = other.observations_.Clone();
teamManagerId_ = other.teamManagerId_;
teamReward_ = other.teamReward_;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

}
}
/// <summary>Field number for the "team_reward" field.</summary>
public const int TeamRewardFieldNumber = 15;
private float teamReward_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public float TeamReward {
get { return teamReward_; }
set {
teamReward_ = value;
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as AgentInfoProto);

if(!actionMask_.Equals(other.actionMask_)) return false;
if(!observations_.Equals(other.observations_)) return false;
if (TeamManagerId != other.TeamManagerId) return false;
if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(TeamReward, other.TeamReward)) return false;
return Equals(_unknownFields, other._unknownFields);
}

hash ^= actionMask_.GetHashCode();
hash ^= observations_.GetHashCode();
if (TeamManagerId != 0) hash ^= TeamManagerId.GetHashCode();
if (TeamReward != 0F) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(TeamReward);
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

output.WriteRawTag(112);
output.WriteInt32(TeamManagerId);
}
if (TeamReward != 0F) {
output.WriteRawTag(125);
output.WriteFloat(TeamReward);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

if (TeamManagerId != 0) {
size += 1 + pb::CodedOutputStream.ComputeInt32Size(TeamManagerId);
}
if (TeamReward != 0F) {
size += 1 + 4;
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

observations_.Add(other.observations_);
if (other.TeamManagerId != 0) {
TeamManagerId = other.TeamManagerId;
}
if (other.TeamReward != 0F) {
TeamReward = other.TeamReward;
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

}
case 112: {
TeamManagerId = input.ReadInt32();
break;
}
case 125: {
TeamReward = input.ReadFloat();
break;
}
}
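To make the regenerated serialization code above concrete: team_reward is field 15 with the 32-bit (float) wire type, so its raw tag is (15 << 3) | 5 = 125, just as team_manager_id (field 14, varint) uses (14 << 3) | 0 = 112, and a non-zero value costs 1 tag byte plus 4 payload bytes. A minimal round-trip sketch, assuming the Google.Protobuf runtime and the regenerated AgentInfoProto class:

using Google.Protobuf;
using Unity.MLAgents.CommunicatorObjects;

static class TeamRewardExample
{
    static void RoundTrip()
    {
        var info = new AgentInfoProto
        {
            Reward = 1.0f,
            TeamReward = 0.25f, // new field 15, written with raw tag 125
            Done = true,
        };

        byte[] bytes = info.ToByteArray();
        var parsed = AgentInfoProto.Parser.ParseFrom(bytes);
        // parsed.TeamReward == 0.25f; an older reader without this field would keep
        // the 5 unknown bytes in _unknownFields rather than dropping them.
    }
}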

ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.py (11 changed lines)


name='mlagents_envs/communicator_objects/agent_info.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xea\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProto\x12\x17\n\x0fteam_manager_id\x18\x0e \x01(\x05J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xff\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProto\x12\x17\n\x0fteam_manager_id\x18\x0e \x01(\x05\x12\x13\n\x0bteam_reward\x18\x0f \x01(\x02J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
,
dependencies=[mlagents__envs_dot_communicator__objects_dot_observation__pb2.DESCRIPTOR,])

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='team_reward', full_name='communicator_objects.AgentInfoProto.team_reward', index=7,
number=15, type=2, cpp_type=6, label=1,
has_default_value=False, default_value=float(0),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

oneofs=[
],
serialized_start=132,
serialized_end=366,
serialized_end=387,
)
_AGENTINFOPROTO.fields_by_name['observations'].message_type = mlagents__envs_dot_communicator__objects_dot_observation__pb2._OBSERVATIONPROTO
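The byte-level changes in the Python descriptor are consistent with the single new field: the encoded team_reward FieldDescriptorProto (\x12\x13\n\x0bteam_reward\x18\x0f \x01(\x02) is 21 bytes long, so the AgentInfoProto length prefix in serialized_pb grows from \xea\x01 (varint 234) to \xff\x01 (varint 255) and serialized_end moves from 366 to 387, both by exactly 21.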

ml-agents-envs/mlagents_envs/communicator_objects/agent_info_pb2.pyi (6 changed lines)


id = ... # type: builtin___int
action_mask = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___bool]
team_manager_id = ... # type: builtin___int
team_reward = ... # type: builtin___float
@property
def observations(self) -> google___protobuf___internal___containers___RepeatedCompositeFieldContainer[mlagents_envs___communicator_objects___observation_pb2___ObservationProto]: ...

action_mask : typing___Optional[typing___Iterable[builtin___bool]] = None,
observations : typing___Optional[typing___Iterable[mlagents_envs___communicator_objects___observation_pb2___ObservationProto]] = None,
team_manager_id : typing___Optional[builtin___int] = None,
team_reward : typing___Optional[builtin___float] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> AgentInfoProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",u"done",u"id",u"max_step_reached",u"observations",u"reward",u"team_manager_id"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",u"done",u"id",u"max_step_reached",u"observations",u"reward",u"team_manager_id",u"team_reward"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",b"action_mask",u"done",b"done",u"id",b"id",u"max_step_reached",b"max_step_reached",u"observations",b"observations",u"reward",b"reward",u"team_manager_id",b"team_manager_id"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_mask",b"action_mask",u"done",b"done",u"id",b"id",u"max_step_reached",b"max_step_reached",u"observations",b"observations",u"reward",b"reward",u"team_manager_id",b"team_manager_id",u"team_reward",b"team_reward"]) -> None: ...

protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_info.proto (1 changed line)


reserved 12; // deprecated CustomObservationProto custom_observation = 12;
repeated ObservationProto observations = 13;
int32 team_manager_id = 14;
float team_reward = 15;
}
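Note that team_reward takes field number 15, the largest number that still encodes as a single-byte tag (numbers 1 through 15 fit in one byte; 16 and above need two), so the added per-agent overhead is 5 bytes whenever the team reward is non-zero and nothing at all when it is zero, since proto3 omits default-valued scalar fields.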