
Make the Agent reset immediately after Done (#3291)

* Made the Agent reset immediately

* Fixing the C# tests

* Fixing the tests still

* Trying with incremental episode ids

* Deleting the buffer rather than using an empty list

* Addressing the comments

* Forgot to edit the comment on AgentInfo

* Updating the migrating doc

* Fixed an obvious bug

* Cleaning up after an agent is done in the agent processor

* Fixing the pytest errors
Branch: asymm-envs
Committed by GitHub 5 years ago
Current commit: 590559e7
12 files changed, 97 insertions(+), 117 deletions(-)
  1. UnitySDK/Assets/ML-Agents/Editor/Tests/DemonstrationTests.cs (2 changes)
  2. UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs (47 changes)
  3. UnitySDK/Assets/ML-Agents/Scripts/Academy.cs (10 changes)
  4. UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (99 changes)
  5. UnitySDK/Assets/ML-Agents/Scripts/Grpc/GrpcExtensions.cs (2 changes)
  6. UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs (2 changes)
  7. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs (8 changes)
  8. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelRunner.cs (2 changes)
  9. docs/Migrating.md (1 change)
  10. ml-agents/mlagents/trainers/agent_processor.py (19 changes)
  11. UnitySDK/Assets/ML-Agents/Scripts/EpisodeIdCounter.cs (11 changes, new file)
  12. UnitySDK/Assets/ML-Agents/Scripts/EpisodeIdCounter.cs.meta (11 changes, new file)
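Read together, the changes replace the old deferred reset (set a done flag, then wait for the next EnvironmentStep to notice it) with a reset performed inside Done() itself, plus a globally unique, monotonically increasing episode id so that state keyed by id can never collide across resets. A minimal sketch of what the new control flow amounts to, with simplified signatures and an assumed no-op callback for the terminal decision (the real code is in the Agent.cs hunks below):

    // Sketch only, not the verbatim implementation.
    public void Done()
    {
        NotifyAgentDone();   // stamp and send the terminal AgentInfo
        _AgentReset();       // reset right away instead of waiting for the next EnvironmentStep
    }

    void NotifyAgentDone(bool maxStepReached = false)
    {
        m_Info.done = true;
        m_Info.maxStepReached = maxStepReached;
        m_Info.reward = m_Reward;
        // We request a decision so Python knows the Agent is done immediately.
        m_Brain?.RequestDecision(m_Info, sensors, agentAction => { });
        // The Agent is done, so we give it a new episode Id; state keyed by the
        // old id (memories, action callbacks) is now permanently orphaned.
        m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
        m_Reward = 0f;
        m_CumulativeReward = 0f;
        m_RequestAction = false;
        m_RequestDecision = false;
    }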

UnitySDK/Assets/ML-Agents/Editor/Tests/DemonstrationTests.cs (2 changes)


     reward = 1f,
     actionMasks = new[] { false, true },
     done = true,
-    id = 5,
+    episodeId = 5,
     maxStepReached = true,
     storedVectorActions = new[] { 0f, 1f },
 };

UnitySDK/Assets/ML-Agents/Editor/Tests/MLAgentsEditModeTest.cs (47 changes)


        }
    }
    public bool IsDone()
    {
        return (bool)typeof(Agent).GetField("m_Done", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(this);
    }
    public int initializeAgentCalls;
    public int collectObservationsCalls;
    public int agentActionCalls;

        agentGo2.AddComponent<TestAgent>();
        var agent2 = agentGo2.GetComponent<TestAgent>();
        Assert.AreEqual(false, agent1.IsDone());
        Assert.AreEqual(false, agent2.IsDone());
        Assert.AreEqual(0, agent1.agentResetCalls);
        Assert.AreEqual(0, agent2.agentResetCalls);
        Assert.AreEqual(0, agent1.initializeAgentCalls);

        agentEnableMethod?.Invoke(agent2, new object[] { });
        agentEnableMethod?.Invoke(agent1, new object[] { });
        Assert.AreEqual(false, agent1.IsDone());
        Assert.AreEqual(false, agent2.IsDone());
        // agent1 was not enabled when the academy started
        // The agents have been initialized
        Assert.AreEqual(0, agent1.agentResetCalls);

            if (i % 11 == 5)
            {
                agent1.Done();
                numberAgent1Reset += 1;
                if (!(agent2.IsDone()))
                {
                    // If the agent was already reset before the request decision
                    // We should not reset again
                    agent2.Done();
                    numberAgent2Reset += 1;
                    agent2StepSinceReset = 0;
                }
                agent2.Done();
                numberAgent2Reset += 1;
                agent2StepSinceReset = 0;
            }
            // Request a decision for agent 2 regularly
            if (i % 3 == 2)

            {
                // Request an action without decision regularly
                agent2.RequestAction();
            }
            if (agent1.IsDone())
            {
                numberAgent1Reset += 1;
                // Agent 1 is only initialized at step 2
                if (i < 2)
                { }
                aca.EnvironmentStep();
            }
        }

        var j = 0;
        for (var i = 0; i < 500; i++)
        {
            if (i % 20 == 0)
            {
                j = 0;
            }
            else
            {
                j++;
            }
            Assert.LessOrEqual(Mathf.Abs(j * 0.1f + j * 10f - agent1.GetCumulativeReward()), 0.05f);
            Assert.LessOrEqual(Mathf.Abs(j * 10.1f - agent1.GetCumulativeReward()), 0.05f);
            agent1.AddReward(10f);
            agent1.AddReward(10f);
            if ((i % 21 == 0) && (i > 0))
            {
                j = 0;
            }
            j++;
        }
    }
}

UnitySDK/Assets/ML-Agents/Scripts/Academy.cs (10 changes)


     // in addition to aligning on the step count of the global episode.
     public event System.Action<int> AgentSetStatus;
-    // Signals to all the agents at each environment step so they can reset
-    // if their flag has been set to done (assuming the agent has requested a
-    // decision).
-    public event System.Action AgentResetIfDone;
     // Signals to all the agents at each environment step so they can send
     // their state to their Policy if they have requested a decision.
     public event System.Action AgentSendState;

         DecideAction = () => { };
         DestroyAction = () => { };
         AgentSetStatus = i => { };
-        AgentResetIfDone = () => { };
         AgentSendState = () => { };
         AgentAct = () => { };
         AgentForceReset = () => { };

         AgentSetStatus?.Invoke(m_StepCount);
-        using (TimerStack.Instance.Scoped("AgentResetIfDone"))
-        {
-            AgentResetIfDone?.Invoke();
-        }
         using (TimerStack.Instance.Scoped("AgentSendState"))
         {

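With the AgentResetIfDone event deleted from the declaration, the no-op initialization, and the step loop, the per-step reset phase disappears entirely; resetting is now the Agent's own job. A hedged sketch of the remaining sequence inside EnvironmentStep, assuming the surviving phases keep the same TimerStack wrapping shown above:

    AgentSetStatus?.Invoke(m_StepCount);
    using (TimerStack.Instance.Scoped("AgentSendState"))
    {
        AgentSendState?.Invoke();
    }
    using (TimerStack.Instance.Scoped("DecideAction"))
    {
        DecideAction?.Invoke();
    }
    using (TimerStack.Instance.Scoped("AgentAct"))
    {
        AgentAct?.Invoke();
    }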
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (99 changes)


     public bool maxStepReached;
     /// <summary>
-    /// Unique identifier each agent receives at initialization. It is used
+    /// Episode identifier each agent receives at every reset. It is used
-    public int id;
+    public int episodeId;
 }
/// <summary>

     /// Whether or not the agent requests a decision.
     bool m_RequestDecision;
-    /// Whether or not the agent has completed the episode. This may be due
-    /// to either reaching a success or fail state, or reaching the maximum
-    /// number of steps (i.e. timing out).
-    bool m_Done;
-    /// Whether or not the agent reached the maximum number of steps.
-    bool m_MaxStepReached;
     /// Keeps track of the number of steps taken by the agent in this episode.
     /// Note that this value is different for each agent, and may not overlap

-    /// Unique identifier each agent receives at initialization. It is used
+    /// Episode identifier each agent receives. It is used
-    int m_Id;
+    /// This Id will be changed every time the Agent resets.
+    int m_EpisodeId;
     /// Keeps track of the actions that are masked at each step.
     ActionMasker m_ActionMasker;

     /// becomes enabled or active.
     void OnEnable()
     {
-        m_Id = gameObject.GetInstanceID();
+        m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
         OnEnableHelper();
         m_Recorder = GetComponent<DemonstrationRecorder>();

         m_Action = new AgentAction();
         sensors = new List<ISensor>();
-        Academy.Instance.AgentResetIfDone += ResetIfDone;
         Academy.Instance.AgentSendState += SendInfo;
         Academy.Instance.DecideAction += DecideAction;
         Academy.Instance.AgentAct += AgentStep;

         // We don't want to even try, because this will lazily create a new Academy!
         if (Academy.IsInitialized)
         {
-            Academy.Instance.AgentResetIfDone -= ResetIfDone;
             Academy.Instance.AgentSendState -= SendInfo;
             Academy.Instance.DecideAction -= DecideAction;
             Academy.Instance.AgentAct -= AgentStep;

             m_Brain?.Dispose();
         }
-    void NotifyAgentDone()
+    void NotifyAgentDone(bool maxStepReached = false)
         m_Info.reward = m_Reward;
+        m_Info.maxStepReached = maxStepReached;
-        // We request a decision so Python knows the Agent is disabled
+        // We request a decision so Python knows the Agent is done immediately
+        // The Agent is done, so we give it a new episode Id
+        m_EpisodeId = EpisodeIdCounter.GetEpisodeId();
+        m_Reward = 0f;
+        m_CumulativeReward = 0f;
+        m_RequestAction = false;
+        m_RequestDecision = false;
     }
     /// <summary>

     /// </summary>
     public void Done()
     {
-        m_Done = true;
+        NotifyAgentDone();
+        _AgentReset();
     }
     /// <summary>

         m_RequestAction = true;
     }
-    /// <summary>
-    /// Indicates if the agent has reached his maximum number of steps.
-    /// </summary>
-    /// <returns>
-    /// <c>true</c>, if max step reached was reached, <c>false</c> otherwise.
-    /// </returns>
-    public bool IsMaxStepReached()
-    {
-        return m_MaxStepReached;
-    }
-    /// <summary>
-    /// Indicates if the agent is done
-    /// </summary>
-    /// <returns>
-    /// <c>true</c>, if the agent is done, <c>false</c> otherwise.
-    /// </returns>
-    public bool IsDone()
-    {
-        return m_Done;
-    }
     /// Helper function that resets all the data structures associated with
     /// the agent. Typically used when the agent is being initialized or reset
     /// at the end of an episode.

         m_Info.actionMasks = m_ActionMasker.GetMask();
         m_Info.reward = m_Reward;
-        m_Info.done = m_Done;
-        m_Info.maxStepReached = m_MaxStepReached;
-        m_Info.id = m_Id;
+        m_Info.done = false;
+        m_Info.maxStepReached = false;
+        m_Info.episodeId = m_EpisodeId;
         m_Brain.RequestDecision(m_Info, sensors, UpdateAgentAction);

     }
-    /// Signals the agent that it must reset if its done flag is set to true.
-    void ResetIfDone()
-    {
-        if (m_Done)
-        {
-            _AgentReset();
-        }
-    }
     /// <summary>
     /// Signals the agent that it must sent its decision to the brain.
     /// </summary>

-        if (m_RequestDecision || m_Done)
+        if (m_RequestDecision)
-            if (m_Done)
-            {
-                m_CumulativeReward = 0f;
-            }
-            m_Done = false;
-            m_MaxStepReached = false;
             m_RequestDecision = false;
         }
     }

     {
+        if ((m_StepCount >= maxStep - 1) && (maxStep > 0))
+        {
+            NotifyAgentDone(true);
+            _AgentReset();
+        }
+        else
+        {
+            m_StepCount += 1;
+        }
-        if ((m_StepCount >= maxStep) && (maxStep > 0))
-        {
-            m_MaxStepReached = true;
-            Done();
-        }
-        m_StepCount += 1;
     }
     void DecideAction()

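The caller-visible consequence: Done() is now synchronous, so by the time it returns, AgentReset() has already run. A hypothetical subclass (not part of the commit, and assuming the AgentAction(float[]) override signature of this API generation) illustrating the new semantics:

    public class CountingAgent : Agent   // hypothetical example
    {
        int m_HitCount;

        public override void AgentAction(float[] vectorAction)
        {
            m_HitCount += 1;
            if (m_HitCount >= 10)
            {
                Done();
                // Before this commit, m_HitCount kept its value until the next
                // EnvironmentStep; now AgentReset() has already zeroed it here.
            }
        }

        public override void AgentReset()
        {
            m_HitCount = 0;
        }
    }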
UnitySDK/Assets/ML-Agents/Scripts/Grpc/GrpcExtensions.cs (2 changes)


         Reward = ai.reward,
         MaxStepReached = ai.maxStepReached,
         Done = ai.done,
-        Id = ai.id,
+        Id = ai.episodeId,
     };
     if (ai.actionMasks != null)

UnitySDK/Assets/ML-Agents/Scripts/Grpc/RpcCommunicator.cs (2 changes)


         {
             m_ActionCallbacks[brainKey] = new List<IdCallbackPair>();
         }
-        m_ActionCallbacks[brainKey].Add(new IdCallbackPair { AgentId = info.id, Callback = action });
+        m_ActionCallbacks[brainKey].Add(new IdCallbackPair { AgentId = info.episodeId, Callback = action });
     }
     /// <summary>

UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs (8 changes)


             if (info.done)
             {
-                m_Memories.Remove(info.id);
+                m_Memories.Remove(info.episodeId);
-            if (!m_Memories.TryGetValue(info.id, out memory))
+            if (!m_Memories.TryGetValue(info.episodeId, out memory))
             {
                 for (var j = 0; j < memorySize; j++)
                 {

             List<float> memory;
             if (info.done)
             {
-                m_Memories.Remove(info.id);
+                m_Memories.Remove(info.episodeId);
-            if (!m_Memories.TryGetValue(info.id, out memory))
+            if (!m_Memories.TryGetValue(info.episodeId, out memory))
             {
                 for (var j = 0; j < memorySize; j++)

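Keying m_Memories by episodeId rather than by the GameObject instance id means recurrent state cannot leak between episodes: a reset agent comes back under a brand-new key, so the TryGetValue miss path re-zeroes its memory even if a Remove were ever skipped. A small self-contained illustration of that pattern (hypothetical values, not the library code):

    using System.Collections.Generic;
    using MLAgents;   // for EpisodeIdCounter, added by this commit

    var memories = new Dictionary<int, List<float>>();

    var episodeId = EpisodeIdCounter.GetEpisodeId();      // e.g. 7
    memories[episodeId] = new List<float> { 0.1f, 0.2f }; // recurrent state for this episode

    // On Done(): the terminal step removes the entry, and the agent gets a fresh id.
    memories.Remove(episodeId);
    episodeId = EpisodeIdCounter.GetEpisodeId();          // e.g. 8; ids are never reused

    List<float> memory;
    if (!memories.TryGetValue(episodeId, out memory))
    {
        // Miss: GeneratorImpl fills the corresponding tensor slice with zeros here.
    }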
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelRunner.cs (2 changes)


             sensors = sensors
         });
-        m_ActionFuncs.Add(new AgentIdActionPair { action = action, agentId = info.id });
+        m_ActionFuncs.Add(new AgentIdActionPair { action = action, agentId = info.episodeId });
     }
     public void DecideBatch()
     {

docs/Migrating.md (1 change)


 * The `AgentAction` struct no longer contains a `value` field. (Value estimates were not set during inference)
 * The `GetValueEstimate()` method on the Agent has been removed.
 * The `UpdateValueAction()` method on the Agent has been removed.
+* Calling `Done()` on the Agent will now reset it immediately and call the `AgentReset` virtual method. (This is to simplify the previous logic in which the Agent had to wait for the next `EnvironmentStep` to reset)

 ### Steps to Migrate

 * If you were not using `On Demand Decision` for your Agent, you **must** add a `DecisionRequester` component to your Agent GameObject and set its `Decision Period` field to the old `Decision Period` of the Agent.
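For scenes wired up from code rather than in the Inspector, the same migration step can be scripted at setup time. A sketch, assuming DecisionRequester exposes its period as a public DecisionPeriod field and that agentGameObject is your Agent's GameObject:

    // Hypothetical setup snippet: attach the new component next to the Agent.
    var requester = agentGameObject.AddComponent<DecisionRequester>();
    requester.DecisionPeriod = 5;   // whatever Decision Period the Agent used before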

ml-agents/mlagents/trainers/agent_processor.py (19 changes)


"Policy/Learning Rate", take_action_outputs["learning_rate"]
)
terminated_agents: List[str] = []
# Make unique agent_ids that are global across workers
action_global_agent_ids = [
get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids

"Environment/Episode Length",
self.episode_steps.get(global_id, 0),
)
del self.episode_steps[global_id]
del self.episode_rewards[global_id]
terminated_agents += [global_id]
elif not curr_agent_step.done:
self.episode_steps[global_id] += 1

             self.policy.save_previous_action(
                 previous_action.agent_ids, take_action_outputs["action"]
             )
+        for terminated_id in terminated_agents:
+            self._clean_agent_data(terminated_id)
+
+    def _clean_agent_data(self, global_id: str) -> None:
+        """
+        Removes the data for an Agent.
+        """
+        del self.experience_buffers[global_id]
+        del self.last_take_action_outputs[global_id]
+        del self.episode_steps[global_id]
+        del self.episode_rewards[global_id]
+        del self.last_step_result[global_id]
+        self.policy.remove_previous_action([global_id])
+        self.policy.remove_memories([global_id])
+
     def publish_trajectory_queue(
         self, trajectory_queue: "AgentManagerQueue[Trajectory]"

UnitySDK/Assets/ML-Agents/Scripts/EpisodeIdCounter.cs (11 changes, new file)


+namespace MLAgents
+{
+    public static class EpisodeIdCounter
+    {
+        private static int Counter;
+        public static int GetEpisodeId()
+        {
+            return Counter++;
+        }
+    }
+}

UnitySDK/Assets/ML-Agents/Scripts/EpisodeIdCounter.cs.meta (11 changes, new file)


+fileFormatVersion: 2
+guid: 847786b7bcf9d4817b3f3879d57517c7
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData:
+  assetBundleName:
+  assetBundleVariant: