
Fix cumulative reward (Unity) and Nan reward (python) bugs

Tag: 0.2.0
Arthur Juliani, 7 years ago
Commit b56259f6
8 files changed, 38 insertions and 31 deletions
  1. python/ppo.py (1 line changed)
  2. python/ppo/trainer.py (20 lines changed)
  3. python/unityagents/environment.py (2 lines changed)
  4. unity-environment/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab (27 lines changed)
  5. unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3 lines changed)
  6. unity-environment/Assets/ML-Agents/Scripts/Agent.cs (13 lines changed)
  7. unity-environment/Assets/ML-Agents/Scripts/Brain.cs (1 line changed)
  8. unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs (2 lines changed)

python/ppo.py (1 line changed)


while steps <= max_steps or not train_model:
    if env.global_done:
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer.reset_buffers(info, total=True)
    # Decide and take an action
    new_info = trainer.take_action(info, env, brain_name, steps)
    info = new_info

python/ppo/trainer.py (20 lines changed)


                 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
        self.stats = stats
        self.is_training = training
        self.training_buffer = vectorize_history(empty_local_history({}))
        self.history_dict = empty_all_history(info)
        self.reset_buffers(info, total=True)
        self.is_continuous = is_continuous
        self.use_observations = use_observations

            history['cumulative_reward'] = 0
            history['episode_steps'] = 0

    def reset_buffers(self, brain_info=None, total=False):
        self.training_buffer = vectorize_history(empty_local_history({}))
        if not total:
            for key in self.history_dict:
                self.history_dict[key] = empty_local_history(self.history_dict[key])
        else:
            self.history_dict = empty_all_history(agent_info=brain_info)
    def update_model(self, batch_size, num_epoch):
        """
        Uses training_buffer to update model.

                total_p += p_loss
        self.stats['value_loss'].append(total_v)
        self.stats['policy_loss'].append(total_p)
        self.training_buffer = vectorize_history(empty_local_history({}))
        for key in self.history_dict:
            self.history_dict[key] = empty_local_history(self.history_dict[key])
        self.reset_buffers()
    def write_summary(self, summary_writer, steps, lesson_number):
        """

        """
        print("Mean Reward: {0}".format(np.mean(self.stats['cumulative_reward'])))
        if len(self.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(self.stats['cumulative_reward'])
            print("Mean Reward: {0}".format(mean_reward))
        summary = tf.Summary()
        for key in self.stats:
            if len(self.stats[key]) > 0:
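Why this guard matters: np.mean of an empty sequence returns nan (NumPy also emits a "Mean of empty slice" RuntimeWarning), so reporting the mean of stats['cumulative_reward'] before any episode has completed yields NaN, which is the Python-side NaN reward this commit addresses. A minimal standalone sketch of the pattern (variable names are illustrative, not part of the trainer):

import numpy as np

rewards = []                              # no completed episodes recorded yet
print(np.mean(rewards))                   # nan, with a "Mean of empty slice" RuntimeWarning

if len(rewards) > 0:                      # the guard added in write_summary above
    print("Mean Reward: {0}".format(np.mean(rewards)))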

python/unityagents/environment.py (2 lines changed)


logger.info("\nLesson changed. Now in Lesson {0} : \t{1}"
.format(self._curriculum.get_lesson_number(),
', '.join([str(x)+' -> '+str(config[x]) for x in config])))
else:
elif config != {}:
logger.info("\nAcademy Reset. In Lesson {0} : \t{1}"
.format(self._curriculum.get_lesson_number(),
', '.join([str(x)+' -> '+str(config[x]) for x in config])))
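For context, the bare else: logged the "Academy Reset" line even when the curriculum supplied an empty config; with elif config != {}: the message is only emitted when there is a non-empty config to report. A small self-contained sketch of that behaviour, using the standard logging module with illustrative names (not the unityagents API):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

def log_reset(lesson_changed, lesson_number, config):
    detail = ', '.join(str(k) + ' -> ' + str(v) for k, v in config.items())
    if lesson_changed:
        logger.info("Lesson changed. Now in Lesson {0} : {1}".format(lesson_number, detail))
    elif config != {}:  # previously a bare else:, which also fired for an empty config
        logger.info("Academy Reset. In Lesson {0} : {1}".format(lesson_number, detail))

log_reset(False, 0, {})                   # silent after the change
log_reset(False, 1, {'goal_size': 5.0})   # logs the Academy Reset line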

unity-environment/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab (27 lines changed)


- component: {fileID: 4836354168995630}
- component: {fileID: 33827327769986516}
- component: {fileID: 23913365403597130}
- component: {fileID: 114223302162522712}
m_Layer: 0
m_Name: GoalOn
m_TagString: Untagged

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Interpolate: 0
m_Constraints: 0
m_CollisionDetection: 0
--- !u!114 &114223302162522712
MonoBehaviour:
m_ObjectHideFlags: 1
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 100100000}
m_GameObject: {fileID: 1065277484498824}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: ee7cb421851cf4e088ddbd1254584c68, type: 3}
m_Name:
m_EditorClassIdentifier:
agent: {fileID: 1395682910799436}
hand: {fileID: 1654288206095398}
goalOn: {fileID: 0}
--- !u!114 &114928491800121992
MonoBehaviour:
m_ObjectHideFlags: 1

unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3 lines changed)


        torque_z = Mathf.Clamp(act[3], -1, 1) * 100f;
        rbB.AddTorque(new Vector3(torque_x, 0f, torque_z));
        //reward -= 0.001f;
        //reward += (Mathf.Max(-4f, pendulumB.transform.position.y) + 4f) / 200f;
    }

    void UpdateGoalPosition() {

        goal.transform.position = new Vector3(goalY, -1f, goalX) + transform.position;
    }

unity-environment/Assets/ML-Agents/Scripts/Agent.cs (13 lines changed)


     * If AgentMonitor is attached to the Agent, this value will be displayed.*/
    [HideInInspector]
    public float CummulativeReward;
    /**< \brief Do not modify: This keeps track of the cummulative reward.*/
    public float CumulativeReward;
    /**< \brief Do not modify: This keeps track of the cumulative reward.*/
    [HideInInspector]
    public int stepCounter;

    public void Reset()
    {
        memory = new float[brain.brainParameters.memorySize];
        CummulativeReward = 0f;
        CumulativeReward = 0f;
        stepCounter = 0;
        AgentReset();
    }

    {
        return reward;
    }

    public void SetCumulativeReward()
    {
        CumulativeReward += reward;
        //Debug.Log(reward);
    }

    /// Do not modify : Is used by the brain to collect done.

    {
        AgentStep(agentStoredAction);
        stepCounter += 1;
        CummulativeReward += reward;
        if ((stepCounter > maxStep) && (maxStep > 0))
        {
            done = true;

unity-environment/Assets/ML-Agents/Scripts/Brain.cs (1 line changed)


Dictionary<int, List<float>> result = new Dictionary<int, List<float>>();
foreach (KeyValuePair<int, Agent> idAgent in agents)
{
idAgent.Value.SetCumulativeReward();
List<float> states = idAgent.Value.CollectState();
if ((states.Count != brainParameters.stateSize) && (brainParameters.stateSpaceType == StateType.continuous))
{

unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs (2 lines changed)


    {
        if (brain.gameObject.transform.parent.gameObject.GetComponent<Academy>().communicator == null)
        {
            coord = null;
            coord = null;
        }
        else if (brain.gameObject.transform.parent.gameObject.GetComponent<Academy>().communicator is ExternalCommunicator)
        {
