
[bug-fix] When agent isn't training, don't clear update buffer (#5205)

* Don't clear update buffer, but don't append to it either

* Update changelog

* Address comments

* Make experience replay buffer saving more verbose
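In short: rather than clearing the whole update buffer once the trainer stops training, the trainers now skip the append itself. A minimal, hedged sketch of that gate in plain Python (the names mirror the `_append_to_update_buffer` helper added in rl_trainer.py below; the list type is only a stand-in for the real AgentBuffer):

def append_to_update_buffer(should_still_train: bool, update_buffer: list, trajectory: list) -> None:
    # Gate introduced by this change: only grow the buffer while still training.
    if should_still_train:
        update_buffer.extend(trajectory)
    # Once training has stopped, the new trajectory is dropped, but the experience
    # already collected in update_buffer is left intact instead of being cleared,
    # so SAC can still save its replay buffer at the end of the run.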
/check-for-ModelOverriders
GitHub committed 3 years ago
Commit ff21216d
6 files changed: 59 insertions, 20 deletions

  1. com.unity.ml-agents/CHANGELOG.md (2 changes)
  2. ml-agents/mlagents/trainers/poca/trainer.py (5 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (6 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (10 changes)
  5. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (39 changes)
  6. ml-agents/mlagents/trainers/trainer/rl_trainer.py (17 changes)

com.unity.ml-agents/CHANGELOG.md (2 changes)

 settings. Unfortunately, this may require retraining models if it changes the resulting order of the sensors
 or actuators on your system. (#5194)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed a bug where the SAC replay buffer would not be saved out at the end of a run, even if `save_replay_buffer` was enabled. (#5205)
 ## [1.9.0-preview] - 2021-03-17
 ### Major Changes

ml-agents/mlagents/trainers/poca/trainer.py (5 changes)

         )
         agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+        self._append_to_update_buffer(agent_buffer_trajectory)
         # If this was a terminal trajectory, append stats and reset reward collection
         if trajectory.done_reached:

ml-agents/mlagents/trainers/ppo/trainer.py (6 changes)

         global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
         agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
         agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+        self._append_to_update_buffer(agent_buffer_trajectory)
         # If this was a terminal trajectory, append stats and reset reward collection
         if trajectory.done_reached:

ml-agents/mlagents/trainers/sac/trainer.py (10 changes)

         Save the training buffer's update buffer to a pickle file.
         """
         filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5")
-        logger.info(f"Saving Experience Replay Buffer to {filename}")
+        logger.info(f"Saving Experience Replay Buffer to {filename}...")
+        logger.info(
+            f"Saved Experience Replay Buffer ({os.path.getsize(filename)} bytes)."
+        )

     def load_replay_buffer(self) -> None:
         """

         agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
         agent_buffer_trajectory[BufferKey.DONE][-1] = False
-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+        self._append_to_update_buffer(agent_buffer_trajectory)
         if trajectory.done_reached:
             self._update_end_episode_stats(agent_id, self.optimizer)
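The logging change above replaces the single save message with a start message and a completion message that reports the written file's size. A standalone sketch of the same pattern, runnable outside ml-agents (logger setup, path, and payload are illustrative):

import logging
import os
import tempfile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sac_trainer_sketch")

artifact_path = tempfile.mkdtemp()
filename = os.path.join(artifact_path, "last_replay_buffer.hdf5")

logger.info(f"Saving Experience Replay Buffer to {filename}...")
with open(filename, "wb") as file_object:
    file_object.write(b"\x00" * 1024)  # stand-in for update_buffer.save_to_file()
logger.info(
    f"Saved Experience Replay Buffer ({os.path.getsize(filename)} bytes)."
)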

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (39 changes)

 @mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
-@mock.patch("mlagents.trainers.trainer.rl_trainer.RLTrainer._clear_update_buffer")
-def test_advance(mocked_clear_update_buffer, mocked_save_model):
+def test_advance(mocked_save_model):
     trainer = create_rl_trainer()
     mock_policy = mock.Mock()
     trainer.add_policy("TestBrain", mock_policy)

     with pytest.raises(AgentManagerQueue.Empty):
         policy_queue.get_nowait()
-    # Check that the buffer has been cleared
+    # Check that no model has been saved
-    assert mocked_clear_update_buffer.call_count > 0
     assert mocked_save_model.call_count == 0

         for step in checkpoint_range
     ]
     mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)

+def test_update_buffer_append():
+    trainer = create_rl_trainer()
+    mock_policy = mock.Mock()
+    trainer.add_policy("TestBrain", mock_policy)
+    trajectory_queue = AgentManagerQueue("testbrain")
+    policy_queue = AgentManagerQueue("testbrain")
+    trainer.subscribe_trajectory_queue(trajectory_queue)
+    trainer.publish_policy_queue(policy_queue)
+    time_horizon = 10
+    trajectory = mb.make_fake_trajectory(
+        length=time_horizon,
+        observation_specs=create_observation_specs_with_shapes([(1,)]),
+        max_step_complete=True,
+        action_spec=ActionSpec.create_discrete((2,)),
+    )
+    agentbuffer_trajectory = trajectory.to_agentbuffer()
+    assert trainer.update_buffer.num_experiences == 0
+    # Check that if we append, our update buffer gets longer.
+    # max_steps = 100
+    for i in range(10):
+        trainer._process_trajectory(trajectory)
+        trainer._append_to_update_buffer(agentbuffer_trajectory)
+        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
+    # Check that if we append after stopping training, nothing happens.
+    # We process enough trajectories to hit max steps
+    trainer.set_is_policy_updating(False)
+    trainer._process_trajectory(trajectory)
+    trainer._append_to_update_buffer(agentbuffer_trajectory)
+    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

 class RLTrainerWarningTest(unittest.TestCase):

ml-agents/mlagents/trainers/trainer/rl_trainer.py (17 changes)

         if step_after_process >= self._next_summary_step and self.get_step != 0:
             self._write_summary(self._next_summary_step)

+    def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None:
+        """
+        Append an AgentBuffer to the update buffer. If the trainer isn't training,
+        don't update to avoid a memory leak.
+        """
+        if self.should_still_train:
+            seq_len = (
+                self.trainer_settings.network_settings.memory.sequence_length
+                if self.trainer_settings.network_settings.memory is not None
+                else 1
+            )
+            agentbuffer_trajectory.resequence_and_append(
+                self.update_buffer, training_length=seq_len
+            )
+
     def _maybe_save_model(self, step_after_process: int) -> None:
         """
         If processing the trajectory will make the step exceed the next model write,

                         for q in self.policy_queues:
                             # Get policies that correspond to the policy queue in question
                             q.put(self.get_policy(q.behavior_id))
-        else:
-            self._clear_update_buffer()
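Taken together, the observable behavior is what the new test_update_buffer_append checks: the update buffer grows while the trainer is still training and, once training stops, keeps its contents but stops growing. A self-contained toy illustration (the ToyTrainer class below is hypothetical, not the ml-agents implementation):

class ToyTrainer:
    """Hypothetical stand-in for RLTrainer, only to show the buffer behavior."""

    def __init__(self, max_steps: int) -> None:
        self.update_buffer: list = []  # stand-in for the AgentBuffer
        self.step = 0
        self.max_steps = max_steps
        self._is_policy_updating = True

    @property
    def should_still_train(self) -> bool:
        return self._is_policy_updating and self.step < self.max_steps

    def set_is_policy_updating(self, value: bool) -> None:
        self._is_policy_updating = value

    def append_to_update_buffer(self, experiences: list) -> None:
        # The gate introduced by this commit: skip the append rather than
        # clearing the buffer later.
        if self.should_still_train:
            self.update_buffer.extend(experiences)


time_horizon = 10
trainer = ToyTrainer(max_steps=1000)
for i in range(10):
    trainer.append_to_update_buffer(list(range(time_horizon)))
    trainer.step += time_horizon
    assert len(trainer.update_buffer) == (i + 1) * time_horizon

# After training stops, further appends are ignored and nothing is cleared.
trainer.set_is_policy_updating(False)
trainer.append_to_update_buffer(list(range(time_horizon)))
assert len(trainer.update_buffer) == 10 * time_horizon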