[bug-fix] Don't load non-wrapped policy (#4593)

* Always initialize non-wrapped policy * Load ghosted policy * Update changelog * Resume test * Add test * Add torch test and fix torch.
4 年前 · b5dd43f2
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
 if they are called recursively (for example, if they call `Agent.EndEpisode()`).
 Previously, this would result in an infinite loop and cause the editor to hang. (#4573)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed an issue where runs could not be resumed when using TensorFlow and Ghost Training. (#4593)


 ## [1.5.0-preview] - 2020-10-14
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
    @property
    def reward_buffer(self) -> Deque[float]:
        """
-         Returns the reward buffer. The reward buffer contains the cumulative
-         rewards of the most recent episodes completed by agents using this
-         trainer.
-         :return: the reward buffer.
-         """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
        return self.trainer.reward_buffer

    @property
        policy = self.trainer.create_policy(
            parsed_behavior_id, behavior_spec, create_graph=True
        )
-        self.trainer.model_saver.initialize_or_load(policy)
        team_id = parsed_behavior_id.team_id
        self.controller.subscribe_team_id(team_id, self)

            self._save_snapshot()  # Need to save after trainer initializes policy
            self._learning_team = self.controller.get_learning_team
            self.wrapped_trainer_team = team_id
+        else:
+            # Load the weights of the ghost policy from the wrapped one
+            policy.load_weights(
+                self.trainer.get_policy(parsed_behavior_id).get_weights()
+            )
        return policy

    def add_policy(
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
        np.testing.assert_array_equal(w, lw)


+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
--- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
        np.testing.assert_array_equal(w, lw)


+def test_resume(dummy_config, tmp_path):
+    mock_specs = mb.setup_test_behavior_specs(
+        True, False, vector_action_space=[2], vector_obs_space=1
+    )
+    behavior_id_team0 = "test_brain?team=0"
+    behavior_id_team1 = "test_brain?team=1"
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
+    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
+    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model()
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1