simple rl asymm ghost tests

5 年前 · 93d344ff
--- a/ml-agents/mlagents/trainers/ghost/trainer.py
+++ b/ml-agents/mlagents/trainers/ghost/trainer.py
        self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
        if self.steps_to_train_team > self.get_max_steps:
            logger.warning(
-                "The max steps of the GhostTrainer for behavior name {} is less than \
-            team change. This team will not face opposition that has been trained if the opposition \
-            is managed by a different GhostTrainer as in an asymmetric game.".format(
+                "The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \
+                opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \
+                asymmetric game.".format(
                    self.brain_name
                )
            )

        self.next_summary_step = self.trainer.next_summary_step
        self.trainer.advance()
-
        if self.get_step - self.last_team_change > self.steps_to_train_team:
            self.controller.finish_training(self.get_step)
            self.last_team_change = self.get_step
        policy.create_tf_graph()

        self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
+        # for saving/swapping snapshots
+        policy.init_load_weights()

        # First policy or a new agent on the same team encountered
        if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
+
-        else:
-            # for saving/swapping snapshots
-            policy.init_load_weights()

    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        """
            else:
                snapshot = self.current_policy_snapshot
                x = "current"
+
            self.current_opponent = -1 if x == "current" else x
            name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
            for brain_name in self._team_to_name_to_policy_queue[team_id]:
--- a/ml-agents/mlagents/trainers/policy/tf_policy.py
+++ b/ml-agents/mlagents/trainers/policy/tf_policy.py
                self.assign_ops.append(tf.assign(var, assign_ph))

    def load_weights(self, values):
+        if len(self.assign_ops) == 0:
+            logger.warning(
+                "Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
+            )
        with self.graph.as_default():
            feed_dict = {}
            for assign_ph, value in zip(self.assign_phs, values):
--- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
    override_vals = {
        "max_steps": 2500,
        "self_play": {
-            "play_against_current_self_ratio": 1.0,
+            "play_against_current_best_ratio": 1.0,
            "save_steps": 2000,
            "swap_steps": 2000,
        },
    override_vals = {
        "max_steps": 2500,
        "self_play": {
-            "play_against_current_self_ratio": 1.0,
+            "play_against_current_best_ratio": 1.0,
+    _check_environment_trains(env, config, success_threshold=None)
+    processed_rewards = [
+        default_reward_processor(rewards) for rewards in env.final_rewards.values()
+    ]
+    success_threshold = 0.99
+    assert any(reward > success_threshold for reward in processed_rewards) and any(
+        reward < success_threshold for reward in processed_rewards
+    )
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_asymm_ghost(use_discrete):
+    # Make opponent for asymmetric case
+    brain_name_opp = BRAIN_NAME + "Opp"
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+    )
+    override_vals = {
+        "max_steps": 2000,
+        "self_play": {
+            "play_against_current_best_ratio": 1.0,
+            "save_steps": 5000,
+            "swap_steps": 5000,
+            "team_change": 2000,
+        },
+    }
+    config = generate_config(PPO_CONFIG, override_vals)
+    config[brain_name_opp] = config[BRAIN_NAME]
+    _check_environment_trains(env, config)
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_simple_asymm_ghost_fails(use_discrete):
+    # Make opponent for asymmetric case
+    brain_name_opp = BRAIN_NAME + "Opp"
+    env = SimpleEnvironment(
+        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
+    )
+    # This config should fail because the team that is not learning when both have reached
+    # max step should be executing the initial, untrained policy.
+    override_vals = {
+        "max_steps": 2000,
+        "self_play": {
+            "play_against_current_best_ratio": 0.0,
+            "save_steps": 5000,
+            "swap_steps": 5000,
+            "team_change": 2000,
+        },
+    }
+    config = generate_config(PPO_CONFIG, override_vals)
+    config[brain_name_opp] = config[BRAIN_NAME]
    _check_environment_trains(env, config, success_threshold=None)
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()