浏览代码

simple rl asymm ghost tests

/develop/cubewars
Andrew Cohen 5 年前
当前提交
93d344ff
共有 3 个文件被更改,包括 64 次插入9 次删除
  1. 14
      ml-agents/mlagents/trainers/ghost/trainer.py
  2. 4
      ml-agents/mlagents/trainers/policy/tf_policy.py
  3. 55
      ml-agents/mlagents/trainers/tests/test_simple_rl.py

14
ml-agents/mlagents/trainers/ghost/trainer.py


self.steps_to_train_team = self_play_parameters.get("team_change", 100000)
if self.steps_to_train_team > self.get_max_steps:
logger.warning(
"The max steps of the GhostTrainer for behavior name {} is less than \
team change. This team will not face opposition that has been trained if the opposition \
is managed by a different GhostTrainer as in an asymmetric game.".format(
"The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \
opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \
asymmetric game.".format(
self.brain_name
)
)

self.next_summary_step = self.trainer.next_summary_step
self.trainer.advance()
if self.get_step - self.last_team_change > self.steps_to_train_team:
self.controller.finish_training(self.get_step)
self.last_team_change = self.get_step

policy.create_tf_graph()
self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
# for saving/swapping snapshots
policy.init_load_weights()
# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:

else:
# for saving/swapping snapshots
policy.init_load_weights()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""

else:
snapshot = self.current_policy_snapshot
x = "current"
self.current_opponent = -1 if x == "current" else x
name_to_policy_queue = self._team_to_name_to_policy_queue[team_id]
for brain_name in self._team_to_name_to_policy_queue[team_id]:

4
ml-agents/mlagents/trainers/policy/tf_policy.py


self.assign_ops.append(tf.assign(var, assign_ph))
def load_weights(self, values):
if len(self.assign_ops) == 0:
logger.warning(
"Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
)
with self.graph.as_default():
feed_dict = {}
for assign_ph, value in zip(self.assign_phs, values):

55
ml-agents/mlagents/trainers/tests/test_simple_rl.py


override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_current_best_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},

override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"play_against_current_best_ratio": 1.0,
_check_environment_trains(env, config, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.99
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_current_best_ratio": 1.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that us not learning when both have reached
# max step should be executing the initial, untrained poliy.
override_vals = {
"max_steps": 2000,
"self_play": {
"play_against_current_best_ratio": 0.0,
"save_steps": 5000,
"swap_steps": 5000,
"team_change": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
config[brain_name_opp] = config[BRAIN_NAME]
_check_environment_trains(env, config, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()

正在加载...
取消
保存