
ELO calculation done in ghost controller

/asymm-envs
Andrew Cohen, 5 years ago
Commit 79076b70
4 files changed, 80 insertions(+), 80 deletions(-)
  1. ml-agents/mlagents/trainers/ghost/controller.py (41 changes)
  2. ml-agents/mlagents/trainers/ghost/trainer.py (102 changes)
  3. ml-agents/mlagents/trainers/stats.py (16 changes)
  4. ml-agents/mlagents/trainers/tests/test_simple_rl.py (1 change)

ml-agents/mlagents/trainers/ghost/controller.py (41 changes)


-from typing import Deque, List
+from typing import Deque, Dict
+from mlagents.trainers.ghost.trainer import GhostTrainer

 class GhostController(object):

         self._queue: Deque[int] = deque(maxlen=maxlen)
         self._learning_team: int = 0
-        self._subscribed_teams: List[int] = []
+        self._ghost_trainers: Dict[int, GhostTrainer] = {}

-    def subscribe_team_id(self, team_id: int) -> None:
-        if team_id not in self._subscribed_teams:
-            self._subscribed_teams.append(team_id)
+    def subscribe_team_id(self, team_id: int, trainer: GhostTrainer) -> None:
+        if team_id not in self._ghost_trainers:
+            self._ghost_trainers[team_id] = trainer
-        self.subscribe_team_id(self._learning_team)
-        self._queue.append(self._learning_team)

-# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
-# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
-# ELO calculation
-def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
-    r1 = pow(10, rating1 / 400)
-    r2 = pow(10, rating2 / 400)
-    summed = r1 + r2
-    e1 = r1 / summed
-    change = result - e1
-    return change
+    # Adapted from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
+    # https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
+    # ELO calculation
+    def compute_elo_rating_changes(self, rating: float, result: float) -> float:
+        opponent_rating: float = 0.0
+        for team_id, trainer in self._ghost_trainers.items():
+            if team_id != self._learning_team:
+                opponent_rating = trainer.get_opponent_elo()
+        r1 = pow(10, rating / 400)
+        r2 = pow(10, opponent_rating / 400)
+        summed = r1 + r2
+        e1 = r1 / summed
+        change = result - e1
+        for team_id, trainer in self._ghost_trainers.items():
+            if team_id != self._learning_team:
+                trainer.change_opponent_elo(change)
+        return change
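For reference, compute_elo_rating_changes implements the standard Elo expected-score update: with r = 10^(rating / 400) for each side, the learning team's expected score is e1 = r1 / (r1 + r2), and the returned change is result - e1, which is then added to the learning team's rating and subtracted from the deployed opponent's. A minimal standalone sketch with made-up ratings (the function name and numbers are illustrative, not part of the commit):

    # Standalone sketch of the Elo update used above; names and values are illustrative.
    def elo_change(rating: float, opponent_rating: float, result: float) -> float:
        # result is 1.0 / 0.5 / 0.0 for a win / draw / loss of the learning team
        r1 = pow(10, rating / 400)
        r2 = pow(10, opponent_rating / 400)
        expected = r1 / (r1 + r2)
        return result - expected

    # Example: a 1200-rated learner beats a 1200-rated opponent snapshot.
    change = elo_change(1200.0, 1200.0, 1.0)  # expected score 0.5 -> change = +0.5
    learner_elo = 1200.0 + change             # 1200.5
    opponent_elo = 1200.0 - change            # 1199.5 (the update is zero-sum)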

ml-agents/mlagents/trainers/ghost/trainer.py (102 changes)


         self.trainer = trainer
         self.controller = controller
-        self.internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
-        self.internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
+        self._name_to_trajectory_queue: Dict[str, AgentManagerQueue[Trajectory]] = {}
+        self._name_to_policy_queue: Dict[str, AgentManagerQueue[Policy]] = {}
+        self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
+        self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
         self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}

         )
         self.steps_between_save = self_play_parameters.get("save_steps", 20000)
         self.steps_between_swap = self_play_parameters.get("swap_steps", 20000)
+        self.ghost_step: int = 0
+        # wrapped_trainer_team and learning team need to be separate
+        # in the situation where new agents are created/destroyed
+        # after the learning team switches. These agents need to be added
+        # to trainers properly.
+        self.wrapped_trainer_team: int = None
         self.current_policy_snapshot = None
         self.last_save = 0
         self.last_swap = 0

-        self.current_elo: float = self.initial_elo
         self.policy_elos: List[float] = [self.initial_elo] * (
             self.window + 1
         )  # for learning policy

"""
return self.trainer.reward_buffer
@property
def current_elo(self) -> float:
return self.policy_elos[-1]
def change_current_elo(self, change: float) -> None:
self.policy_elos[-1] += change
def get_opponent_elo(self) -> float:
return self.policy_elos[self.current_opponent]
def change_opponent_elo(self, change: float) -> None:
self.policy_elos[self.current_opponent] -= change
     def _process_trajectory(self, trajectory: Trajectory) -> None:
         if trajectory.done_reached and not trajectory.max_step_reached:
             # Assumption is that final reward is 1/.5/0 for win/draw/loss

             elif final_reward < 0:
                 result = 0.0
-            change = compute_elo_rating_changes(
-                self.current_elo, self.policy_elos[self.current_opponent], result
+            change = self.controller.compute_elo_rating_changes(
+                self.current_elo, result
             )
-            self.current_elo += change
-            self.policy_elos[self.current_opponent] -= change
-            opponents = np.array(self.policy_elos, dtype=np.float32)
+            self.change_current_elo(change)
+            # opponents = np.array(self.policy_elos, dtype=np.float32)
-            self._stats_reporter.add_stat(
-                "Self-play/Mean Opponent ELO", opponents.mean()
-            )
-            self._stats_reporter.add_stat("Self-play/Std Opponent ELO", opponents.std())
+            # self._stats_reporter.add_stat(
+            #     "Self-play/Mean Opponent ELO", opponents.mean()
+            # )
+            # self._stats_reporter.add_stat("Self-play/Std Opponent ELO", opponents.std())

     def advance(self) -> None:
         """

             ]
             if parsed_behavior_id.team_id == self.learning_team:
                 # With a future multiagent trainer, this will be indexed by 'role'
-                internal_trajectory_queue = self.internal_trajectory_queues[
+                internal_trajectory_queue = self._internal_trajectory_queues[
                     parsed_behavior_id.brain_name
                 ]
                 try:

                 # Dump trajectories from non-learning policy
                 try:
                     for _ in range(trajectory_queue.maxlen):
-                        trajectory_queue.get_nowait()
+                        t = trajectory_queue.get_nowait()
+                        # count ghost steps
+                        self.ghost_step += len(t.steps)
                 except AgentManagerQueue.Empty:
                     pass

             ]
             if parsed_behavior_id.team_id == self.learning_team:
                 # With a future multiagent trainer, this will be indexed by 'role'
-                internal_policy_queue = self.internal_policy_queues[
+                internal_policy_queue = self._internal_policy_queues[
                     parsed_behavior_id.brain_name
                 ]
                 # Get policies that correspond to the policy queue in question

                 except AgentManagerQueue.Empty:
                     pass
-        if self.get_step - self.last_save > self.steps_between_save:
+        if self.ghost_step - self.last_save > self.steps_between_save:
-            self.last_save = self.get_step
+            self.last_save = self.ghost_step
-        if self.get_step - self.last_swap > self.steps_between_swap:
+        if self.ghost_step - self.last_swap > self.steps_between_swap:
-            self.last_swap = self.get_step
+            self.last_swap = self.ghost_step
-        self.learning_team = self.controller.get_learning_team(self.get_step)
+        self.learning_team = self.controller.get_learning_team(self.ghost_step)

     def end_episode(self):
         self.trainer.end_episode()

"""
name_behavior_id = parsed_behavior_id.behavior_id
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id)
self.controller.subscribe_team_id(team_id, self)
# First policy encountered
if not self.learning_team:
# First policy or a new agent on the same team encountered
if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
self.wrapped_trainer_team = team_id
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM, team_id)
else:
# for saving/swapping snapshots

             else:
                 snapshot = self.current_policy_snapshot
                 x = "current"
-                self.policy_elos[-1] = self.current_elo
             self.current_opponent = -1 if x == "current" else x
             logger.debug(
                 "Step {}: Swapping snapshot {} to id {} with {} learning".format(

"""
super().publish_policy_queue(policy_queue)
parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
self._name_to_policy_queue[parsed_behavior_id.behavior_id] = policy_queue
if parsed_behavior_id.team_id == self.learning_team:
if parsed_behavior_id.team_id == self.wrapped_trainer_team:
# With a future multiagent trainer, this will be indexed by 'role'
self.internal_policy_queues[
self._internal_policy_queues[
parsed_behavior_id.brain_name
] = internal_policy_queue
self.trainer.publish_policy_queue(internal_policy_queue)

         parsed_behavior_id = self._name_to_parsed_behavior_id[
             trajectory_queue.behavior_id
         ]
+        self._name_to_trajectory_queue[
+            parsed_behavior_id.behavior_id
+        ] = trajectory_queue
-        if parsed_behavior_id.team_id == self.learning_team:
+        if parsed_behavior_id.team_id == self.wrapped_trainer_team:
-            self.internal_trajectory_queues[
+            self._internal_trajectory_queues[

-# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
-# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
-# ELO calculation
-def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
-    r1 = pow(10, rating1 / 400)
-    r2 = pow(10, rating2 / 400)
-    summed = r1 + r2
-    e1 = r1 / summed
-    change = result - e1
-    return change
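Taken together, the trainer changes make policy_elos the single source of truth for ratings: the learning policy's rating sits in the last slot (exposed through the current_elo property) and the currently deployed opponent snapshot's rating in policy_elos[self.current_opponent]. The GhostTrainer applies its own share of the update via change_current_elo, while the GhostController adjusts the opponents via change_opponent_elo. A condensed, self-contained sketch of that bookkeeping pattern (the class below is illustrative, not the actual GhostTrainer):

    from typing import List

    class EloLedger:
        # Illustrative stand-in for the GhostTrainer's rating bookkeeping.
        def __init__(self, initial_elo: float = 1200.0, window: int = 10):
            # one slot per stored snapshot, plus the last slot for the learning policy
            self.policy_elos: List[float] = [initial_elo] * (window + 1)
            self.current_opponent: int = -1  # -1 means "playing the current policy"

        @property
        def current_elo(self) -> float:
            return self.policy_elos[-1]

        def change_current_elo(self, change: float) -> None:
            self.policy_elos[-1] += change

        def get_opponent_elo(self) -> float:
            return self.policy_elos[self.current_opponent]

        def change_opponent_elo(self, change: float) -> None:
            self.policy_elos[self.current_opponent] -= change

    ledger = EloLedger()
    ledger.current_opponent = 3      # opponent snapshot picked at swap time
    ledger.change_current_elo(0.5)   # learner won against an equal-rated snapshot
    ledger.change_opponent_elo(0.5)  # the snapshot's rating drops by the same amount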

ml-agents/mlagents/trainers/stats.py (16 changes)


         )
         if self.self_play and "Self-play/ELO" in values:
             elo_stats = values["Self-play/ELO"]
-            mean_opponent_elo = values["Self-play/Mean Opponent ELO"]
-            std_opponent_elo = values["Self-play/Std Opponent ELO"]
+            # mean_opponent_elo = values["Self-play/Mean Opponent ELO"]
+            # std_opponent_elo = values["Self-play/Std Opponent ELO"]
-                "{} Team {}: ELO: {:0.3f}. "
-                "Mean Opponent ELO: {:0.3f}. "
-                "Std Opponent ELO: {:0.3f}. ".format(
-                    category,
-                    self.self_play_team,
-                    elo_stats.mean,
-                    mean_opponent_elo.mean,
-                    std_opponent_elo.mean,
+                "{} Team {}: ELO: {:0.3f}. ".format(
+                    category, self.self_play_team, elo_stats.mean
+                    # mean_opponent_elo.mean,
+                    # std_opponent_elo.mean,
                 )
         else:
             logger.info(
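With the mean/std opponent-Elo stats commented out, the console summary line now reports only the learning team's Elo. A quick illustration of the string the remaining format call produces (category, team id, and rating below are made up):

    category = "SoccerTwos"   # illustrative behavior category
    self_play_team = 0        # illustrative team id
    elo_mean = 1223.417       # illustrative running mean of "Self-play/ELO"

    line = "{} Team {}: ELO: {:0.3f}. ".format(category, self_play_team, elo_mean)
    print(line)  # SoccerTwos Team 0: ELO: 1223.417.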

ml-agents/mlagents/trainers/tests/test_simple_rl.py (1 change)


         keep_checkpoints=1,
         train_model=True,
         load_model=False,
-        ghost_swap=10000,
         seed=seed,
         meta_curriculum=meta_curriculum,
         multi_gpu=False,
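Dropping the ghost_swap keyword from the test setup is consistent with the trainer changes above: the swap cadence is now read from the self-play hyperparameters (self_play_parameters.get("swap_steps", ...)) rather than passed to the training loop. A hedged sketch of such a parameter dict; the keys appear in this diff, the values are made up:

    # Illustrative self-play parameters; only save_steps, swap_steps and window
    # are referenced in this diff, and the values below are examples.
    self_play_parameters = {
        "save_steps": 20000,  # snapshot the learning policy every 20k ghost steps
        "swap_steps": 10000,  # swap the opponent snapshot every 10k ghost steps
        "window": 10,         # number of past snapshots kept
    }

    steps_between_save = self_play_parameters.get("save_steps", 20000)
    steps_between_swap = self_play_parameters.get("swap_steps", 20000)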
