
ghost controller

/asymm-envs
Andrew Cohen, 5 years ago
Current commit
94654de4
4 changed files with 86 additions and 27 deletions
1. ml-agents/mlagents/trainers/ghost/trainer.py (61 lines changed)
2. ml-agents/mlagents/trainers/learn.py (10 lines changed)
3. ml-agents/mlagents/trainers/trainer_util.py (6 lines changed)
4. ml-agents/mlagents/trainers/ghost/controller.py (36 lines changed)

ml-agents/mlagents/trainers/ghost/trainer.py (61 lines changed)


class GhostTrainer(Trainer):
    def __init__(
        self,
        trainer,
        brain_name,
        controller,
        reward_buff_cap,
        trainer_parameters,
        training,
        run_id,
    ):
        """
        :param controller: Object that coordinates all ghost trainers
        :param reward_buff_cap: Max reward history to track in the reward buffer
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        """
        self.trainer = trainer
        self.controller = controller
        self.internal_policy_queues: List[AgentManagerQueue[Policy]] = []
        self.internal_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []

"""
Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance
"""
for traj_queue, internal_traj_queue in zip(
self.trajectory_queues, self.internal_trajectory_queues
):
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
for traj_queue in self.trajectory_queues:
if traj_queue.behavior_id == self.learning_behavior_name:
for internal_traj_queue in self.internal_trajectory_queues:
try:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
else:
# Dump trajectories from non-learning policy
try:
for _ in range(traj_queue.maxlen):
traj_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
self.next_summary_step = self.trainer.next_summary_step
self.trainer.advance()

self._swap_snapshots()
self.last_swap = self.get_step
# Dump trajectories from non-learning policy
for traj_queue in self.ignored_trajectory_queues:
try:
for _ in range(traj_queue.maxlen):
traj_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
self.learning_behavior_name = self.controller.get_learning_id(self.get_step)
    def end_episode(self):
        self.trainer.end_episode()

        :param name_behavior_id: Behavior ID that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.controller.subscribe_behavior_id(name_behavior_id)
        self.policies[name_behavior_id] = policy
        policy.create_tf_graph()

        Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from.
        :param queue: Trajectory queue to publish to.
        """
        super().subscribe_trajectory_queue(trajectory_queue)
        internal_trajectory_queue: AgentManagerQueue[
            Trajectory
        ] = AgentManagerQueue(trajectory_queue.behavior_id)

        else:
            self.ignored_trajectory_queues.append(trajectory_queue)
# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and

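Both versions of the advance() loop above lean on the same draining pattern: pull at most a fixed number of items from a trajectory queue per call, and treat an empty queue as a normal condition, so a producer that outruns the trainer never leaves stale, off-policy trajectories behind. A minimal sketch of that pattern, using the standard library's queue.Queue as a stand-in for AgentManagerQueue (the function name and types here are illustrative, not the ml-agents API):

    import queue
    from typing import List

    def drain(traj_queue: "queue.Queue[str]", max_items: int) -> List[str]:
        # Pull at most max_items trajectories; stop early if the queue empties.
        drained: List[str] = []
        try:
            for _ in range(max_items):
                drained.append(traj_queue.get_nowait())
        except queue.Empty:
            # Nothing left this step; the caller simply tries again on the next advance().
            pass
        return drained

    q: "queue.Queue[str]" = queue.Queue()
    for i in range(5):
        q.put(f"trajectory-{i}")
    print(drain(q, max_items=3))  # ['trajectory-0', 'trajectory-1', 'trajectory-2']
    print(drain(q, max_items=3))  # ['trajectory-3', 'trajectory-4']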
ml-agents/mlagents/trainers/learn.py (10 lines changed)


        type=int,
        help="Number of parallel environments to use for training",
    )
    argparser.add_argument(
        "--ghost-swap",
        default=50000,
        type=int,
        help="Number of trainer steps between swapping behavior id being ghosted",
    )
    argparser.add_argument(
        "--docker-target-name",
        default=None,
keep_checkpoints: int = parser.get_default("keep_checkpoints")
base_port: int = parser.get_default("base_port")
num_envs: int = parser.get_default("num_envs")
ghost_swap: int = parser.get_default("ghost_swap")
curriculum_config: Optional[Dict] = None
lesson: int = parser.get_default("lesson")
no_graphics: bool = parser.get_default("no_graphics")

        options.keep_checkpoints,
        options.train_model,
        options.load_model,
        options.ghost_swap,
        run_seed,
        maybe_meta_curriculum,
        options.multi_gpu,

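The learn.py change is purely plumbing: the new --ghost-swap flag (default 50000) is parsed like any other integer option and eventually becomes the GhostController's swap interval. A stripped-down sketch of just the argparse part, assuming a standalone parser rather than the full ml-agents CLI:

    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--ghost-swap",
        default=50000,
        type=int,
        help="Number of trainer steps between swapping behavior id being ghosted",
    )

    # e.g. passing --ghost-swap 100000 on the command line would yield:
    args = argparser.parse_args(["--ghost-swap", "100000"])
    print(args.ghost_swap)  # 100000, later handed to GhostController(ghost_swap)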
ml-agents/mlagents/trainers/trainer_util.py (6 lines changed)


from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
logger = logging.getLogger("mlagents.trainers")

        keep_checkpoints: int,
        train_model: bool,
        load_model: bool,
        ghost_swap: int,
        seed: int,
        meta_curriculum: MetaCurriculum = None,
        multi_gpu: bool = False,

        self.seed = seed
        self.meta_curriculum = meta_curriculum
        self.multi_gpu = multi_gpu
        self.ghost_controller = GhostController(ghost_swap)

    def generate(self, brain_name: str) -> Trainer:
        return initialize_trainer(

            self.keep_checkpoints,
            self.train_model,
            self.load_model,
            self.ghost_controller,
            self.seed,
            self.meta_curriculum,
            self.multi_gpu,

    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    ghost_controller: GhostController,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,

    trainer = GhostTrainer(
        trainer,
        brain_name,
        ghost_controller,
        min_lesson_length,
        trainer_parameters,
        train_model,

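The important detail in trainer_util.py is that the factory builds a single GhostController in its constructor and hands that same object to every GhostTrainer it creates, which is what allows one controller to coordinate swaps across all ghosted behaviors. A toy sketch of that sharing, with stub classes standing in for the real ml-agents types:

    class StubController:
        def __init__(self, swap_interval: int) -> None:
            self.swap_interval = swap_interval

    class StubFactory:
        def __init__(self, ghost_swap: int) -> None:
            # Created once here, not inside generate().
            self.ghost_controller = StubController(ghost_swap)

        def generate(self, brain_name: str):
            # Every generated trainer receives the same controller instance.
            return (brain_name, self.ghost_controller)

    factory = StubFactory(ghost_swap=50000)
    _, c1 = factory.generate("striker")
    _, c2 = factory.generate("goalie")
    assert c1 is c2  # all ghost trainers share one coordinating controller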
ml-agents/mlagents/trainers/ghost/controller.py (36 lines changed)


from typing import Deque
from collections import deque


class GhostController(object):
    def __init__(self, swap_interval: int, maxlen: int = 10):
        self._swap_interval = swap_interval
        self._last_swap: int = 0
        self._queue: Deque[int] = deque(maxlen=maxlen)
        self._learning_team: int = 0

    def subscribe_team_id(self, team_id: int) -> None:
        self._queue.append(team_id)

    def get_learning_id(self, step: int) -> int:
        if step >= self._swap_interval + self._last_swap:
            self._last_swap = step
            self.subscribe_team_id(self._learning_team)
            self._learning_team = self._queue.popleft()
        return self._learning_team
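Assuming the class above is complete as shown, the deque gives a simple rotation: once swap_interval steps have passed since the last swap, the current learning team is pushed onto the back of the queue and the next team is popped off the front. A short trace with two teams and swap_interval=1000 (note that because _learning_team defaults to 0 and team 0 is also subscribed, the first swap pops that same 0 back out, so the learning team only changes on the second swap):

    controller = GhostController(swap_interval=1000)
    controller.subscribe_team_id(0)
    controller.subscribe_team_id(1)

    print(controller.get_learning_id(step=500))   # 0 -- under 1000 steps since last swap, no change
    print(controller.get_learning_id(step=1000))  # 0 -- swap occurs, but the queued 0 is popped first
    print(controller.get_learning_id(step=2000))  # 1 -- second swap rotates to team 1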
# Taken from https://github.com/Unity-Technologies/ml-agents/pull/1975 and
# https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
# ELO calculation
def compute_elo_rating_changes(rating1: float, rating2: float, result: float) -> float:
    r1 = pow(10, rating1 / 400)
    r2 = pow(10, rating2 / 400)
    summed = r1 + r2
    e1 = r1 / summed
    change = result - e1
    return change
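compute_elo_rating_changes returns the raw difference between the actual result and player 1's expected score e1; any scaling (for example by an ELO K-factor) is left to the caller. A quick worked example, with rating1 = 1200, rating2 = 1000 and a win for player 1:

    # r1 = 10 ** (1200 / 400) = 1000.0
    # r2 = 10 ** (1000 / 400) ≈ 316.23
    # e1 = 1000.0 / (1000.0 + 316.23) ≈ 0.76
    print(compute_elo_rating_changes(1200.0, 1000.0, 1.0))  # ≈ 0.24
    # A draw (result = 0.5) gives ≈ -0.26 and a loss (result = 0.0) gives ≈ -0.76.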