
self-play assym hacked branch

/develop/self-playassym
Andrew Cohen, 4 years ago
Current commit bd78ec40
3 files changed, 69 insertions and 22 deletions
  1. ml-agents/mlagents/trainers/ghost/trainer.py (59 changes)
  2. ml-agents/mlagents/trainers/learn.py (9 changes)
  3. ml-agents/mlagents/trainers/trainer_controller.py (23 changes)

ml-agents/mlagents/trainers/ghost/trainer.py (59 changes)


self.policy_snapshots: List[Any] = []
self.snapshot_counter: int = 0
self.learning_behavior_name: str = None
self.global_learning_behavior_name: str = None
self.current_policy_snapshot = None
self.last_save = 0
self.last_swap = 0
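Note: the fields above track a rolling set of past policy snapshots plus which behavior is currently learning, locally and globally. As an aside, a minimal sketch of how such a snapshot window can be cycled with a counter like snapshot_counter; the window size and the weights payload below are assumptions, not code from this commit:

    # Illustrative sketch only, not part of the diff.
    from typing import Any, List

    class SnapshotBuffer:
        def __init__(self, window: int = 10) -> None:
            self.policy_snapshots: List[Any] = []
            self.snapshot_counter: int = 0
            self.window = window

        def save(self, weights: Any) -> None:
            # Append until the window is full, then overwrite the oldest slot.
            if len(self.policy_snapshots) < self.window:
                self.policy_snapshots.append(weights)
            else:
                self.policy_snapshots[self.snapshot_counter] = weights
            self.snapshot_counter = (self.snapshot_counter + 1) % self.window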

Returns the number of steps the trainer has performed
:return: the step count of the trainer
"""
return self.step
# return self.trainer.get_step
@property
def reward_buffer(self) -> Deque[float]:

"""
return self.trainer.reward_buffer
# def _write_summary(self, step: int) -> None:
#     """
#     Saves training statistics to Tensorboard.
#     """
#     opponents = np.array(self.policy_elos, dtype=np.float32)
#     LOGGER.info(
#         " Learning brain {} ELO: {:0.3f}\n"
#         "Mean Opponent ELO: {:0.3f}"
#         " Std Opponent ELO: {:0.3f}".format(
#             self.learning_behavior_name,
#             self.current_elo,
#             opponents.mean(),
#             opponents.std(),
#         )
#     )
#     self.stats_reporter.add_stat("ELO", self.current_elo)
def _process_trajectory(self, trajectory: Trajectory) -> None:
if trajectory.done_reached and not trajectory.max_step_reached:

)
self.current_elo += change
self.policy_elos[self.current_opponent] -= change
super()._process_trajectory(trajectory)
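The change applied above is an Elo rating delta between the learning policy and the opponent snapshot it just played. A hedged sketch of the standard Elo arithmetic; the K-factor of 16 is an assumption, not read from this diff:

    # Standard Elo update, for illustration only.
    def elo_change(rating: float, opponent_rating: float, result: float, k: float = 16.0) -> float:
        # result: 1.0 win, 0.5 draw, 0.0 loss for the side holding `rating`.
        expected = 1.0 / (1.0 + 10.0 ** ((opponent_rating - rating) / 400.0))
        return k * (result - expected)

    # Example: a 1200-rated learner beating a 1200-rated snapshot gains 8 points;
    # mirroring the hunk above, the opponent's entry is decreased by the same amount.
    change = elo_change(1200.0, 1200.0, 1.0)  # 8.0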
def _is_ready_update(self) -> bool:
return False

for _ in range(traj_queue.maxlen):
t = traj_queue.get_nowait()
# adds to wrapped trainers queue
if (
    self.global_learning_behavior_name
    == internal_traj_queue.behavior_id
):
    internal_traj_queue.put(t)
self._process_trajectory(t)
except AgentManagerQueue.Empty:
pass
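As far as this hunk shows, the loop drains trajectories without blocking and, under this hack, forwards them to the wrapped trainer only when the behavior holds the global learning role. A self-contained sketch of the same pattern, using the standard-library queue in place of AgentManagerQueue; every name below is a stand-in:

    # Illustrative only: drain at most maxsize items without blocking and forward
    # them to an internal queue only for the globally-learning behavior.
    import queue

    def drain(traj_queue: queue.Queue, internal_q: queue.Queue,
              behavior_id: str, global_learning_behavior_name: str) -> None:
        try:
            for _ in range(traj_queue.maxsize):
                t = traj_queue.get_nowait()
                if behavior_id == global_learning_behavior_name:
                    internal_q.put(t)  # only this behavior's wrapped trainer sees it
                # Elo bookkeeping (_process_trajectory in the diff) would still run here.
        except queue.Empty:
            pass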

# self._maybe_write_summary(self.get_step)
for internal_q in self.internal_policy_queues:
# Get policies that correspond to the policy queue in question

for q in self.policy_queues:
name_behavior_id = q.behavior_id
# here is the place for a sampling protocol
if (
    name_behavior_id == self.global_learning_behavior_name
):  # self.learning_behavior_name:
    continue
elif np.random.uniform() < (1 - self.play_against_current_self_ratio):
x = np.random.randint(len(self.policy_snapshots))

self.policy_elos[-1] = self.current_elo
self.current_opponent = -1 if x == "current" else x
LOGGER.debug(
    "Step {}: Swapping snapshot {} to id {} with {} local learning and {} global learning".format(
        self.get_step,
        x,
        name_behavior_id,
        self.learning_behavior_name,
        self.global_learning_behavior_name,
    )
)
policy = self.get_policy(name_behavior_id)
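The swap logic above decides, per non-learning behavior, whether to face the current policy or a uniformly sampled past snapshot, and records which opponent was chosen for the Elo update ("current" maps to index -1). A standalone sketch of that sampling protocol; the function itself is an illustration, not the commit's code:

    import numpy as np
    from typing import Any, List, Tuple, Union

    def sample_opponent(policy_snapshots: List[Any], current_snapshot: Any,
                        play_against_current_self_ratio: float) -> Tuple[Union[int, str], Any]:
        # With probability (1 - ratio), play a uniformly sampled past snapshot;
        # otherwise play the current policy.
        if policy_snapshots and np.random.uniform() < (1 - play_against_current_self_ratio):
            x = np.random.randint(len(policy_snapshots))
            return x, policy_snapshots[x]
        return "current", current_snapshot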

ml-agents/mlagents/trainers/learn.py (9 changes)


help="Number of parallel environments to use for training",
)
argparser.add_argument(
"--ghost-swap",
default=1000,
type=int,
help="Number of global steps between ghost trainer swaps for asymmetric games",
)
argparser.add_argument(
"--docker-target-name",
default=None,
dest="docker_target_name",

keep_checkpoints: int = parser.get_default("keep_checkpoints")
base_port: int = parser.get_default("base_port")
num_envs: int = parser.get_default("num_envs")
ghost_swap: int = parser.get_default("ghost_swap")
curriculum_config: Optional[Dict] = None
lesson: int = parser.get_default("lesson")
no_graphics: bool = parser.get_default("no_graphics")

run_seed,
sampler_manager,
resampling_interval,
options.ghost_swap,
)
# Begin training
try:

ml-agents/mlagents/trainers/trainer_controller.py (23 changes)


from mlagents.trainers.sampler_class import SamplerManager
from mlagents_envs.timers import hierarchical_timer, get_timer_tree, timed
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

training_seed: int,
sampler_manager: SamplerManager,
resampling_interval: Optional[int],
ghost_swap: int,
):
"""
:param model_path: Path to save the model.

self.meta_curriculum = meta_curriculum
self.sampler_manager = sampler_manager
self.resampling_interval = resampling_interval
self.ghost_swap = ghost_swap
self.ghost_index: int = 0
self.ghost_names: List[str] = []
np.random.seed(training_seed)
tf.set_random_seed(training_seed)

trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
if isinstance(trainer, GhostTrainer):
self.ghost_names.append(name_behavior_id)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:

n_steps = self.advance(env_manager)
for _ in range(n_steps):
global_step += 1
if global_step % self.ghost_swap == 0 or global_step == 1:
for trainer in self.trainers.values():
if isinstance(trainer, GhostTrainer):
trainer.global_learning_behavior_name = self.ghost_names[
self.ghost_index
]
self.logger.debug(
"Global step {}: Swapping {} as global learning".format(
global_step, self.ghost_names[self.ghost_index]
)
)
self.ghost_index = (self.ghost_index + 1) % len(
self.ghost_names
)
self.reset_env_if_ready(env_manager, global_step)
if self._should_save_model(global_step):
self._save_model()
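Putting the controller change together: every ghost_swap global steps (and on the first step), one ghost behavior is promoted to the global learning role and the index advances round-robin. A standalone sketch of that selection, assuming the role advances once per swap; the helper below is an illustration, not the TrainerController code:

    from typing import List, Optional, Tuple

    def maybe_swap_global_learner(global_step: int, ghost_swap: int,
                                  ghost_names: List[str],
                                  ghost_index: int) -> Tuple[int, Optional[str]]:
        # Returns (new_index, selected_name); selected_name is None when no swap happens.
        if not ghost_names or (global_step != 1 and global_step % ghost_swap != 0):
            return ghost_index, None
        selected = ghost_names[ghost_index]  # this behavior becomes the global learner
        return (ghost_index + 1) % len(ghost_names), selected

Each GhostTrainer would then receive global_learning_behavior_name = selected, matching the assignment in the hunk above.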
