
team id centric ghost trainer

/asymm-envs
Andrew Cohen, 5 years ago
Commit 9f09a65d
7 files changed, 116 insertions and 78 deletions
  1. ml-agents/mlagents/trainers/behavior_id_utils.py (22 changes)
  2. ml-agents/mlagents/trainers/ghost/controller.py (9 changes)
  3. ml-agents/mlagents/trainers/ghost/trainer.py (139 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)
  5. ml-agents/mlagents/trainers/sac/trainer.py (5 changes)
  6. ml-agents/mlagents/trainers/trainer/trainer.py (5 changes)
  7. ml-agents/mlagents/trainers/trainer_controller.py (7 changes)

ml-agents/mlagents/trainers/behavior_id_utils.py (22 changes)


-from typing import Dict, NamedTuple
+from typing import NamedTuple
-name_behavior_id: str
+behavior_id: str
-behavior_ids: Dict[str, int]
+team_id: int
@staticmethod
def from_name_behavior_id(name_behavior_id: str) -> "BehaviorIdentifiers":

:returns: A BehaviorIdentifiers object.
"""
-ids: Dict[str, int] = {}
+team_id: int = 0
-name, identifiers = name_behavior_id.rsplit("?", 1)
-if "&" in identifiers:
-list_of_identifiers = identifiers.split("&")
-else:
-list_of_identifiers = [identifiers]
-for identifier in list_of_identifiers:
-key, value = identifier.split("=")
-ids[key] = int(value)
+name, team_and_id = name_behavior_id.rsplit("?", 1)
+_, team_id_str = team_and_id.split("=")
+team_id = int(team_id_str)
-name_behavior_id=name_behavior_id, brain_name=name, behavior_ids=ids
+behavior_id=name_behavior_id, brain_name=name, team_id=team_id
)
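For reference, a minimal sketch of what the reworked parser now returns; the behavior name "Soccer?team=1" is a made-up example, not taken from this commit:

# Hypothetical usage of the new BehaviorIdentifiers API shown above.
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

parsed = BehaviorIdentifiers.from_name_behavior_id("Soccer?team=1")
print(parsed.brain_name)   # "Soccer"
print(parsed.team_id)      # 1
print(parsed.behavior_id)  # "Soccer?team=1"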

ml-agents/mlagents/trainers/ghost/controller.py (9 changes)


-from typing import Deque
+from typing import Deque, List
from collections import deque

self._last_swap: int = 0
self._queue: Deque[int] = deque(maxlen=maxlen)
self._learning_team: int = 0
+self._subscribed_teams: List[int] = []
-self._queue.append(team_id)
+if team_id not in self._subscribed_teams:
+self._queue.append(team_id)
+self._subscribed_teams.append(team_id)
-def get_learning_id(self, step: int) -> int:
+def get_learning_team(self, step: int) -> int:
if step >= self._swap_interval + self._last_swap:
self._last_swap = step
self.subscribe_team_id(self._learning_team)
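Roughly, the controller now tracks which team ids have subscribed and rotates the learning team on a fixed step interval. A minimal usage sketch, assuming the class in controller.py is GhostController and that the swap interval is a constructor argument (the constructor is not shown in this diff):

# Sketch only: the import path and constructor arguments are assumptions based
# on the fields visible above (_swap_interval, _last_swap, _queue, _learning_team).
from mlagents.trainers.ghost.controller import GhostController

controller = GhostController(swap_interval=10000)

# Each ghost trainer registers the team ids it manages; after this commit a
# team id is only queued once thanks to the _subscribed_teams bookkeeping.
controller.subscribe_team_id(0)
controller.subscribe_team_id(1)
controller.subscribe_team_id(1)  # ignored, already subscribed

# Trainers ask which team should currently be learning; once `step` passes
# _last_swap + _swap_interval the controller swaps to the next queued team.
learning_team = controller.get_learning_team(step=20000)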

ml-agents/mlagents/trainers/ghost/trainer.py (139 changes)


self.trainer = trainer
self.controller = controller
-self.internal_policy_queues: List[AgentManagerQueue[Policy]] = []
-self.internal_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
-self.ignored_trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
-self.learning_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
+self.internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {}
+self._name_to_trajectory_queue: Dict[str, AgentManagerQueue[Trajectory]] = {}
+self.internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {}
+self._name_to_policy_queue: Dict[str, AgentManagerQueue[Policy]] = {}
+self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {}
# assign ghost's stats collection to wrapped trainer's
self._stats_reporter = self.trainer.stats_reporter

self.policies: Dict[str, TFPolicy] = {}
self.policy_snapshots: List[Any] = []
self.snapshot_counter: int = 0
-self.learning_behavior_name: str = None
+self.learning_team: int = None
self.current_policy_snapshot = None
self.last_save = 0
self.last_swap = 0

"""
Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance
"""
-for traj_queue in self.trajectory_queues:
-if traj_queue.behavior_id == self.learning_behavior_name:
-for internal_traj_queue in self.internal_trajectory_queues:
-try:
-# We grab at most the maximum length of the queue.
-# This ensures that even if the queue is being filled faster than it is
-# being emptied, the trajectories in the queue are on-policy.
-for _ in range(traj_queue.maxlen):
-t = traj_queue.get_nowait()
-# adds to wrapped trainers queue
-internal_traj_queue.put(t)
-self._process_trajectory(t)
-except AgentManagerQueue.Empty:
-pass
+for trajectory_queue in self.trajectory_queues:
+parsed_behavior_id = self._name_to_parsed_behavior_id[
+trajectory_queue.behavior_id
+]
+if parsed_behavior_id.team_id == self.learning_team:
+# With a future multiagent trainer, this will be indexed by 'role'
+internal_trajectory_queue = self.internal_trajectory_queues[
+parsed_behavior_id.brain_name
+]
+try:
+# We grab at most the maximum length of the queue.
+# This ensures that even if the queue is being filled faster than it is
+# being emptied, the trajectories in the queue are on-policy.
+for _ in range(trajectory_queue.maxlen):
+t = trajectory_queue.get_nowait()
+# adds to wrapped trainers queue
+internal_trajectory_queue.put(t)
+self._process_trajectory(t)
+except AgentManagerQueue.Empty:
+pass
-for _ in range(traj_queue.maxlen):
-traj_queue.get_nowait()
+for _ in range(trajectory_queue.maxlen):
+trajectory_queue.get_nowait()
-for internal_q in self.internal_policy_queues:
-# Get policies that correspond to the policy queue in question
-try:
-policy = cast(TFPolicy, internal_q.get_nowait())
-self.current_policy_snapshot = policy.get_weights()
-self.learning_policy_queues[internal_q.behavior_id].put(policy)
-except AgentManagerQueue.Empty:
-pass
+for policy_queue in self.policy_queues:
+parsed_behavior_id = self._name_to_parsed_behavior_id[
+policy_queue.behavior_id
+]
+if parsed_behavior_id.team_id == self.learning_team:
+# With a future multiagent trainer, this will be indexed by 'role'
+internal_policy_queue = self.internal_policy_queues[
+parsed_behavior_id.brain_name
+]
+# Get policies that correspond to the policy queue in question
+try:
+policy = cast(TFPolicy, internal_policy_queue.get_nowait())
+self.current_policy_snapshot = policy.get_weights()
+policy_queue.put(policy)
+except AgentManagerQueue.Empty:
+pass
if self.get_step - self.last_save > self.steps_between_save:
self._save_snapshot(self.trainer.policy)

self._swap_snapshots()
self.last_swap = self.get_step
-self.learning_behavior_name = self.controller.get_learning_id(self.get_step)
+self.learning_team = self.controller.get_learning_team(self.get_step)
def end_episode(self):
self.trainer.end_episode()

def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
return self.trainer.create_policy(brain_parameters)
-def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
+def add_policy(
+self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+) -> None:
"""
Adds policy to trainer. For the first policy added, add a trainer
to the policy and set the learning behavior name to name_behavior_id.

-self.controller.subscribe_behavior_id(name_behavior_id)
+name_behavior_id = parsed_behavior_id.behavior_id
+team_id = parsed_behavior_id.team_id
+self.controller.subscribe_team_id(team_id)
+self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
-if not self.learning_behavior_name:
+if not self.learning_team:
-self.trainer.add_policy(name_behavior_id, policy)
+self.trainer.add_policy(parsed_behavior_id, policy)
-self.learning_behavior_name = name_behavior_id
-behavior_id_parsed = BehaviorIdentifiers.from_name_behavior_id(
-self.learning_behavior_name
-)
-team_id = behavior_id_parsed.behavior_ids["team"]
+self.learning_team = team_id
self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM, team_id)
else:
# for saving/swapping snapshots

self.snapshot_counter = (self.snapshot_counter + 1) % self.window
def _swap_snapshots(self) -> None:
-for q in self.policy_queues:
-name_behavior_id = q.behavior_id
+for policy_queue in self.policy_queues:
+parsed_behavior_id = self._name_to_parsed_behavior_id[
+policy_queue.behavior_id
+]
-if name_behavior_id == self.learning_behavior_name:
+if parsed_behavior_id.team_id == self.learning_team:
continue
elif np.random.uniform() < (1 - self.play_against_current_self_ratio):
x = np.random.randint(len(self.policy_snapshots))

self.current_opponent = -1 if x == "current" else x
logger.debug(
"Step {}: Swapping snapshot {} to id {} with {} learning".format(
-self.get_step, x, name_behavior_id, self.learning_behavior_name
+self.get_step, x, parsed_behavior_id.behavior_id, self.learning_team
-policy = self.get_policy(name_behavior_id)
+policy = self.get_policy(parsed_behavior_id.behavior_id)
-q.put(policy)
+policy_queue.put(policy)
def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
"""

"""
super().publish_policy_queue(policy_queue)
-if policy_queue.behavior_id == self.learning_behavior_name:
+parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id]
+self._name_to_policy_queue[parsed_behavior_id.behavior_id] = policy_queue
+if parsed_behavior_id.team_id == self.learning_team:
-policy_queue.behavior_id
+parsed_behavior_id.brain_name
-self.internal_policy_queues.append(internal_policy_queue)
-self.learning_policy_queues[policy_queue.behavior_id] = policy_queue
+self.internal_policy_queues[
+parsed_behavior_id.brain_name
+] = internal_policy_queue
self.trainer.publish_policy_queue(internal_policy_queue)
def subscribe_trajectory_queue(

:param queue: Trajectory queue to publish to.
"""
super().subscribe_trajectory_queue(trajectory_queue)
-if trajectory_queue.behavior_id == self.learning_behavior_name:
+parsed_behavior_id = self._name_to_parsed_behavior_id[
+trajectory_queue.behavior_id
+]
+self._name_to_trajectory_queue[
+parsed_behavior_id.behavior_id
+] = trajectory_queue
+if parsed_behavior_id.team_id == self.learning_team:
+# With a future multiagent trainer, this will be indexed by 'role'
-] = AgentManagerQueue(trajectory_queue.behavior_id)
+] = AgentManagerQueue(parsed_behavior_id.brain_name)
-self.internal_trajectory_queues.append(internal_trajectory_queue)
+self.internal_trajectory_queues[
+parsed_behavior_id.brain_name
+] = internal_trajectory_queue
self.trainer.subscribe_trajectory_queue(internal_trajectory_queue)
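The net effect of this file is that the ghost trainer no longer keys its bookkeeping on the full behavior name: trajectories and policies are routed by the team_id parsed from the behavior name, while the wrapped trainer's internal queues are keyed by brain name. A hypothetical illustration of why that key change matters when two teams share one brain (the behavior names are invented for this example):

# Illustration only, not code from this commit.
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

external_ids = ["Soccer?team=0", "Soccer?team=1"]  # two teams, one brain
internal_keys = {
    BehaviorIdentifiers.from_name_behavior_id(b).brain_name for b in external_ids
}
# Both external behavior ids collapse onto a single internal queue key,
# so the wrapped PPO/SAC trainer only ever sees the learning team's data.
assert internal_keys == {"Soccer"}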

ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)


from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.exception import UnityTrainerException
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")

return policy
-def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
+def add_policy(
+self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+) -> None:
-:param name_behavior_id: Behavior ID that the policy should belong to.
+:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
if self.policy:

ml-agents/mlagents/trainers/sac/trainer.py (5 changes)


from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.exception import UnityTrainerException
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
-def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
+def add_policy(
+self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+) -> None:
"""
Adds policy to trainer.
:param brain_parameters: specifications for policy construction

ml-agents/mlagents/trainers/trainer/trainer.py (5 changes)


from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
+from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
logger = logging.getLogger("mlagents.trainers")

pass
@abc.abstractmethod
-def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
+def add_policy(
+self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+) -> None:
"""
Adds policy to trainer.
"""

ml-agents/mlagents/trainers/trainer_controller.py (7 changes)


self, env_manager: EnvManager, name_behavior_id: str
) -> None:
-brain_name = BehaviorIdentifiers.from_name_behavior_id(
-name_behavior_id
-).brain_name
+parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
+brain_name = parsed_behavior_id.brain_name
try:
trainer = self.trainers[brain_name]
except KeyError:

policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
-trainer.add_policy(name_behavior_id, policy)
+trainer.add_policy(parsed_behavior_id, policy)
agent_manager = AgentManager(
policy,
