
receives brain_name and identifier on the Python side
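The change threads a plain brain_name string (plus a full behavior identifier) through the Python trainers instead of a BrainParameters object. A minimal sketch of the naming convention the controller relies on, assuming behavior identifiers of the form "BrainName?suffix"; only the split on "?" appears in this commit, and the sample identifiers below are made up:

# Hypothetical helper illustrating the "?"-separated behavior identifier
# convention; only the split("?")[0] idiom is taken from this commit.
def parse_behavior_id(behavior_id: str):
    """Return (brain_name, full_behavior_identifier)."""
    return behavior_id.split("?")[0], behavior_id

assert parse_behavior_id("3DBall?team=0") == ("3DBall", "3DBall?team=0")
assert parse_behavior_id("3DBall") == ("3DBall", "3DBall")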

/develop/tanhsquash
Andrew Cohen 5 years ago
Current commit
5097bcc0
5 changed files with 135 additions and 51 deletions
  1. ml-agents/mlagents/trainers/ppo/trainer.py (46 changed lines)
  2. ml-agents/mlagents/trainers/sac/trainer.py (37 changed lines)
  3. ml-agents/mlagents/trainers/trainer.py (16 changed lines)
  4. ml-agents/mlagents/trainers/trainer_controller.py (71 changed lines)
  5. ml-agents/mlagents/trainers/trainer_util.py (16 changed lines)

ml-agents/mlagents/trainers/ppo/trainer.py (46 changed lines)


import numpy as np
from mlagents.envs.brain import BrainInfo
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput

def __init__(
self,
brain,
brain_name,
reward_buff_cap,
trainer_parameters,
training,

:param run_id: The identifier of the current run
"""
super(PPOTrainer, self).__init__(
brain, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",

"reward_signals",
]
self.check_param_keys()
if multi_gpu and len(get_devices()) > 1:
self.policy = MultiGpuPPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
else:
self.policy = PPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.load = load
self.multi_gpu = multi_gpu
self.seed = seed
self.policy = None
def process_experiences(
self, current_info: BrainInfo, next_info: BrainInfo

self.stats[stat].append(val)
self.clear_update_buffer()
self.trainer_metrics.end_policy_update()
def add_policy(self, brain_parameters: BrainParameters) -> None:
if self.multi_gpu and len(get_devices()) > 1:
self.policy = MultiGpuPPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
else:
self.policy = PPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
def get_policy(self, brain_name: str) -> PPOPolicy:
return self.policy
def discount_rewards(r, gamma=0.99, value_next=0.0):
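The body of discount_rewards is cut off in this view; for reference, a standard discounted-return computation of the kind this helper performs looks roughly like the following (a sketch that bootstraps the tail with value_next, not necessarily the exact upstream body):

import numpy as np

def discount_rewards(r, gamma=0.99, value_next=0.0):
    """Compute discounted returns G_t = r_t + gamma * G_{t+1} over a 1-D
    numpy array of rewards r, seeding the recursion with value_next."""
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r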

ml-agents/mlagents/trainers/sac/trainer.py (37 changed lines)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (SAC)
# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290
# and implemented in https://github.com/hill-a/stable-baselines

import numpy as np
from mlagents.envs.brain import BrainInfo
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.envs.action_info import ActionInfoOutputs
from mlagents.envs.timers import timed
from mlagents.trainers.sac.policy import SACPolicy

"""
def __init__(
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
self,
brain_name,
reward_buff_cap,
trainer_parameters,
training,
load,
seed,
run_id,
):
"""
Responsible for collecting experiences and training SAC model.

:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
super().__init__(
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",
"buffer_size",

]
self.check_param_keys()
self.load = load
self.seed = seed
self.policy = None
self.step = 0
self.train_interval = (

if "save_replay_buffer" in trainer_parameters
else False
)
self.policy = SACPolicy(seed, brain, trainer_parameters, self.is_training, load)
# Load the replay buffer if load
if load and self.checkpoint_replay_buffer:

self.update_buffer.num_experiences
)
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.episode_steps = {}

self.update_sac_policy()
self.update_reward_signals()
self.trainer_metrics.end_policy_update()
def add_policy(self, brain_parameters: BrainParameters) -> None:
self.policy = SACPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
def get_policy(self, brain_name: str) -> SACPolicy:
return self.policy
def update_sac_policy(self) -> None:
"""

ml-agents/mlagents/trainers/trainer.py (16 changed lines)


def __init__(
self,
brain: BrainParameters,
brain_name: str,
trainer_parameters: dict,
training: bool,
run_id: str,

:int reward_buff_cap:
"""
self.param_keys: List[str] = []
self.brain_name = brain.brain_name
self.brain_name = brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters["summary_path"]

Uses demonstration_buffer to update model.
"""
raise UnityTrainerException("The update_model method was not implemented.")
def add_policy(self, brain_parameters: BrainParameters) -> None:
"""
Adds policy to trainer's list of policies
"""
raise UnityTrainerException("The add_policy method was not implemented.")
def get_policy(self, brain_name: str) -> TFPolicy:
"""
Gets policy from trainer's list of policies
"""
raise UnityTrainerException("The get_policy method was not implemented.")
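The two methods above form the per-behavior policy contract that PPOTrainer and SACTrainer implement. A self-contained toy mirroring that contract, with illustrative names that are not part of ml-agents:

# Toy sketch of the add_policy/get_policy contract; real trainers construct
# a PPOPolicy/SACPolicy here, a plain dict stands in for illustration.
class ToyTrainer:
    def __init__(self, brain_name: str):
        self.brain_name = brain_name
        self.policy = None

    def add_policy(self, brain_parameters) -> None:
        # This commit keeps a single policy per trainer.
        self.policy = {"brain": brain_parameters}

    def get_policy(self, brain_name: str):
        # brain_name is accepted for interface parity but unused here.
        return self.policy

trainer = ToyTrainer("3DBall")
trainer.add_policy({"vector_observation_space_size": 8})
assert trainer.get_policy("3DBall") is trainer.policy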

ml-agents/mlagents/trainers/trainer_controller.py (71 changed lines)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning
from typing import Dict, List, Optional, Set
from typing import Tuple, Dict, List, Optional, Set
from collections import defaultdict
import numpy as np
from mlagents.tf_utils import tf

:param resampling_interval: Specifies number of simulation steps after which reset parameters are resampled.
"""
self.trainers: Dict[str, Trainer] = {}
self.multi_trainers: Dict[str, Trainer] = {}
self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set)
self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir

else:
trainer.write_summary(global_step, delta_train_start)
def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
self.trainers[trainer.brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
env_manager.set_policy(trainer.brain_name, trainer.policy)
last_behavior_identifiers: Set[Tuple[str, str]] = set()
external_brains = set(env_manager.external_brains.keys())
external_brain_behavior_ids = set(env_manager.external_brains.keys())
brain_names, behavior_identifiers = zip(
*map(lambda x: [x.split("?")[0], x], external_brain_behavior_ids)
)
for brain_name, behavior_identifier in zip(
brain_names, behavior_identifiers
):
self.brain_name_to_identifier[brain_name].add(behavior_identifier)
external_brains = set(brain_names)
external_identifiers = set(zip(brain_names, behavior_identifiers))
if last_brain_names != env_manager.external_brains.keys():
for name in new_brains:
trainer = self.trainer_factory.generate(
env_manager.external_brains[name]
new_identifiers = external_identifiers - last_behavior_identifiers
for name in new_brains:
trainer = self.trainer_factory.generate(name)
self.trainers[name] = trainer
for behavior_identifier in self.brain_name_to_identifier[name]:
self.multi_trainers[behavior_identifier] = trainer
trainer.add_policy(
env_manager.external_brains[behavior_identifier]
self.start_trainer(trainer, env_manager)
last_brain_names = external_brains
env_manager.set_policy(
behavior_identifier, trainer.get_policy(behavior_identifier)
)
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text(
"Hyperparameters", trainer.parameters
)
for name, new_identifier in new_identifiers:
trainer = self.trainers[name]
trainer.add_policy(
env_manager.external_brains[new_identifier]
)
env_manager.set_policy(
new_identifier, trainer.get_policy(new_identifier)
)
last_brain_names = external_brains
last_behavior_identifiers = external_identifiers
n_steps = self.advance(env_manager)
for i in range(n_steps):
global_step += 1

# Perform gradient descent with experience buffer
with hierarchical_timer("update_policy"):
trainer.update_policy()
env.set_policy(brain_name, trainer.policy)
for behavior_identifier in self.brain_name_to_identifier[
brain_name
]:
env.set_policy(
behavior_identifier, trainer.get_policy(behavior_identifier)
)
else:
# Avoid memory leak during inference
trainer.clear_update_buffer()
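Putting the controller changes together: behavior identifiers coming from the environment are grouped by brain name, one trainer is generated per brain name, and each behavior identifier gets a policy registered on that trainer. A simplified, self-contained sketch of that bookkeeping, where plain dicts stand in for the real EnvManager, TrainerFactory, and Trainer objects:

from collections import defaultdict

# Stand-ins; only the grouping/registration flow mirrors the commit.
external_brains = {"3DBall?team=0": "params_a", "3DBall?team=1": "params_b"}

trainers = {}                                # brain_name -> trainer
multi_trainers = {}                          # behavior_identifier -> trainer
brain_name_to_identifier = defaultdict(set)

for behavior_id in external_brains:
    brain_name_to_identifier[behavior_id.split("?")[0]].add(behavior_id)

for brain_name, behavior_ids in brain_name_to_identifier.items():
    trainer = {"name": brain_name, "policies": {}}   # trainer_factory.generate(brain_name)
    trainers[brain_name] = trainer
    for behavior_id in behavior_ids:
        multi_trainers[behavior_id] = trainer
        # trainer.add_policy(external_brains[behavior_id]) in the real code
        trainer["policies"][behavior_id] = external_brains[behavior_id]

assert trainers["3DBall"] is multi_trainers["3DBall?team=1"]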

ml-agents/mlagents/trainers/trainer_util.py (16 changed lines)


from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.envs.exception import UnityEnvironmentException
from mlagents.trainers.trainer import Trainer
from mlagents.envs.brain import BrainParameters
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer

self.meta_curriculum = meta_curriculum
self.multi_gpu = multi_gpu
def generate(self, brain_parameters: BrainParameters) -> Trainer:
def generate(self, brain_name: str) -> Trainer:
brain_parameters,
brain_name,
self.summaries_dir,
self.run_id,
self.model_path,

def initialize_trainer(
trainer_config: Any,
brain_parameters: BrainParameters,
brain_name: str,
summaries_dir: str,
run_id: str,
model_path: str,

some general training session options.
:param trainer_config: Original trainer configuration loaded from YAML
:param brain_parameters: BrainParameters provided by the Unity environment
:param brain_name: Name of the brain to be associated with trainer
:param summaries_dir: Directory to store trainer summary statistics
:param run_id: Run ID to associate with this training run
:param model_path: Path to save the model

:return:
"""
trainer_parameters = trainer_config["default"].copy()
brain_name, brain_name_identifiers = brain_parameters.brain_name.split("?")
trainer_parameters["summary_path"] = "{basedir}/{name}".format(
basedir=summaries_dir, name=str(run_id) + "_" + brain_name
)

trainer = None
if trainer_parameters["trainer"] == "offline_bc":
trainer = OfflineBCTrainer(
brain_parameters, trainer_parameters, train_model, load_model, seed, run_id
brain_name, trainer_parameters, train_model, load_model, seed, run_id
brain_parameters,
brain_name,
meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
if meta_curriculum
else 1,

)
elif trainer_parameters["trainer"] == "sac":
trainer = SACTrainer(
brain_parameters,
brain_name,
meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
if meta_curriculum
else 1,
