
Develop magic string + trajectory (#3122)

* added team id and identifier concat to behavior parameters

* splitting brain params into brain name and identifiers

* set team id in prefab

* receives brain_name and identifier on python side

* rebased with develop

* Correctly calls concatBehaviorIdentifiers

* trainer_controller expects name_behavior_ids

* add_policy and create_policy separated

* adjusting tests to expect trainer.add_policy to be called

* fixing tests

* fixed naming ...
/asymm-envs
GitHub · 5 years ago
Current commit: 0b5b1b01
17 files changed, with 324 insertions and 135 deletions
  1. UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs (1 change)
  2. UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs (9 changes)
  3. ml-agents/mlagents/trainers/agent_processor.py (3 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (95 changes)
  5. ml-agents/mlagents/trainers/rl_trainer.py (5 changes)
  6. ml-agents/mlagents/trainers/sac/trainer.py (121 changes)
  7. ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)
  8. ml-agents/mlagents/trainers/tests/test_ppo.py (34 changes)
  9. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)
  10. ml-agents/mlagents/trainers/tests/test_sac.py (15 changes)
  11. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (5 changes)
  12. ml-agents/mlagents/trainers/tests/test_trainer_util.py (14 changes)
  13. ml-agents/mlagents/trainers/tests/test_trajectory.py (5 changes)
  14. ml-agents/mlagents/trainers/trainer.py (33 changes)
  15. ml-agents/mlagents/trainers/trainer_controller.py (100 changes)
  16. ml-agents/mlagents/trainers/trainer_util.py (14 changes)
  17. ml-agents/mlagents/trainers/trajectory.py (1 change)

UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs (1 change)


EditorGUILayout.PropertyField(so.FindProperty("m_InferenceDevice"), true);
EditorGUI.indentLevel--;
EditorGUILayout.PropertyField(so.FindProperty("m_BehaviorType"));
EditorGUILayout.PropertyField(so.FindProperty("m_TeamID"));
EditorGUILayout.PropertyField(so.FindProperty("m_useChildSensors"), true);
// EditorGUILayout.PropertyField(serializedObject.FindProperty("m_Heuristic"), true);
EditorGUI.indentLevel--;

UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs (9 changes)


using Barracuda;
using System;
using System.Collections.Generic;
using UnityEngine;
namespace MLAgents

[HideInInspector]
[SerializeField]
string m_BehaviorName = "My Behavior";
[HideInInspector] [SerializeField]
int m_TeamID = 0;
[HideInInspector]
[SerializeField]
[Tooltip("Use all Sensor components attached to child GameObjects of this Agent.")]

public string behaviorName
{
get { return m_BehaviorName; }
get { return m_BehaviorName + "?team=" + m_TeamID;}
}
public IPolicy GeneratePolicy(Func<float[]> heuristic)

case BehaviorType.Default:
if (FindObjectOfType<Academy>().IsCommunicatorOn)
{
return new RemotePolicy(m_BrainParameters, m_BehaviorName);
return new RemotePolicy(m_BrainParameters, behaviorName);
}
if (m_Model != null)
{
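
For readers skimming the hunk above: the new behaviorName getter appends the team ID to the configured name with a "?team=" separator. Below is a minimal, self-contained Python sketch of what that identifier looks like; the helper name concat_behavior_identifiers is illustrative only, borrowed from the commit message, not part of the diff.

def concat_behavior_identifiers(behavior_name: str, team_id: int) -> str:
    # Mirrors the C# getter above: m_BehaviorName + "?team=" + m_TeamID
    return "{}?team={}".format(behavior_name, team_id)

print(concat_behavior_identifiers("My Behavior", 0))  # -> My Behavior?team=0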

ml-agents/mlagents/trainers/agent_processor.py (3 changes)


self,
trainer: Trainer,
policy: TFPolicy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
):

self.stats_reporter = stats_reporter
self.trainer = trainer
self.max_trajectory_length = max_trajectory_length
self.behavior_id = behavior_id
def add_experiences(
self,

steps=self.experience_buffers[agent_id],
agent_id=agent_id,
next_obs=next_obs,
behavior_id=self.behavior_id,
)
# This will eventually be replaced with a queue
self.trainer.process_trajectory(trajectory)
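
A hedged usage sketch of the new AgentProcessor constructor shown above: the trainer, policy, and stats reporter are mocked to keep the example self-contained, and the behavior id value is a placeholder, not part of the diff.

from unittest import mock
from mlagents.trainers.agent_processor import AgentProcessor

processor = AgentProcessor(
    trainer=mock.Mock(),               # stands in for a Trainer
    policy=mock.Mock(),                # stands in for a TFPolicy
    behavior_id="My Behavior?team=0",  # full name_behavior_id handled by this processor
    stats_reporter=mock.Mock(),        # stands in for a StatsReporter
    max_trajectory_length=5,
)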

ml-agents/mlagents/trainers/ppo/trainer.py (95 changes)


from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.trajectory import Trajectory
logger = logging.getLogger("mlagents.trainers")

def __init__(
self,
brain,
reward_buff_cap,
trainer_parameters,
training,
load,
seed,
run_id,
multi_gpu,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
training: bool,
load: bool,
seed: int,
run_id: str,
multi_gpu: bool,
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param reward_buff_cap: Max reward history to track in the reward buffer
:param multi_gpu: Boolean for multi-gpu policy model
brain, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",

"reward_signals",
]
self.check_param_keys()
if multi_gpu and len(get_devices()) > 1:
self.ppo_policy = MultiGpuPPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
else:
self.ppo_policy = PPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
self.policy = self.ppo_policy
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
self.load = load
self.multi_gpu = multi_gpu
self.seed = seed
self.policy: TFPolicy = None
def process_trajectory(self, trajectory: Trajectory) -> None:
"""

# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
self._update_end_episode_stats(agent_id)
self._update_end_episode_stats(
agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self):
"""

for stat, val in update_stats.items():
self.stats_reporter.add_stat(stat, val)
self.clear_update_buffer()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
"""
Creates a PPO policy to add to the trainer's list of policies.
:param brain_parameters: specifications for policy construction
:return policy
"""
if self.multi_gpu and len(get_devices()) > 1:
policy: PPOPolicy = MultiGpuPPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
else:
policy = PPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
return policy
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
"""
Adds policy to trainer.
:param name_behavior_id: Behavior identifier the policy is registered under.
:param policy: The policy to add to this trainer.
"""
if self.policy:
logger.warning(
"add_policy has been called twice. {} is not a multi-agent trainer".format(
self.__class__.__name__
)
)
self.policy = policy
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
"""
return self.policy
def discount_rewards(r, gamma=0.99, value_next=0.0):
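
A hedged sketch of the policy-registration flow this file introduces. Here trainer_config is assumed to be a complete PPO configuration dict, the brain name and behavior id are placeholders, and a mock stands in for the policy a real run would get from create_policy.

from unittest import mock
from mlagents.trainers.ppo.trainer import PPOTrainer

# trainer_config: assumed full PPO config (batch_size, buffer_size, reward_signals, ...)
trainer = PPOTrainer("MyBrain", 0, trainer_config, True, False, 0, "0", False)
policy = mock.Mock()                          # a real run would use trainer.create_policy(brain_params)
trainer.add_policy("MyBrain?team=0", policy)  # logs a warning if a policy was already added
assert trainer.get_policy("MyBrain?team=0") is policy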

ml-agents/mlagents/trainers/rl_trainer.py (5 changes)


from typing import Dict
from collections import defaultdict
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult

for agent_id in rewards:
rewards[agent_id] = 0
def _update_end_episode_stats(self, agent_id: str) -> None:
def _update_end_episode_stats(self, agent_id: str, policy: TFPolicy) -> None:
self.episode_steps[agent_id] = 0
for name, rewards in self.collected_rewards.items():
if name == "environment":

rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
self.policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
)
rewards[agent_id] = 0

ml-agents/mlagents/trainers/sac/trainer.py (121 changes)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (SAC)
# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290
# and implemented in https://github.com/hill-a/stable-baselines

import numpy as np
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.brain import BrainParameters
LOGGER = logging.getLogger("mlagents.trainers")
logger = logging.getLogger("mlagents.trainers")
BUFFER_TRUNCATE_PERCENT = 0.8

"""
def __init__(
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
self,
brain_name: str,
reward_buff_cap: int,
trainer_parameters: dict,
training: bool,
load: bool,
seed: int,
run_id: str,
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_parameters: The parameters for the trainer (dictionary).
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.

super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
super().__init__(
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",
"buffer_size",

]
self.check_param_keys()
self.load = load
self.seed = seed
self.policy: TFPolicy = None
self.step = 0
self.train_interval = (

if "save_replay_buffer" in trainer_parameters
else False
)
self.sac_policy = SACPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
self.policy = self.sac_policy
# Load the replay buffer if load
if load and self.checkpoint_replay_buffer:
try:
self.load_replay_buffer()
except (AttributeError, FileNotFoundError):
LOGGER.warning(
"Replay buffer was unable to load, starting from scratch."
)
LOGGER.debug(
"Loaded update buffer with {} sequences".format(
self.update_buffer.num_experiences
)
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
def save_model(self) -> None:
def save_model(self, name_behavior_id: str) -> None:
"""
Saves the model. Overrides the default save_model since we want to save
the replay buffer as well.

"""
Save the training buffer's update buffer to a pickle file.
"""
filename = os.path.join(self.policy.model_path, "last_replay_buffer.hdf5")
LOGGER.info("Saving Experience Replay Buffer to {}".format(filename))
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
)
logger.info("Saving Experience Replay Buffer to {}".format(filename))
with open(filename, "wb") as file_object:
self.update_buffer.save_to_file(file_object)

"""
filename = os.path.join(self.policy.model_path, "last_replay_buffer.hdf5")
LOGGER.info("Loading Experience Replay Buffer from {}".format(filename))
filename = os.path.join(
self.trainer_parameters["model_path"], "last_replay_buffer.hdf5"
)
logger.info("Loading Experience Replay Buffer from {}".format(filename))
LOGGER.info(
logger.info(
"Experience replay buffer has {} experiences.".format(
self.update_buffer.num_experiences
)

)
if trajectory.done_reached:
self._update_end_episode_stats(agent_id)
self._update_end_episode_stats(
agent_id, self.get_policy(trajectory.behavior_id)
)
def is_ready_update(self) -> bool:
"""

self.update_sac_policy()
self.update_reward_signals()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
policy = SACPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Load the replay buffer if load
if self.load and self.checkpoint_replay_buffer:
try:
self.load_replay_buffer()
except (AttributeError, FileNotFoundError):
logger.warning(
"Replay buffer was unable to load, starting from scratch."
)
logger.debug(
"Loaded update buffer with {} sequences".format(
self.update_buffer.num_experiences
)
)
return policy
def update_sac_policy(self) -> None:
"""
Uses demonstration_buffer to update the policy.

num_updates = self.trainer_parameters["num_update"]
batch_update_stats: Dict[str, list] = defaultdict(list)
for _ in range(num_updates):
LOGGER.debug("Updating SAC policy at step {}".format(self.step))
logger.debug("Updating SAC policy at step {}".format(self.step))
buffer = self.update_buffer
if (
self.update_buffer.num_experiences

for stat, stat_list in batch_update_stats.items():
self.stats_reporter.add_stat(stat, np.mean(stat_list))
bc_module = self.sac_policy.bc_module
bc_module = self.policy.bc_module
if bc_module:
update_stats = bc_module.update()
for stat, val in update_stats.items():

# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
for name, signal in self.policy.reward_signals.items():
LOGGER.debug("Updating {} at step {}".format(name, self.step))
logger.debug("Updating {} at step {}".format(name, self.step))
# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(

update_stats = self.sac_policy.update_reward_signals(
update_stats = self.policy.update_reward_signals(
reward_signal_minibatches, n_sequences
)
for stat_name, value in update_stats.items():

def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
"""
Adds policy to trainer.
:param name_behavior_id: Behavior identifier the policy is registered under.
:param policy: The policy to add to this trainer.
"""
if self.policy:
logger.warning(
"add_policy has been called twice. {} is not a multi-agent trainer".format(
self.__class__.__name__
)
)
self.policy = policy
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
"""
return self.policy
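
One consequence of the change above, sketched under assumed paths: the replay-buffer file is now derived from trainer_parameters["model_path"] rather than from the policy's model_path, so it can be located even before a policy has been created. The model_path value below is an assumption for illustration.

import os

trainer_parameters = {"model_path": "./models/run_id/MyBrain"}  # assumed config fragment
replay_file = os.path.join(trainer_parameters["model_path"], "last_replay_buffer.hdf5")
print(replay_file)  # ./models/run_id/MyBrain/last_replay_buffer.hdf5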

ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)


def test_agentprocessor(num_vis_obs):
policy = create_mock_policy()
trainer = mock.Mock()
name_behavior_id = "test_brain_name"
name_behavior_id,
max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)

ml-agents/mlagents/trainers/tests/test_ppo.py (34 changes)


vector_action_space_type=0,
)
trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
)
step_count = 10
step_count = (
5
) # 10 hacked because this function is no longer called through trainer
print(trainer.policy.increment_step(5))
assert trainer.step == 10
assert trainer.step == step_count
@mock.patch("mlagents_envs.environment.UnityEnvironment")

trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0", False)
trainer = PPOTrainer(
mock_brain.brain_name, 0, trainer_params, True, False, 0, "0", False
)
policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
# Mock out reward signal eval

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
trainer = PPOTrainer(
brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,

action_space=2,
)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.process_trajectory(trajectory)
# Check that trainer put trajectory in update buffer

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
trainer = PPOTrainer(
brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
)
time_horizon = 6
trajectory = make_fake_trajectory(
length=time_horizon,

# Change half of the obs to 0
for i in range(3):
trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
steps, mean, variance = trainer.ppo_policy.sess.run(
steps, mean, variance = trainer.policy.sess.run(
[
trainer.policy.model.normalization_steps,
trainer.policy.model.running_mean,

trainer.process_trajectory(trajectory)
# Check that the running mean and variance is correct
steps, mean, variance = trainer.ppo_policy.sess.run(
steps, mean, variance = trainer.policy.sess.run(
[
trainer.policy.model.normalization_steps,
trainer.policy.model.running_mean,

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)


def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = RLTrainer(mock_brainparams, dummy_config(), True, 0)
trainer = RLTrainer(mock_brainparams.brain_name, dummy_config(), True, 0)
return trainer

ml-agents/mlagents/trainers/tests/test_sac.py (15 changes)


trainer_params["summary_path"] = str(tmpdir)
trainer_params["model_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0)
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.save_model()
trainer.save_model(mock_brain.brain_name)
trainer2 = SACTrainer(mock_brain, 1, trainer_params, True, True, 0, 0)
trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True, 0, 0)
policy = trainer2.create_policy(mock_brain)
trainer2.add_policy(mock_brain.brain_name, policy)
assert trainer2.update_buffer.num_experiences == buffer_len

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trajectory = make_fake_trajectory(
length=15, max_step_complete=True, vec_obs_size=6, num_vis_obs=0, action_space=2
)

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (5 changes)


env_mock.step.return_value = [new_step_info]
env_mock.reset.return_value = [old_step_info]
tc.brain_name_to_identifier[brain_name].add(brain_name)
tc.advance(env_mock)
env_mock.reset.assert_not_called()

new_step_info.current_all_brain_info[brain_name],
new_step_info.brain_name_to_action_info[brain_name].outputs,
)
trainer_mock.update_policy.assert_called_once()
trainer_mock.increment_step.assert_called_once()

env_mock = MagicMock()
env_mock.step.return_value = [new_step_info]
env_mock.reset.return_value = [old_step_info]
tc.brain_name_to_identifier[brain_name].add(brain_name)
tc.advance(env_mock)
env_mock.reset.assert_not_called()

ml-agents/mlagents/trainers/tests/test_trainer_util.py (14 changes)


run_id,
multi_gpu,
):
assert brain == brain_params_mock
assert brain == brain_params_mock.brain_name
assert trainer_parameters == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model

)
trainers = {}
for _, brain_parameters in external_brains.items():
trainers["testbrain"] = trainer_factory.generate(brain_parameters)
trainers["testbrain"] = trainer_factory.generate(
brain_parameters.brain_name
)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], PPOTrainer)

run_id,
multi_gpu,
):
assert brain == brain_params_mock
assert brain == brain_params_mock.brain_name
assert trainer_parameters == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model

)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters)
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], PPOTrainer)

)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters)
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
def test_handles_no_default_section(dummy_config):

load_model=False,
seed=42,
)
trainer_factory.generate(brain_parameters)
trainer_factory.generate(brain_parameters.brain_name)
def test_raise_if_no_config_for_brain(dummy_config):

ml-agents/mlagents/trainers/tests/test_trajectory.py (5 changes)


max_step = False
memory = np.ones(10, dtype=np.float32)
agent_id = "test_agent"
behavior_id = "test_brain"
experience = AgentExperience(
obs=obs,
reward=reward,

memory=memory,
)
steps_list.append(last_experience)
return Trajectory(steps=steps_list, agent_id=agent_id, next_obs=obs)
return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
)
@pytest.mark.parametrize("num_visual_obs", [0, 1, 2])

ml-agents/mlagents/trainers/trainer.py (33 changes)


def __init__(
self,
brain: BrainParameters,
brain_name: str,
trainer_parameters: dict,
training: bool,
run_id: str,

:int reward_buff_cap:
"""
self.param_keys: List[str] = []
self.brain_name = brain.brain_name
self.brain_name = brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters["summary_path"]

self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy: TFPolicy = None # type: ignore # this will always get set
self.step: int = 0
def check_param_keys(self):

:param n_steps: number of steps to increment the step count by
"""
self.step = self.policy.increment_step(n_steps)
self.step += n_steps
def save_model(self) -> None:
def save_model(self, name_behavior_id: str) -> None:
self.policy.save_model(self.get_step)
self.get_policy(name_behavior_id).save_model(self.get_step)
def export_model(self) -> None:
def export_model(self, name_behavior_id: str) -> None:
self.policy.export_model()
self.get_policy(name_behavior_id).export_model()
def write_summary(self, global_step: int, delta_train_start: float) -> None:
"""

Uses demonstration_buffer to update model.
"""
raise UnityTrainerException("The update_model method was not implemented.")
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
"""
Creates policy
"""
raise UnityTrainerException("The create_policy method was not implemented.")
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
"""
Adds policy to trainer
"""
raise UnityTrainerException("The add_policy method was not implemented")
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer
"""
raise UnityTrainerException("The get_policy method was not implemented.")
def advance(self) -> None:
pass
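
To make the new interface concrete, here is a hypothetical single-behavior trainer (not part of the diff) that overrides the three policy hooks; the remaining base-class methods keep raising UnityTrainerException until a subclass implements them. Import paths match the modules shown in this diff.

from mlagents.trainers.trainer import Trainer
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.brain import BrainParameters

class SingleBehaviorTrainer(Trainer):
    """Hypothetical sketch: one policy per trainer, keyed by any name_behavior_id."""

    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
        raise NotImplementedError("build and return a TFPolicy here")

    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        self.policy = policy  # single-agent trainers keep exactly one policy

    def get_policy(self, name_behavior_id: str) -> TFPolicy:
        return self.policy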

ml-agents/mlagents/trainers/trainer_controller.py (100 changes)


import json
import logging
from typing import Dict, List, Optional, Set, NamedTuple
from collections import defaultdict
import numpy as np
from mlagents.tf_utils import tf

:param resampling_interval: Specifies number of simulation steps after which reset parameters are resampled.
"""
self.trainers: Dict[str, Trainer] = {}
self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set)
self.managers: Dict[str, AgentManager] = {}
self.trainer_factory = trainer_factory
self.model_path = model_path

Saves current model to checkpoint folder.
"""
for brain_name in self.trainers.keys():
self.trainers[brain_name].save_model()
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
self.trainers[brain_name].save_model(name_behavior_id)
self.logger.info("Saved Model")
def _save_model_when_interrupted(self):

Exports latest saved models to .nn format for Unity embedding.
"""
for brain_name in self.trainers.keys():
self.trainers[brain_name].export_model()
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
self.trainers[brain_name].export_model(name_behavior_id)
@staticmethod
def _create_model_path(model_path):

trainer.stats_reporter.add_stat("Environment/Lesson", lesson_num)
trainer.write_summary(global_step, delta_train_start)
def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
self.trainers[trainer.brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
env_manager.set_policy(trainer.brain_name, trainer.policy)
last_brain_names: Set[str] = set()
last_brain_behavior_ids: Set[str] = set()
external_brains = set(env_manager.external_brains.keys())
new_brains = external_brains - last_brain_names
if last_brain_names != env_manager.external_brains.keys():
for name in new_brains:
trainer = self.trainer_factory.generate(
env_manager.external_brains[name]
)
self.start_trainer(trainer, env_manager)
agent_manager = AgentManager(
processor=AgentProcessor(
trainer,
trainer.policy,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
external_brain_behavior_ids = set(env_manager.external_brains.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
for name_behavior_id in new_behavior_ids:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text(
"Hyperparameters", trainer.parameters
policy = trainer.create_policy(
env_manager.external_brains[name_behavior_id]
)
trainer.add_policy(name_behavior_id, policy)
env_manager.set_policy(name_behavior_id, policy)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
agent_manager = AgentManager(
processor=AgentProcessor(
trainer,
policy,
name_behavior_id,
trainer.stats_reporter,
trainer.parameters.get("time_horizon", sys.maxsize),
self.managers[name] = agent_manager
last_brain_names = external_brains
)
self.managers[name_behavior_id] = agent_manager
last_brain_behavior_ids = external_brain_behavior_ids
n_steps = self.advance(env_manager)
for i in range(n_steps):
global_step += 1

new_step_infos = env.step()
for step_info in new_step_infos:
for brain_name, trainer in self.trainers.items():
if step_info.has_actions_for_brain(brain_name):
_processor = self.managers[brain_name].processor
_processor.add_experiences(
step_info.previous_all_brain_info[brain_name],
step_info.current_all_brain_info[brain_name],
step_info.brain_name_to_action_info[brain_name].outputs,
)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
if step_info.has_actions_for_brain(name_behavior_id):
_processor = self.managers[name_behavior_id].processor
_processor.add_experiences(
step_info.previous_all_brain_info[name_behavior_id],
step_info.current_all_brain_info[name_behavior_id],
step_info.brain_name_to_action_info[
name_behavior_id
].outputs,
)
trainer.increment_step(len(new_step_infos))
n_steps = len(new_step_infos)
trainer.increment_step(n_steps)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
trainer.get_policy(name_behavior_id).increment_step(n_steps)
env.set_policy(brain_name, trainer.policy)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
env.set_policy(
name_behavior_id, trainer.get_policy(name_behavior_id)
)
else:
# Avoid memory leak during inference
# Eventually this whole block will take place in advance()
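
The brain-name recovery used by the controller above, isolated as a self-contained sketch; the helper name is illustrative only.

def parse_behavior_id(name_behavior_id: str) -> str:
    # Same logic as the controller: everything before "?" is the brain name.
    try:
        brain_name, _ = name_behavior_id.split("?")
    except ValueError:
        brain_name = name_behavior_id
    return brain_name

assert parse_behavior_id("My Behavior?team=0") == "My Behavior"
assert parse_behavior_id("My Behavior") == "My Behavior"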

ml-agents/mlagents/trainers/trainer_util.py (14 changes)


from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer

self.meta_curriculum = meta_curriculum
self.multi_gpu = multi_gpu
def generate(self, brain_parameters: BrainParameters) -> Trainer:
def generate(self, brain_name: str) -> Trainer:
brain_parameters,
brain_name,
self.summaries_dir,
self.run_id,
self.model_path,

def initialize_trainer(
trainer_config: Any,
brain_parameters: BrainParameters,
brain_name: str,
summaries_dir: str,
run_id: str,
model_path: str,

some general training session options.
:param trainer_config: Original trainer configuration loaded from YAML
:param brain_parameters: BrainParameters provided by the Unity environment
:param brain_name: Name of the brain to be associated with trainer
:param summaries_dir: Directory to store trainer summary statistics
:param run_id: Run ID to associate with this training run
:param model_path: Path to save the model

:param multi_gpu: Whether to use multi-GPU training
:return:
"""
brain_name = brain_parameters.brain_name
if "default" not in trainer_config and brain_name not in trainer_config:
raise TrainerConfigError(
f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '

)
elif trainer_type == "ppo":
trainer = PPOTrainer(
brain_parameters,
brain_name,
min_lesson_length,
trainer_parameters,
train_model,

)
elif trainer_type == "sac":
trainer = SACTrainer(
brain_parameters,
brain_name,
min_lesson_length,
trainer_parameters,
train_model,

ml-agents/mlagents/trainers/trajectory.py (1 change)


np.ndarray
] # Observation following the trajectory, for bootstrapping
agent_id: str
behavior_id: str
def to_agentbuffer(self) -> AgentBuffer:
"""
