Ervin Teng
5 years ago
Current commit
072d2ef8
36 files changed, with 2077 insertions and 1345 deletions
- ml-agents-envs/mlagents/envs/subprocess_env_manager.py (1)
- ml-agents-envs/mlagents/envs/tests/test_timers.py (7)
- ml-agents-envs/mlagents/envs/timers.py (63)
- ml-agents/mlagents/trainers/bc/models.py (2)
- ml-agents/mlagents/trainers/bc/offline_trainer.py (11)
- ml-agents/mlagents/trainers/bc/online_trainer.py (11)
- ml-agents/mlagents/trainers/bc/trainer.py (31)
- ml-agents/mlagents/trainers/buffer.py (171)
- ml-agents/mlagents/trainers/components/bc/module.py (38)
- ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (68)
- ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (5)
- ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (6)
- ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (62)
- ml-agents/mlagents/trainers/learn.py (55)
- ml-agents/mlagents/trainers/models.py (258)
- ml-agents/mlagents/trainers/ppo/models.py (241)
- ml-agents/mlagents/trainers/ppo/policy.py (144)
- ml-agents/mlagents/trainers/ppo/trainer.py (334)
- ml-agents/mlagents/trainers/tests/mock_brain.py (14)
- ml-agents/mlagents/trainers/tests/test_buffer.py (71)
- ml-agents/mlagents/trainers/tests/test_learn.py (9)
- ml-agents/mlagents/trainers/tests/test_ppo.py (2)
- ml-agents/mlagents/trainers/tests/test_reward_signals.py (15)
- ml-agents/mlagents/trainers/tests/test_trainer_controller.py (289)
- ml-agents/mlagents/trainers/tf_policy.py (5)
- ml-agents/mlagents/trainers/trainer.py (173)
- ml-agents/mlagents/trainers/trainer_controller.py (113)
- ml-agents/setup.py (1)
- ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (140)
- ml-agents/mlagents/trainers/rl_trainer.py (253)
- ml-agents/mlagents/trainers/tests/test_multigpu.py (129)
- ml-agents/mlagents/trainers/tests/test_rl_trainer.py (81)
- ml-agents/mlagents/trainers/tests/test_simple_rl.py (207)
- ml-agents/mlagents/trainers/tests/test_trainer_util.py (315)
- ml-agents/mlagents/trainers/trainer_util.py (97)
ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py:

import logging
import numpy as np

import tensorflow as tf
from tensorflow.python.client import device_lib
from mlagents.envs.timers import timed
from mlagents.trainers.models import EncoderType
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)
from mlagents.trainers.components.bc.module import BCModule

# Variable scope into which created variables will be placed
TOWER_SCOPE_NAME = "tower"

logger = logging.getLogger("mlagents.trainers")


class MultiGpuPPOPolicy(PPOPolicy):
    def __init__(self, seed, brain, trainer_params, is_training, load):
        """
        Policy for Proximal Policy Optimization Networks with multi-GPU training
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super().__init__(seed, brain, trainer_params, is_training, load)

        with self.graph.as_default():
            avg_grads = self.average_gradients([t.grads for t in self.towers])
            self.update_batch = self.model.optimizer.apply_gradients(avg_grads)

        self.update_dict = {"update_batch": self.update_batch}
        self.update_dict.update(
            {
                "value_loss_" + str(i): self.towers[i].value_loss
                for i in range(len(self.towers))
            }
        )
        self.update_dict.update(
            {
                "policy_loss_" + str(i): self.towers[i].policy_loss
                for i in range(len(self.towers))
            }
        )

    def create_model(self, brain, trainer_params, reward_signal_configs, seed):
        """
        Create PPO models, one on each device
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param reward_signal_configs: Reward signal config
        :param seed: Random seed.
        """
        self.devices = get_devices()
        self.towers = []
        with self.graph.as_default():
            with tf.variable_scope(TOWER_SCOPE_NAME, reuse=tf.AUTO_REUSE):
                for device in self.devices:
                    with tf.device(device):
                        self.towers.append(
                            PPOModel(
                                brain=brain,
                                lr=float(trainer_params["learning_rate"]),
                                h_size=int(trainer_params["hidden_units"]),
                                epsilon=float(trainer_params["epsilon"]),
                                beta=float(trainer_params["beta"]),
                                max_step=float(trainer_params["max_steps"]),
                                normalize=trainer_params["normalize"],
                                use_recurrent=trainer_params["use_recurrent"],
                                num_layers=int(trainer_params["num_layers"]),
                                m_size=self.m_size,
                                seed=seed,
                                stream_names=list(reward_signal_configs.keys()),
                                vis_encode_type=EncoderType(
                                    trainer_params.get("vis_encode_type", "simple")
                                ),
                            )
                        )
                        self.towers[-1].create_ppo_optimizer()
        self.model = self.towers[0]

    @timed
    def update(self, mini_batch, num_sequences):
        """
        Updates model using buffer.
        :param num_sequences: Number of trajectories in batch.
        :param mini_batch: Experience batch.
        :return: Output from update process.
        """
        feed_dict = {}

        device_batch_size = num_sequences // len(self.devices)
        device_batches = []
        for i in range(len(self.devices)):
            device_batches.append(
                {k: v[i : i + device_batch_size] for (k, v) in mini_batch.items()}
            )

        for batch, tower in zip(device_batches, self.towers):
            feed_dict.update(self.construct_feed_dict(tower, batch, num_sequences))

        out = self._execute_model(feed_dict, self.update_dict)
        run_out = {}
        run_out["value_loss"] = np.mean(
            [out["value_loss_" + str(i)] for i in range(len(self.towers))]
        )
        run_out["policy_loss"] = np.mean(
            [out["policy_loss_" + str(i)] for i in range(len(self.towers))]
        )
        run_out["update_batch"] = out["update_batch"]
        return run_out

    def average_gradients(self, tower_grads):
        """
        Average gradients from all towers
        :param tower_grads: Gradients from all towers
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = [g for g, _ in grad_and_vars if g is not None]
            if not grads:
                continue
            avg_grad = tf.reduce_mean(tf.stack(grads), 0)
            var = grad_and_vars[0][1]
            average_grads.append((avg_grad, var))
        return average_grads


def get_devices():
    """
    Get all available GPU devices
    """
    local_device_protos = device_lib.list_local_devices()
    devices = [x.name for x in local_device_protos if x.device_type == "GPU"]
    return devices
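
For reference, a minimal NumPy sketch (not part of this commit) of what average_gradients computes: for each variable, the per-tower gradients are stacked and averaged element-wise, and the variable reference from the first tower is kept. The helper name and the string variable labels below are illustrative only.

# Illustrative sketch only: plain NumPy arrays stand in for TF tensors.
import numpy as np

def average_tower_grads(tower_grads):
    # tower_grads: one list of (grad, var_name) pairs per tower, in the same order.
    averaged = []
    for grad_and_vars in zip(*tower_grads):
        grads = [g for g, _ in grad_and_vars if g is not None]
        if not grads:
            continue
        avg_grad = np.mean(np.stack(grads), axis=0)
        var_name = grad_and_vars[0][1]  # the variable is shared, take it from tower 0
        averaged.append((avg_grad, var_name))
    return averaged

# Two towers, one shared variable "w": the averaged gradient is (0.1 + 0.3) / 2 = 0.2
towers = [
    [(np.array([0.1]), "w")],
    [(np.array([0.3]), "w")],
]
print(average_tower_grads(towers))  # [(array([0.2]), 'w')]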
ml-agents/mlagents/trainers/rl_trainer.py:

# # Unity ML-Agents Toolkit
import logging
from typing import Dict, List, Deque, Any
import os

import tensorflow as tf
import numpy as np
from collections import deque, defaultdict

from mlagents.envs import UnityException, AllBrainInfo, ActionInfoOutputs, BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.tf_policy import Policy
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.envs import BrainParameters

LOGGER = logging.getLogger("mlagents.trainers")


class RLTrainer(Trainer):
    """
    This class is the base class for trainers that use Reward Signals.
    Contains methods for adding BrainInfos to the Buffer.
    """

    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        self.step = 0
        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}.".format(
                    self.__class__.__name__
                )
            )
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.training_buffer = Buffer()
        self.episode_steps = {}

    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all agents
        which correspond to the agents in a provided next_info.
        :BrainInfo next_info: A t+1 BrainInfo.
        :return: curr_info: Reconstructed BrainInfo to match agents of next_info.
        """
        visual_observations: List[List[Any]] = [
            []
        ]  # TODO add types to brain.py methods
        vector_observations = []
        text_observations = []
        memories = []
        rewards = []
        local_dones = []
        max_reacheds = []
        agents = []
        prev_vector_actions = []
        prev_text_actions = []
        action_masks = []
        for agent_id in next_info.agents:
            agent_brain_info = self.training_buffer[agent_id].last_brain_info
            if agent_brain_info is None:
                agent_brain_info = next_info
            agent_index = agent_brain_info.agents.index(agent_id)
            for i in range(len(next_info.visual_observations)):
                visual_observations[i].append(
                    agent_brain_info.visual_observations[i][agent_index]
                )
            vector_observations.append(
                agent_brain_info.vector_observations[agent_index]
            )
            text_observations.append(agent_brain_info.text_observations[agent_index])
            if self.policy.use_recurrent:
                if len(agent_brain_info.memories) > 0:
                    memories.append(agent_brain_info.memories[agent_index])
                else:
                    memories.append(self.policy.make_empty_memory(1))
            rewards.append(agent_brain_info.rewards[agent_index])
            local_dones.append(agent_brain_info.local_done[agent_index])
            max_reacheds.append(agent_brain_info.max_reached[agent_index])
            agents.append(agent_brain_info.agents[agent_index])
            prev_vector_actions.append(
                agent_brain_info.previous_vector_actions[agent_index]
            )
            prev_text_actions.append(
                agent_brain_info.previous_text_actions[agent_index]
            )
            action_masks.append(agent_brain_info.action_masks[agent_index])
        if self.policy.use_recurrent:
            memories = np.vstack(memories)
        curr_info = BrainInfo(
            visual_observations,
            vector_observations,
            text_observations,
            memories,
            rewards,
            agents,
            local_dones,
            prev_vector_actions,
            prev_text_actions,
            max_reacheds,
            action_masks,
        )
        return curr_info

    def add_experiences(
        self,
        curr_all_info: AllBrainInfo,
        next_all_info: AllBrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
        :param next_all_info: Dictionary of all next brains and corresponding BrainInfo.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        """
        self.trainer_metrics.start_experience_collection_timer()
        if take_action_outputs:
            self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
            self.stats["Policy/Learning Rate"].append(
                take_action_outputs["learning_rate"]
            )
            for name, signal in self.policy.reward_signals.items():
                self.stats[signal.value_name].append(
                    np.mean(take_action_outputs["value_heads"][name])
                )

        curr_info = curr_all_info[self.brain_name]
        next_info = next_all_info[self.brain_name]

        for agent_id in curr_info.agents:
            self.training_buffer[agent_id].last_brain_info = curr_info
            self.training_buffer[
                agent_id
            ].last_take_action_outputs = take_action_outputs

        if curr_info.agents != next_info.agents:
            curr_to_use = self.construct_curr_info(next_info)
        else:
            curr_to_use = curr_info

        tmp_rewards_dict = {}
        for name, signal in self.policy.reward_signals.items():
            tmp_rewards_dict[name] = signal.evaluate(curr_to_use, next_info)

        for agent_id in next_info.agents:
            stored_info = self.training_buffer[agent_id].last_brain_info
            stored_take_action_outputs = self.training_buffer[
                agent_id
            ].last_take_action_outputs
            if stored_info is not None:
                idx = stored_info.agents.index(agent_id)
                next_idx = next_info.agents.index(agent_id)
                if not stored_info.local_done[idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        self.training_buffer[agent_id]["visual_obs%d" % i].append(
                            stored_info.visual_observations[i][idx]
                        )
                        self.training_buffer[agent_id]["next_visual_obs%d" % i].append(
                            next_info.visual_observations[i][next_idx]
                        )
                    if self.policy.use_vec_obs:
                        self.training_buffer[agent_id]["vector_obs"].append(
                            stored_info.vector_observations[idx]
                        )
                        self.training_buffer[agent_id]["next_vector_in"].append(
                            next_info.vector_observations[next_idx]
                        )
                    if self.policy.use_recurrent:
                        if stored_info.memories.shape[1] == 0:
                            stored_info.memories = np.zeros(
                                (len(stored_info.agents), self.policy.m_size)
                            )
                        self.training_buffer[agent_id]["memory"].append(
                            stored_info.memories[idx]
                        )

                    self.training_buffer[agent_id]["masks"].append(1.0)
                    self.training_buffer[agent_id]["done"].append(
                        next_info.local_done[next_idx]
                    )
                    # Add the outputs of the last eval
                    self.add_policy_outputs(stored_take_action_outputs, agent_id, idx)
                    # Store action masks if necessary
                    if not self.policy.use_continuous_act:
                        self.training_buffer[agent_id]["action_mask"].append(
                            stored_info.action_masks[idx], padding_value=1
                        )
                    self.training_buffer[agent_id]["prev_action"].append(
                        stored_info.previous_vector_actions[idx]
                    )

                    values = stored_take_action_outputs["value_heads"]
                    # Add the value outputs if needed
                    self.add_rewards_outputs(
                        values, tmp_rewards_dict, agent_id, idx, next_idx
                    )

                    for name, rewards in self.collected_rewards.items():
                        if agent_id not in rewards:
                            rewards[agent_id] = 0
                        if name == "environment":
                            # Report the reward from the environment
                            rewards[agent_id] += np.array(next_info.rewards)[next_idx]
                        else:
                            # Report the reward signals
                            rewards[agent_id] += tmp_rewards_dict[name].scaled_reward[
                                next_idx
                            ]
                if not next_info.local_done[next_idx]:
                    if agent_id not in self.episode_steps:
                        self.episode_steps[agent_id] = 0
                    self.episode_steps[agent_id] += 1
        self.trainer_metrics.end_experience_collection_timer()

    def add_policy_outputs(
        self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
    ) -> None:
        """
        Takes the output of the last action and stores it into the training buffer.
        We break this out from add_experiences since it is very highly dependent
        on the type of trainer.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id
        """
        raise UnityTrainerException(
            "The add_policy_outputs method was not implemented."
        )

    def add_rewards_outputs(
        self,
        value: Dict[str, Any],
        rewards_dict: Dict[str, float],
        agent_id: str,
        agent_idx: int,
        agent_next_idx: int,
    ) -> None:
        """
        Takes the value and evaluated rewards output of the last action and stores it
        into the training buffer. We break this out from add_experiences since it is very
        highly dependent on the type of trainer.
        :param value: The value outputs of the Policy's get_action method.
        :param rewards_dict: Dict of rewards after evaluation
        :param agent_id: the Agent we're adding to.
        :param agent_idx: the index of the Agent agent_id in the current brain info
        :param agent_next_idx: the index of the Agent agent_id in the next brain info
        """
        raise UnityTrainerException(
            "The add_rewards_outputs method was not implemented."
        )
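
The two UnityTrainerException stubs above are hook points in a template-method pattern: add_experiences handles the shared buffer bookkeeping, while concrete trainers such as PPOTrainer decide what to store for actions and value estimates. A simplified, self-contained sketch of that division of labor follows; the class and buffer key names are made up for illustration and are not the real trainer API.

# Simplified illustration of the RLTrainer hook pattern; not the actual trainer classes.
class BaseTrainerSketch:
    def __init__(self):
        self.buffer = {"actions": [], "rewards": []}

    def add_experiences(self, action_outputs, reward):
        # Shared bookkeeping lives in the base class, which delegates to the hooks.
        self.add_policy_outputs(action_outputs)
        self.add_rewards_outputs(reward)

    def add_policy_outputs(self, action_outputs):
        raise NotImplementedError("Concrete trainers decide what to store.")

    def add_rewards_outputs(self, reward):
        raise NotImplementedError("Concrete trainers decide what to store.")


class PPOLikeTrainerSketch(BaseTrainerSketch):
    def add_policy_outputs(self, action_outputs):
        # A PPO-style trainer would also store log-probs, entropies, etc.
        self.buffer["actions"].append(action_outputs["action"])

    def add_rewards_outputs(self, reward):
        self.buffer["rewards"].append(reward)


trainer = PPOLikeTrainerSketch()
trainer.add_experiences({"action": [0.1, -0.2]}, reward=0.5)
print(trainer.buffer)  # {'actions': [[0.1, -0.2]], 'rewards': [0.5]}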
ml-agents/mlagents/trainers/tests/test_multigpu.py:

import unittest.mock as mock
import pytest

import numpy as np
import tensorflow as tf
import yaml

from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.envs import UnityEnvironment, BrainParameters
from mlagents.envs.mock_communicator import MockCommunicator
from mlagents.trainers.tests.mock_brain import create_mock_brainparams


@pytest.fixture
def dummy_config():
    return yaml.safe_load(
        """
        trainer: ppo
        batch_size: 32
        beta: 5.0e-3
        buffer_size: 512
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        max_steps: 5.0e4
        normalize: true
        num_epoch: 5
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 1000
        use_recurrent: false
        memory_size: 8
        curiosity_strength: 0.0
        curiosity_enc_size: 1
        reward_signals:
          extrinsic:
            strength: 1.0
            gamma: 0.99
        """
    )


@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_create_model(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()

    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    assert len(policy.towers) == len(mock_get_devices.return_value)


@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_average_gradients(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    with tf.Session() as sess:
        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
        var = tf.Variable(0)
        tower_grads = [
            [(tf.constant(0.1), var)],
            [(tf.constant(0.2), var)],
            [(tf.constant(0.3), var)],
            [(tf.constant(0.4), var)],
        ]
        avg_grads = policy.average_gradients(tower_grads)

        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = sess.run(avg_grads)
        assert run_out == [(0.25, 0)]


@mock.patch("mlagents.trainers.tf_policy.TFPolicy._execute_model")
@mock.patch("mlagents.trainers.ppo.policy.PPOPolicy.construct_feed_dict")
@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_update(
    mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config
):
    tf.reset_default_graph()
    mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
    mock_construct_feed_dict.return_value = {}
    mock_execute_model.return_value = {
        "value_loss_0": 0.1,
        "value_loss_1": 0.3,
        "policy_loss_0": 0.5,
        "policy_loss_1": 0.7,
        "update_batch": None,
    }

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    mock_mini_batch = mock.Mock()
    mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
    run_out = policy.update(mock_mini_batch, 1)

    assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
    assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
    assert run_out["value_loss"] == 0.2
    assert run_out["policy_loss"] == 0.6


if __name__ == "__main__":
    pytest.main()
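
The expected losses in test_update follow directly from how MultiGpuPPOPolicy.update reduces the per-tower outputs; a quick standalone sketch of that reduction using the mocked values above:

# How the mocked per-tower losses reduce to the asserted values (sketch only).
import numpy as np

out = {
    "value_loss_0": 0.1,
    "value_loss_1": 0.3,
    "policy_loss_0": 0.5,
    "policy_loss_1": 0.7,
}
num_towers = 2
value_loss = np.mean([out["value_loss_%d" % i] for i in range(num_towers)])
policy_loss = np.mean([out["policy_loss_%d" % i] for i in range(num_towers)])
print(value_loss, policy_loss)  # prints: 0.2 0.6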
ml-agents/mlagents/trainers/tests/test_rl_trainer.py:

import unittest.mock as mock
import pytest
import yaml
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
from mlagents.trainers.rl_trainer import RLTrainer


@pytest.fixture
def dummy_config():
    return yaml.safe_load(
        """
        summary_path: "test/"
        reward_signals:
          extrinsic:
            strength: 1.0
            gamma: 0.99
        """
    )


def create_mock_brain():
    mock_brain = mb.create_mock_brainparams(
        vector_action_space_type="continuous",
        vector_action_space_size=[2],
        vector_observation_space_size=8,
        number_visual_observations=1,
    )
    return mock_brain


def create_rl_trainer():
    mock_brainparams = create_mock_brain()
    trainer = RLTrainer(mock_brainparams, dummy_config(), True, 0)
    return trainer


def create_mock_all_brain_info(brain_info):
    return {"MockBrain": brain_info}


def create_mock_policy():
    mock_policy = mock.Mock()
    mock_policy.reward_signals = {}
    return mock_policy


@mock.patch("mlagents.trainers.rl_trainer.RLTrainer.add_policy_outputs")
@mock.patch("mlagents.trainers.rl_trainer.RLTrainer.add_rewards_outputs")
def test_rl_trainer(add_policy_outputs, add_rewards_outputs):
    trainer = create_rl_trainer()
    trainer.policy = create_mock_policy()
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "value_heads": {},
        "entropy": np.array([1.0]),
        "learning_rate": 1.0,
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=1,
    )
    trainer.add_experiences(
        create_mock_all_brain_info(mock_braininfo),
        create_mock_all_brain_info(mock_braininfo),
        fake_action_outputs,
    )

    # Remove one of the agents
    next_mock_braininfo = mb.create_mock_braininfo(
        num_agents=1,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=1,
    )
    brain_info = trainer.construct_curr_info(next_mock_braininfo)

    # assert construct_curr_info worked properly
    assert len(brain_info.agents) == 1
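
The final assertion exercises construct_curr_info, which re-aligns the cached previous observations with whatever agents remain in the next BrainInfo. A toy sketch of that re-alignment keyed by agent id (plain dicts stand in for BrainInfo objects; the names are illustrative):

# Sketch: rebuild the "current" observations for only the agents present in next_info.
last_info_by_agent = {"a1": {"obs": [0.1]}, "a2": {"obs": [0.2]}, "a3": {"obs": [0.3]}}
next_agents = ["a1", "a3"]  # "a2" finished its episode and was removed
curr_obs = [last_info_by_agent[a]["obs"] for a in next_agents]
print(curr_obs)  # [[0.1], [0.3]] -- lined up index-for-index with next_agents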
ml-agents/mlagents/trainers/tests/test_simple_rl.py:

import math
import random
import tempfile
import pytest
import yaml
from typing import Any, Dict


from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import initialize_trainers
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs import BrainInfo, AllBrainInfo, BrainParameters
from mlagents.envs.communicator_objects import AgentInfoProto
from mlagents.envs.simple_env_manager import SimpleEnvManager
from mlagents.envs.sampler_class import SamplerManager


BRAIN_NAME = __name__
OBS_SIZE = 1
STEP_SIZE = 0.1

TIME_PENALTY = 0.001
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY


def clamp(x, min_val, max_val):
    return max(min_val, min(x, max_val))


class Simple1DEnvironment(BaseUnityEnvironment):
    """
    Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
    it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
    """

    def __init__(self, use_discrete):
        super().__init__()
        self.discrete = use_discrete
        self._brains: Dict[str, BrainParameters] = {}
        self._brains[BRAIN_NAME] = BrainParameters(
            brain_name=BRAIN_NAME,
            vector_observation_space_size=OBS_SIZE,
            num_stacked_vector_observations=1,
            camera_resolutions=[],
            vector_action_space_size=[2] if use_discrete else [1],
            vector_action_descriptions=["moveDirection"],
            vector_action_space_type=0 if use_discrete else 1,
        )

        # state
        self.position = 0.0
        self.step_count = 0
        self.random = random.Random(str(self._brains))
        self.goal = random.choice([-1, 1])

    def step(
        self,
        vector_action: Dict[str, Any] = None,
        memory: Dict[str, Any] = None,
        text_action: Dict[str, Any] = None,
        value: Dict[str, Any] = None,
    ) -> AllBrainInfo:
        assert vector_action is not None

        if self.discrete:
            act = vector_action[BRAIN_NAME][0][0]
            delta = 1 if act else -1
        else:
            delta = vector_action[BRAIN_NAME][0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY

        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.goal] * OBS_SIZE, reward=reward, done=done
        )

        if done:
            self._reset_agent()

        return {
            BRAIN_NAME: BrainInfo.from_agent_proto(
                0, [agent_info], self._brains[BRAIN_NAME]
            )
        }

    def _reset_agent(self):
        self.position = 0.0
        self.step_count = 0
        self.goal = random.choice([-1, 1])

    def reset(
        self,
        config: Dict[str, float] = None,
        train_mode: bool = True,
        custom_reset_parameters: Any = None,
    ) -> AllBrainInfo:  # type: ignore
        self._reset_agent()

        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.goal] * OBS_SIZE,
            done=False,
            max_step_reached=False,
        )
        return {
            BRAIN_NAME: BrainInfo.from_agent_proto(
                0, [agent_info], self._brains[BRAIN_NAME]
            )
        }

    @property
    def global_done(self):
        return False

    @property
    def external_brains(self) -> Dict[str, BrainParameters]:
        return self._brains

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass


def _check_environment_trains(env):
    config = """
        default:
            trainer: ppo
            batch_size: 16
            beta: 5.0e-3
            buffer_size: 64
            epsilon: 0.2
            hidden_units: 128
            lambd: 0.95
            learning_rate: 5.0e-3
            max_steps: 2500
            memory_size: 256
            normalize: false
            num_epoch: 3
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 500
            use_recurrent: false
            reward_signals:
                extrinsic:
                    strength: 1.0
                    gamma: 0.99
        """
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:
        run_id = "id"
        save_freq = 99999
        seed = 1337

        trainer_config = yaml.safe_load(config)
        env_manager = SimpleEnvManager(env)
        trainers = initialize_trainers(
            trainer_config=trainer_config,
            external_brains=env_manager.external_brains,
            summaries_dir=dir,
            run_id=run_id,
            model_path=dir,
            keep_checkpoints=1,
            train_model=True,
            load_model=False,
            seed=seed,
            meta_curriculum=None,
            multi_gpu=False,
        )
        print(trainers)

        tc = TrainerController(
            trainers=trainers,
            summaries_dir=dir,
            model_path=dir,
            run_id=run_id,
            meta_curriculum=None,
            train=True,
            training_seed=seed,
            fast_simulation=True,
            sampler_manager=SamplerManager(None),
            resampling_interval=None,
            save_freq=save_freq,
        )

        # Begin training
        tc.start_learning(env_manager)
        print(tc._get_measure_vals())
        for brain_name, mean_reward in tc._get_measure_vals().items():
            assert not math.isnan(mean_reward)
            assert mean_reward > 0.99


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_rl(use_discrete):
    env = Simple1DEnvironment(use_discrete=use_discrete)
    _check_environment_trains(env)
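
The 0.99 threshold in _check_environment_trains is tied to how the reward constants are sized. The standalone sketch below (not part of the test) walks an optimal agent straight to the goal and shows its return lands at roughly 1.0, which is why the assertion is comfortably satisfiable.

# Sketch: return of an optimal agent in Simple1DEnvironment with goal = +1.
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
MIN_STEPS = int(1.0 / STEP_SIZE) + 1             # 11, due to float accumulation of 0.1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY  # 1.011

position, total_reward, steps = 0.0, 0.0, 0
while position < 1.0:
    position = min(position + STEP_SIZE, 1.0)
    steps += 1
    total_reward += SUCCESS_REWARD if position >= 1.0 else -TIME_PENALTY
print(steps, round(total_reward, 3))  # 11 1.001 -- well above the asserted 0.99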
ml-agents/mlagents/trainers/tests/test_trainer_util.py:

import pytest
import yaml
import os
from unittest.mock import patch

import mlagents.trainers.trainer_util as trainer_util
from mlagents.trainers.trainer_metrics import TrainerMetrics
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer
from mlagents.trainers.bc.online_trainer import OnlineBCTrainer
from mlagents.envs.exception import UnityEnvironmentException


@pytest.fixture
def dummy_config():
    return yaml.safe_load(
        """
        default:
            trainer: ppo
            batch_size: 32
            beta: 5.0e-3
            buffer_size: 512
            epsilon: 0.2
            gamma: 0.99
            hidden_units: 128
            lambd: 0.95
            learning_rate: 3.0e-4
            max_steps: 5.0e4
            normalize: true
            num_epoch: 5
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 1000
            use_recurrent: false
            memory_size: 8
            use_curiosity: false
            curiosity_strength: 0.0
            curiosity_enc_size: 1
        """
    )


@pytest.fixture
def dummy_online_bc_config():
    return yaml.safe_load(
        """
        default:
            trainer: online_bc
            brain_to_imitate: ExpertBrain
            batches_per_epoch: 16
            batch_size: 32
            beta: 5.0e-3
            buffer_size: 512
            epsilon: 0.2
            gamma: 0.99
            hidden_units: 128
            lambd: 0.95
            learning_rate: 3.0e-4
            max_steps: 5.0e4
            normalize: true
            num_epoch: 5
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 1000
            use_recurrent: false
            memory_size: 8
            use_curiosity: false
            curiosity_strength: 0.0
            curiosity_enc_size: 1
        """
    )


@pytest.fixture
def dummy_offline_bc_config():
    return yaml.safe_load(
        """
        default:
            trainer: offline_bc
            demo_path: """
        + os.path.dirname(os.path.abspath(__file__))
        + """/test.demo
            batches_per_epoch: 16
            batch_size: 32
            beta: 5.0e-3
            buffer_size: 512
            epsilon: 0.2
            gamma: 0.99
            hidden_units: 128
            lambd: 0.95
            learning_rate: 3.0e-4
            max_steps: 5.0e4
            normalize: true
            num_epoch: 5
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 1000
            use_recurrent: false
            memory_size: 8
            use_curiosity: false
            curiosity_strength: 0.0
            curiosity_enc_size: 1
        """
    )


@pytest.fixture
def dummy_offline_bc_config_with_override():
    base = dummy_offline_bc_config()
    base["testbrain"] = {}
    base["testbrain"]["normalize"] = False
    return base


@pytest.fixture
def dummy_bad_config():
    return yaml.safe_load(
        """
        default:
            trainer: incorrect_trainer
            brain_to_imitate: ExpertBrain
            batches_per_epoch: 16
            batch_size: 32
            beta: 5.0e-3
            buffer_size: 512
            epsilon: 0.2
            gamma: 0.99
            hidden_units: 128
            lambd: 0.95
            learning_rate: 3.0e-4
            max_steps: 5.0e4
            normalize: true
            num_epoch: 5
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 1000
            use_recurrent: false
            memory_size: 8
        """
    )


@patch("mlagents.envs.BrainParameters")
def test_initialize_trainer_parameters_override_defaults(BrainParametersMock):
    summaries_dir = "test_dir"
    run_id = "testrun"
    model_path = "model_dir"
    keep_checkpoints = 1
    train_model = True
    load_model = False
    seed = 11

    base_config = dummy_offline_bc_config_with_override()
    expected_config = base_config["default"]
    expected_config["summary_path"] = summaries_dir + f"/{run_id}_testbrain"
    expected_config["model_path"] = model_path + "/testbrain"
    expected_config["keep_checkpoints"] = keep_checkpoints

    # Override value from specific brain config
    expected_config["normalize"] = False

    brain_params_mock = BrainParametersMock()
    external_brains = {"testbrain": brain_params_mock}

    def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id):
        assert brain == brain_params_mock
        assert trainer_parameters == expected_config
        assert training == train_model
        assert load == load_model
        assert seed == seed
        assert run_id == run_id

    with patch.object(OfflineBCTrainer, "__init__", mock_constructor):
        trainers = trainer_util.initialize_trainers(
            trainer_config=base_config,
            external_brains=external_brains,
            summaries_dir=summaries_dir,
            run_id=run_id,
            model_path=model_path,
            keep_checkpoints=keep_checkpoints,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
        )
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], OfflineBCTrainer)


@patch("mlagents.envs.BrainParameters")
def test_initialize_online_bc_trainer(BrainParametersMock):
    summaries_dir = "test_dir"
    run_id = "testrun"
    model_path = "model_dir"
    keep_checkpoints = 1
    train_model = True
    load_model = False
    seed = 11

    base_config = dummy_online_bc_config()
    expected_config = base_config["default"]
    expected_config["summary_path"] = summaries_dir + f"/{run_id}_testbrain"
    expected_config["model_path"] = model_path + "/testbrain"
    expected_config["keep_checkpoints"] = keep_checkpoints

    brain_params_mock = BrainParametersMock()
    external_brains = {"testbrain": brain_params_mock}

    def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id):
        assert brain == brain_params_mock
        assert trainer_parameters == expected_config
        assert training == train_model
        assert load == load_model
        assert seed == seed
        assert run_id == run_id

    with patch.object(OnlineBCTrainer, "__init__", mock_constructor):
        trainers = trainer_util.initialize_trainers(
            trainer_config=base_config,
            external_brains=external_brains,
            summaries_dir=summaries_dir,
            run_id=run_id,
            model_path=model_path,
            keep_checkpoints=keep_checkpoints,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
        )
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], OnlineBCTrainer)


@patch("mlagents.envs.BrainParameters")
def test_initialize_ppo_trainer(BrainParametersMock):
    brain_params_mock = BrainParametersMock()
    external_brains = {"testbrain": BrainParametersMock()}
    summaries_dir = "test_dir"
    run_id = "testrun"
    model_path = "model_dir"
    keep_checkpoints = 1
    train_model = True
    load_model = False
    seed = 11
    expected_reward_buff_cap = 1

    base_config = dummy_config()
    expected_config = base_config["default"]
    expected_config["summary_path"] = summaries_dir + f"/{run_id}_testbrain"
    expected_config["model_path"] = model_path + "/testbrain"
    expected_config["keep_checkpoints"] = keep_checkpoints

    def mock_constructor(
        self,
        brain,
        reward_buff_cap,
        trainer_parameters,
        training,
        load,
        seed,
        run_id,
        multi_gpu,
    ):
        self.trainer_metrics = TrainerMetrics("", "")
        assert brain == brain_params_mock
        assert trainer_parameters == expected_config
        assert reward_buff_cap == expected_reward_buff_cap
        assert training == train_model
        assert load == load_model
        assert seed == seed
        assert run_id == run_id
        assert multi_gpu == multi_gpu

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainers = trainer_util.initialize_trainers(
            trainer_config=base_config,
            external_brains=external_brains,
            summaries_dir=summaries_dir,
            run_id=run_id,
            model_path=model_path,
            keep_checkpoints=keep_checkpoints,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
        )
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], PPOTrainer)


@patch("mlagents.envs.BrainParameters")
def test_initialize_invalid_trainer_raises_exception(BrainParametersMock):
    summaries_dir = "test_dir"
    run_id = "testrun"
    model_path = "model_dir"
    keep_checkpoints = 1
    train_model = True
    load_model = False
    seed = 11
    bad_config = dummy_bad_config()
    external_brains = {"testbrain": BrainParametersMock()}

    with pytest.raises(UnityEnvironmentException):
        trainer_util.initialize_trainers(
            trainer_config=bad_config,
            external_brains=external_brains,
            summaries_dir=summaries_dir,
            run_id=run_id,
            model_path=model_path,
            keep_checkpoints=keep_checkpoints,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
        )
ml-agents/mlagents/trainers/trainer_util.py:

from typing import Any, Dict

from mlagents.trainers import MetaCurriculum
from mlagents.envs.exception import UnityEnvironmentException
from mlagents.trainers import Trainer
from mlagents.envs.brain import BrainParameters
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer
from mlagents.trainers.bc.online_trainer import OnlineBCTrainer


def initialize_trainers(
    trainer_config: Dict[str, Any],
    external_brains: Dict[str, BrainParameters],
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Dict[str, Trainer]:
    """
    Initializes trainers given a provided trainer configuration and set of brains from the environment, as well as
    some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param external_brains: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return: Dict mapping brain name to the initialized Trainer
    """
    trainers = {}
    trainer_parameters_dict = {}
    for brain_name in external_brains:
        trainer_parameters = trainer_config["default"].copy()
        trainer_parameters["summary_path"] = "{basedir}/{name}".format(
            basedir=summaries_dir, name=str(run_id) + "_" + brain_name
        )
        trainer_parameters["model_path"] = "{basedir}/{name}".format(
            basedir=model_path, name=brain_name
        )
        trainer_parameters["keep_checkpoints"] = keep_checkpoints
        if brain_name in trainer_config:
            _brain_key: Any = brain_name
            while not isinstance(trainer_config[_brain_key], dict):
                _brain_key = trainer_config[_brain_key]
            trainer_parameters.update(trainer_config[_brain_key])
        trainer_parameters_dict[brain_name] = trainer_parameters.copy()
    for brain_name in external_brains:
        if trainer_parameters_dict[brain_name]["trainer"] == "offline_bc":
            trainers[brain_name] = OfflineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "online_bc":
            trainers[brain_name] = OnlineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "ppo":
            trainers[brain_name] = PPOTrainer(
                external_brains[brain_name],
                meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
                if meta_curriculum
                else 1,
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
                multi_gpu,
            )
        else:
            raise UnityEnvironmentException(
                "The trainer config contains "
                "an unknown trainer type for "
                "brain {}".format(brain_name)
            )
    return trainers
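
One less obvious detail above is the while loop that resolves per-brain overrides: if a brain's entry in the YAML is a string rather than a mapping, it is followed as an alias to another section before being merged over the defaults. A standalone sketch of that resolution (the config keys below are invented for illustration):

# Standalone sketch of the override/alias resolution used above (keys are illustrative).
trainer_config = {
    "default": {"trainer": "ppo", "normalize": True},
    "SharedSettings": {"normalize": False},
    "BrainA": "SharedSettings",   # string value = alias to another section
    "BrainB": {"trainer": "online_bc"},
}

def resolve(brain_name):
    params = trainer_config["default"].copy()
    if brain_name in trainer_config:
        key = brain_name
        while not isinstance(trainer_config[key], dict):
            key = trainer_config[key]  # follow the alias chain until a mapping is found
        params.update(trainer_config[key])
    return params

print(resolve("BrainA"))  # {'trainer': 'ppo', 'normalize': False}
print(resolve("BrainB"))  # {'trainer': 'online_bc', 'normalize': True}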