
Merge pull request #1858 from Unity-Technologies/develop-esh-metrics

Added per-Brain logging of time to update policy, time elapsed during training, time to collect experiences, buffer length, and average return per policy update
/develop-generalizationTraining-TrainerController
GitHub, 6 years ago
Current commit: a0b44f1b
15 files changed, with 277 insertions and 61 deletions
  1. docs/Training-ML-Agents.md (4 changed lines)
  2. ml-agents/mlagents/trainers/__init__.py (1 changed line)
  3. ml-agents/mlagents/trainers/bc/offline_trainer.py (3 changed lines)
  4. ml-agents/mlagents/trainers/bc/online_trainer.py (3 changed lines)
  5. ml-agents/mlagents/trainers/bc/trainer.py (14 changed lines)
  6. ml-agents/mlagents/trainers/learn.py (23 changed lines)
  7. ml-agents/mlagents/trainers/ppo/trainer.py (30 changed lines)
  8. ml-agents/mlagents/trainers/tests/test_learn.py (1 changed line)
  9. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changed lines)
  10. ml-agents/mlagents/trainers/trainer.py (81 changed lines)
  11. ml-agents/mlagents/trainers/trainer_controller.py (30 changed lines)
  12. ml-agents/mlagents/trainers/trainer_metrics.py (107 changed lines)
  13. ml-agents/tests/trainers/test_trainer_metrics.py (39 changed lines)

docs/Training-ML-Agents.md (4 changed lines)


training doesn't involve visual observations (reading from Pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
* `--debug` - Specify this option to run ML-Agents in debug mode and log Trainer
Metrics to a CSV stored in the `summaries` directory. The metrics stored are:
brain name, time to update policy, time since start of training, time for last
experience collection, number of experiences used for training, and mean return.
This option is currently not available for Imitation Learning. A short sketch of
reading the resulting CSV follows below.
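As a rough illustration of consuming these metrics, the CSV can be read with Python's standard library. The file name below is an assumption made for the example; use whichever `.csv` file appears in your `summaries` directory.

```python
import csv

# Hypothetical path -- the actual file name depends on your run-id and Brain name.
metrics_path = 'summaries/ppo_MyBrain.csv'

with open(metrics_path) as f:
    for row in csv.DictReader(f):
        # Columns written by TrainerMetrics: 'Brain name', 'Time to update policy',
        # 'Time since start of training', 'Time for last experience collection',
        # 'Number of experiences used for training', 'Mean return'
        print(row['Brain name'], row['Time to update policy'], row['Mean return'])
```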
### Training config file

ml-agents/mlagents/trainers/__init__.py (1 changed line)


from .curriculum import *
from .meta_curriculum import *
from .models import *
from .trainer_metrics import *
from .trainer import *
from .policy import *
from .trainer_controller import *

ml-agents/mlagents/trainers/bc/offline_trainer.py (3 changed lines)


class OfflineBCTrainer(BCTrainer):
"""The OfflineBCTrainer is an implementation of Offline Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

ml-agents/mlagents/trainers/bc/online_trainer.py (3 changed lines)


class OnlineBCTrainer(BCTrainer):
"""The OnlineBCTrainer is an implementation of Online Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

ml-agents/mlagents/trainers/bc/trainer.py (14 changed lines)


# Contains an implementation of Behavioral Cloning Algorithm
import logging
import os
import numpy as np
import tensorflow as tf

class BCTrainer(Trainer):
"""The BCTrainer is an implementation of Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

:param run_id: The identifier of the current run
"""
super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
self.policy = BCPolicy(seed, brain, trainer_parameters, load)
self.n_sequences = 1
self.cumulative_rewards = {}

self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
@property
def parameters(self):

def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.
Gets called only when the academy resets.
"""
self.evaluation_buffer.reset_local_buffers()

ml-agents/mlagents/trainers/learn.py (23 changed lines)


fast_simulation = not bool(run_options['--slow'])
no_graphics = run_options['--no-graphics']
trainer_config_path = run_options['<trainer-config-path>']
# Recognize and use docker volume if one is passed as an argument
if not docker_target_name:
model_path = './models/{run_id}'.format(run_id=run_id)

tc = TrainerController(model_path, summaries_dir, run_id + '-' + str(sub_id),
save_freq, maybe_meta_curriculum,
load_model, train_model,
keep_checkpoints, lesson, external_brains, run_seed)
# Signal that environment has been launched.
process_queue.put(True)

if docker_training and env_path is not None:
"""
Comments for future maintenance:
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
container.
"""
# Navigate in docker path and find env_path and copy it.

def main():
try:
print('''
,m' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌

except:
print('\n\n\tUnity Technologies\n')
logger = logging.getLogger('mlagents.trainers')
_USAGE = '''
Usage:
mlagents-learn <trainer-config-path> [options]

--lesson=<n> Start learning from this lesson [default: 0].
--load Whether to load the model or randomly initialize [default: False].
--run-id=<path> The directory name for model and summary statistics [default: ppo].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--save-freq=<n> Frequency at which to save model [default: 50000].
--seed=<n> Random seed used for training [default: -1].
--slow Whether to run the game at training speed [default: False].

--no-graphics Whether to run the environment in no-graphics mode [default: False].
--debug Whether to run ML-Agents in debug mode with detailed logging [default: False].
trainer_logger = logging.getLogger('mlagents.trainers')
env_logger = logging.getLogger('mlagents.envs')
trainer_logger.info(options)
if options['--debug']:
trainer_logger.setLevel('DEBUG')
env_logger.setLevel('DEBUG')
num_runs = int(options['--num-runs'])
seed = int(options['--seed'])

ml-agents/mlagents/trainers/ppo/trainer.py (30 changed lines)


# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347).
import logging
import os
from collections import deque
import numpy as np

from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.trainer import Trainer
logger = logging.getLogger("mlagents.trainers")

def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

:param run_id: The identifier of the current run
"""
super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',

self.cumulative_rewards = {}
self._reward_buffer = deque(maxlen=reward_buff_cap)
self.episode_steps = {}
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __str__(self):
return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(

:param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.
"""
self.trainer_metrics.start_experience_collection_timer()
if take_action_outputs:
self.stats['Policy/Value Estimate'].append(take_action_outputs['value'].mean())
self.stats['Policy/Entropy'].append(take_action_outputs['entropy'].mean())

if agent_id not in self.episode_steps:
self.episode_steps[agent_id] = 0
self.episode_steps[agent_id] += 1
self.trainer_metrics.end_experience_collection_timer()
def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
"""

:param new_info: Dictionary of all next brains and corresponding BrainInfo.
"""
self.trainer_metrics.start_experience_collection_timer()
info = new_info[self.brain_name]
for l in range(len(info.agents)):
agent_actions = self.training_buffer[info.agents[l]]['actions']

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.cumulative_returns_since_policy_update.append(self.cumulative_rewards.get(agent_id, 0))
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))

self.stats['Policy/Curiosity Reward'].append(
self.intrinsic_rewards.get(agent_id, 0))
self.intrinsic_rewards[agent_id] = 0
self.trainer_metrics.end_experience_collection_timer()
A signal that the Episode has ended. The buffer must be reset.
Gets called only when the academy resets.
"""
self.training_buffer.reset_local_buffers()

"""
Uses demonstration_buffer to update the policy.
"""
self.trainer_metrics.start_policy_update_timer(
number_experiences=len(self.training_buffer.update_buffer['actions']),
mean_return=float(np.mean(self.cumulative_returns_since_policy_update)))
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
value_total, policy_total, forward_total, inverse_total = [], [], [], []
advantages = self.training_buffer.update_buffer['advantages'].get_batch()

for _ in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
buffer = self.training_buffer.update_buffer
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):

self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()
self.trainer_metrics.end_policy_update()
def discount_rewards(r, gamma=0.99, value_next=0.0):
"""

ml-agents/mlagents/trainers/tests/test_learn.py (1 changed line)


'--slow': False,
'--no-graphics': False,
'<trainer-config-path>': 'basic_path',
'--debug': False,
}

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changed lines)


import pytest
from mlagents.trainers import ActionInfo
from mlagents.trainers import TrainerMetrics
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer

def assert_ppo_trainer_constructed(input_config, tc, expected_brain_info,
expected_config, expected_reward_buff_cap=0):
def mock_constructor(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
self.trainer_metrics = TrainerMetrics('', '')
assert(brain == expected_brain_info)
assert(trainer_parameters == expected_config)
assert(reward_buff_cap == expected_reward_buff_cap)

ml-agents/mlagents/trainers/trainer.py (81 changed lines)


# # Unity ML-Agents Toolkit
import logging
import os
from mlagents.trainers import TrainerMetrics
LOGGER = logging.getLogger("mlagents.trainers")
class UnityTrainerException(UnityException):
"""

self.brain_name = brain.brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.cumulative_returns_since_policy_update = []
self.summary_writer = None
self.trainer_metrics = TrainerMetrics(path=self.summary_path + '.csv',
brain_name=self.brain_name)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
self.policy = None
def __str__(self):

"""
Returns the trainer parameters of the trainer.
"""
raise UnityTrainerException("The parameters property was not implemented.")
raise UnityTrainerException(
"The parameters property was not implemented.")
@property
def graph_scope(self):

raise UnityTrainerException("The graph_scope property was not implemented.")
raise UnityTrainerException(
"The graph_scope property was not implemented.")
@property
def get_max_steps(self):

"""
raise UnityTrainerException("The get_max_steps property was not implemented.")
raise UnityTrainerException(
"The get_max_steps property was not implemented.")
@property
def get_step(self):

"""
raise UnityTrainerException("The get_step property was not implemented.")
raise UnityTrainerException(
"The get_step property was not implemented.")
@property
def get_last_reward(self):

"""
raise UnityTrainerException("The get_last_reward property was not implemented.")
raise UnityTrainerException(
"The get_last_reward property was not implemented.")
def increment_step_and_update_last_reward(self):
"""

:param curr_info: Current BrainInfo.
:return: The ActionInfo given by the policy given the BrainInfo.
"""
self.trainer_metrics.start_experience_collection_timer()
action = self.policy.get_action(curr_info)
self.trainer_metrics.end_experience_collection_timer()
return action
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,
take_action_outputs):

:param next_info: Next AllBrainInfo.
:param take_action_outputs: The outputs of the take action method.
"""
raise UnityTrainerException("The add_experiences method was not implemented.")
raise UnityTrainerException(
"The add_experiences method was not implemented.")
def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
"""

:param next_info: Dictionary of all next-step brains and corresponding BrainInfo.
"""
raise UnityTrainerException("The process_experiences method was not implemented.")
raise UnityTrainerException(
"The process_experiences method was not implemented.")
A signal that the Episode has ended. The buffer must be reset.
A signal that the Episode has ended. The buffer must be reset.
raise UnityTrainerException("The end_episode method was not implemented.")
raise UnityTrainerException(
"The end_episode method was not implemented.")
def is_ready_update(self):
"""

raise UnityTrainerException("The is_ready_update method was not implemented.")
raise UnityTrainerException(
"The is_ready_update method was not implemented.")
raise UnityTrainerException("The update_model method was not implemented.")
raise UnityTrainerException(
"The update_model method was not implemented.")
def save_model(self):
"""

"""
self.policy.export_model()
def write_training_metrics(self):
"""
Write training metrics to a CSV file
:return:
"""
self.trainer_metrics.write_training_metrics()
def write_summary(self, global_step, delta_train_start, lesson_num=0):
"""
:param delta_train_start: Time elapsed since training started.
:param lesson_num: Current lesson number in curriculum.
:param global_step: The number of steps the simulation has been going for
"""

mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
LOGGER.info(" {}: {}: Step: {}. Time Elapsed: {:0.3f} s "
"Mean Reward: {:0.3f}. Std of Reward: {:0.3f}. {}"
delta_train_start,
mean_reward, np.std(self.stats['Environment/Cumulative Reward']),
LOGGER.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"
summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
self.summary_writer.add_summary(summary, self.get_step)

s = sess.run(s_op)
self.summary_writer.add_summary(s, self.get_step)
except:
LOGGER.info("Cannot write text summary for Tensorboard. Tensorflow version must be r1.2 or above.")
pass

ml-agents/mlagents/trainers/trainer_controller.py (30 changed lines)


import numpy as np
import tensorflow as tf
from time import time
from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.envs.exception import UnityEnvironmentException

self.train_model = train
self.keep_checkpoints = keep_checkpoints
self.trainers: Dict[str, Trainer] = {}
self.trainer_metrics: Dict[str, TrainerMetrics] = {}
self.training_start_time = time()
np.random.seed(self.seed)
tf.set_random_seed(self.seed)

return True
return False
def _write_training_metrics(self):
"""
Write all CSV metrics
:return:
"""
for brain_name in self.trainers.keys():
if brain_name in self.trainer_metrics:
self.trainers[brain_name].write_training_metrics()
def _export_graph(self):
"""
Exports latest saved models to .nn format for Unity embedding.

:param trainer_config: The configurations of the trainers
"""
trainer_parameters_dict = {}
for brain_name in self.external_brains:
trainer_parameters = trainer_config['default'].copy()
trainer_parameters['summary_path'] = '{basedir}/{name}'.format(

.brains_to_curriculums[brain_name]
.min_lesson_length if self.meta_curriculum else 0,
trainer_parameters_dict[brain_name],
self.train_model, self.load_model, self.seed, self.run_id)
self.trainer_metrics[brain_name] = self.trainers[brain_name].trainer_metrics
else:
raise UnityEnvironmentException('The trainer config contains '
'an unknown trainer type for '

self._save_model_when_interrupted(steps=self.global_step)
pass
env.close()
self._write_training_metrics()
self._export_graph()
def take_step(self, env, curr_info: AllBrainInfo):

take_action_text[brain_name] = action_info.text
take_action_value[brain_name] = action_info.value
take_action_outputs[brain_name] = action_info.outputs
time_start_step = time()
new_info = env.step(
vector_action=take_action_vector,
memory=take_action_memories,

delta_time_step = time() - time_start_step
if brain_name in self.trainer_metrics:
self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
trainer.add_experiences(curr_info, new_info,
take_action_outputs[brain_name])
trainer.process_experiences(curr_info, new_info)

delta_train_start = time() - self.training_start_time
delta_train_start, lesson_num=self.meta_curriculum
trainer.write_summary(self.global_step, delta_train_start)
if self.train_model \
and trainer.get_step <= trainer.get_max_steps:
trainer.increment_step_and_update_last_reward()
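The take_step changes above time `env.step` once and attribute the same delta to every Brain's metrics. Here is a stripped-down sketch of that pattern, assuming an `env`, a list of `brain_names`, and a `trainer_metrics` dict like the one the controller builds; the helper name is hypothetical.

```python
from time import time

def timed_env_step(env, brain_names, trainer_metrics, **step_kwargs):
    # Time a single environment step and charge it to every Brain being trained.
    time_start_step = time()
    new_info = env.step(**step_kwargs)
    delta_time_step = time() - time_start_step
    for brain_name in brain_names:
        if brain_name in trainer_metrics:
            trainer_metrics[brain_name].add_delta_step(delta_time_step)
    return new_info
```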

ml-agents/mlagents/trainers/trainer_metrics.py (107 changed lines)


# # Unity ML-Agents Toolkit
import logging
import csv
from time import time
LOGGER = logging.getLogger("mlagents.trainers")
FIELD_NAMES = ['Brain name', 'Time to update policy',
'Time since start of training', 'Time for last experience collection',
'Number of experiences used for training', 'Mean return']
class TrainerMetrics:
"""
Helper class to track, write training metrics. Tracks time since object
of this class is initialized.
"""
def __init__(self, path: str, brain_name: str):
"""
:str path: Fully qualified path where CSV is stored.
:str brain_name: Identifier for the Brain which we are training
"""
self.path = path
self.brain_name = brain_name
self.rows = []
self.time_start_experience_collection = None
self.time_training_start = time()
self.last_buffer_length = None
self.last_mean_return = None
self.time_policy_update_start = None
self.delta_last_experience_collection = None
self.delta_policy_update = None
def start_experience_collection_timer(self):
"""
Inform Metrics class that experience collection is starting. Intended to be idempotent
"""
if self.time_start_experience_collection is None:
self.time_start_experience_collection = time()
def end_experience_collection_timer(self):
"""
Inform Metrics class that experience collection is done.
"""
if self.time_start_experience_collection:
curr_delta = time() - self.time_start_experience_collection
if self.delta_last_experience_collection is None:
self.delta_last_experience_collection = curr_delta
else:
self.delta_last_experience_collection += curr_delta
self.time_start_experience_collection = None
def add_delta_step(self, delta: float):
"""
Inform Metrics class about time to step in environment.
"""
if self.delta_last_experience_collection:
self.delta_last_experience_collection += delta
else:
self.delta_last_experience_collection = delta
def start_policy_update_timer(self, number_experiences: int, mean_return: float):
"""
Inform Metrics class that policy update has started.
:int number_experiences: Number of experiences in Buffer at this point.
:float mean_return: Return averaged across all cumulative returns since last policy update
"""
self.last_buffer_length = number_experiences
self.last_mean_return = mean_return
self.time_policy_update_start = time()
def _add_row(self, delta_train_start):
row = [self.brain_name]
row.extend(format(c, '.3f') if isinstance(c, float) else c
for c in [self.delta_policy_update, delta_train_start,
self.delta_last_experience_collection,
self.last_buffer_length, self.last_mean_return])
self.delta_last_experience_collection = None
self.rows.append(row)
def end_policy_update(self):
"""
Inform Metrics class that policy update has ended.
"""
if self.time_policy_update_start:
self.delta_policy_update = time() - self.time_policy_update_start
else:
self.delta_policy_update = 0
delta_train_start = time() - self.time_training_start
LOGGER.debug(" Policy Update Training Metrics for {}: "
"\n\t\tTime to update Policy: {:0.3f} s \n"
"\t\tTime elapsed since training: {:0.3f} s \n"
"\t\tTime for experience collection: {:0.3f} s \n"
"\t\tBuffer Length: {} \n"
"\t\tReturns : {:0.3f}\n"
.format(self.brain_name, self.delta_policy_update,
delta_train_start, self.delta_last_experience_collection,
self.last_buffer_length, self.last_mean_return))
self._add_row(delta_train_start)
def write_training_metrics(self):
"""
Write Training Metrics to CSV
"""
with open(self.path, 'w') as file:
writer = csv.writer(file)
writer.writerow(FIELD_NAMES)
for row in self.rows:
writer.writerow(row)
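To make the life cycle of TrainerMetrics concrete, here is a small self-contained sketch that exercises it end to end. The loop bodies, buffer size, return value, and output path are placeholder stand-ins for real training work; only the `TrainerMetrics` calls mirror the class above.

```python
import os
import tempfile

from mlagents.trainers.trainer_metrics import TrainerMetrics

csv_path = os.path.join(tempfile.gettempdir(), 'demo_metrics.csv')  # placeholder location
metrics = TrainerMetrics(path=csv_path, brain_name='DemoBrain')

for _ in range(3):  # pretend to alternate collection and policy updates
    metrics.start_experience_collection_timer()
    # ... gather experiences here ...
    metrics.end_experience_collection_timer()

    metrics.add_delta_step(0.01)  # time spent inside env.step, measured by the controller
    metrics.start_policy_update_timer(number_experiences=1024, mean_return=1.5)
    # ... run the policy update here ...
    metrics.end_policy_update()   # appends one row per update and emits a DEBUG log line

metrics.write_training_metrics()  # writes the FIELD_NAMES header plus the buffered rows
```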

ml-agents/tests/trainers/test_trainer_metrics.py (39 changed lines)


import unittest.mock as mock
from mlagents.trainers import TrainerMetrics
class TestTrainerMetrics:
def test_field_names(self):
field_names = ['Brain name', 'Time to update policy',
'Time since start of training',
'Time for last experience collection',
'Number of experiences used for training', 'Mean return']
from mlagents.trainers.trainer_metrics import FIELD_NAMES
assert FIELD_NAMES == field_names
@mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
def test_experience_collection_timer(self):
mock_path = 'fake'
mock_brain_name = 'fake'
trainer_metrics = TrainerMetrics(path=mock_path,
brain_name=mock_brain_name)
trainer_metrics.start_experience_collection_timer()
trainer_metrics.end_experience_collection_timer()
assert trainer_metrics.delta_last_experience_collection == 0
@mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
def test_policy_update_timer(self):
mock_path = 'fake'
mock_brain_name = 'fake'
fake_buffer_length = 350
fake_mean_return = 0.3
trainer_metrics = TrainerMetrics(path=mock_path,
brain_name=mock_brain_name)
trainer_metrics.start_experience_collection_timer()
trainer_metrics.end_experience_collection_timer()
trainer_metrics.start_policy_update_timer(number_experiences=fake_buffer_length,
mean_return=fake_mean_return)
trainer_metrics.end_policy_update()
fake_row = [mock_brain_name, 0, 0, 0, 350, '0.300']
assert trainer_metrics.rows[0] == fake_row
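A further case in the same style could exercise `add_delta_step` directly. This is a sketch, not a test from the PR.

```python
from mlagents.trainers import TrainerMetrics

def test_add_delta_step():
    trainer_metrics = TrainerMetrics(path='fake', brain_name='fake')
    # With no collection timer running, the first delta is stored as-is
    # and a second call accumulates onto it.
    trainer_metrics.add_delta_step(0.5)
    trainer_metrics.add_delta_step(0.25)
    assert trainer_metrics.delta_last_experience_collection == 0.75
```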