浏览代码

Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return

/develop-generalizationTraining-TrainerController
eshvk 6 年前
当前提交
cc9bdf17
共有 14 个文件被更改,包括 252 次插入55 次删除
  1. 5
      docs/Training-ML-Agents.md
  2. 1
      ml-agents/mlagents/trainers/__init__.py
  3. 3
      ml-agents/mlagents/trainers/bc/offline_trainer.py
  4. 3
      ml-agents/mlagents/trainers/bc/online_trainer.py
  5. 12
      ml-agents/mlagents/trainers/bc/trainer.py
  6. 23
      ml-agents/mlagents/trainers/learn.py
  7. 23
      ml-agents/mlagents/trainers/ppo/trainer.py
  8. 1
      ml-agents/mlagents/trainers/tests/test_learn.py
  9. 78
      ml-agents/mlagents/trainers/trainer.py
  10. 20
      ml-agents/mlagents/trainers/trainer_controller.py
  11. 98
      ml-agents/mlagents/trainers/trainer_metrics.py
  12. 40
      ml-agents/tests/trainers/test_trainer_metrics.py

5
docs/Training-ML-Agents.md


training doesn't involve visual observations (reading from Pixels). See
[here](https://docs.unity3d.com/Manual/CommandLineArguments.html) for more
details.
* `--debug` - Specify this option to run ML-Agents in debug mode and log Trainer
Metrics to a CSV stored in the `summaries` directory. The metrics stored are:
Brain name, Time to update policy, Time since start of training, Time for last experience collection, Number of experiences used for training, Mean return. This
option is currently not available for Imitation Learning.
`
### Training config file

1
ml-agents/mlagents/trainers/__init__.py


from .curriculum import *
from .meta_curriculum import *
from .models import *
from .trainer_metrics import *
from .trainer import *
from .policy import *
from .trainer_controller import *

3
ml-agents/mlagents/trainers/bc/offline_trainer.py


class OfflineBCTrainer(BCTrainer):
"""The OfflineBCTrainer is an implementation of Offline Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

3
ml-agents/mlagents/trainers/bc/online_trainer.py


class OnlineBCTrainer(BCTrainer):
"""The OnlineBCTrainer is an implementation of Online Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

12
ml-agents/mlagents/trainers/bc/trainer.py


# Contains an implementation of Behavioral Cloning Algorithm
import logging
import os
import numpy as np
import tensorflow as tf

class BCTrainer(Trainer):
"""The BCTrainer is an implementation of Behavioral Cloning."""
def __init__(self, brain, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, trainer_parameters, training, load, seed,
run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

:param run_id: The The identifier of the current run
"""
super(BCTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(BCTrainer, self).__init__(brain, trainer_parameters, training,
run_id)
self.policy = BCPolicy(seed, brain, trainer_parameters, load)
self.n_sequences = 1
self.cumulative_rewards = {}

self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
@property
def parameters(self):

23
ml-agents/mlagents/trainers/learn.py


fast_simulation = not bool(run_options['--slow'])
no_graphics = run_options['--no-graphics']
trainer_config_path = run_options['<trainer-config-path>']
# Recognize and use docker volume if one is passed as an argument
if not docker_target_name:
model_path = './models/{run_id}'.format(run_id=run_id)

tc = TrainerController(model_path, summaries_dir, run_id + '-' + str(sub_id),
save_freq, maybe_meta_curriculum,
load_model, train_model,
keep_checkpoints, lesson, external_brains, run_seed)
keep_checkpoints, lesson, external_brains,
run_seed)
# Signal that environment has been launched.
process_queue.put(True)

if docker_training and env_path is not None:
"""
Comments for future maintenance:
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
Some OS/VM instances (e.g. COS GCP Image) mount filesystems
with COS flag which prevents execution of the Unity scene,
to get around this, we will copy the executable into the
container.
"""
# Navigate in docker path and find env_path and copy it.

def main():
try:
print('''
,m' ,▓▓▓▀▓▓▄ ▓▓▓ ▓▓▌

except:
print('\n\n\tUnity Technologies\n')
logger = logging.getLogger('mlagents.trainers')
_USAGE = '''
Usage:
mlagents-learn <trainer-config-path> [options]

--lesson=<n> Start learning from this lesson [default: 0].
--load Whether to load the model or randomly initialize [default: False].
--run-id=<path> The directory name for model and summary statistics [default: ppo].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--num-runs=<n> Number of concurrent training sessions [default: 1].
--save-freq=<n> Frequency at which to save model [default: 50000].
--seed=<n> Random seed used for training [default: -1].
--slow Whether to run the game at training speed [default: False].

--no-graphics Whether to run the environment in no-graphics mode [default: False].
--debug Whether to run ML-Agents in debug mode with detailed logging [default: False].
logger.info(options)
trainer_logger = logging.getLogger('mlagents.trainers')
env_logger = logging.getLogger('mlagents.envs')
trainer_logger.info(options)
if options['--debug']:
trainer_logger.setLevel('DEBUG')
env_logger.setLevel('DEBUG')
num_runs = int(options['--num-runs'])
seed = int(options['--seed'])

23
ml-agents/mlagents/trainers/ppo/trainer.py


# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347).
import logging
import os
from collections import deque
import numpy as np

class PPOTrainer(Trainer):
"""The PPOTrainer is an implementation of the PPO algorithm."""
def __init__(self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id):
def __init__(self, brain, reward_buff_cap, trainer_parameters, training,
load, seed, run_id):
"""
Responsible for collecting experiences and training PPO model.
:param trainer_parameters: The parameters for the trainer (dictionary).

:param run_id: The The identifier of the current run
"""
super(PPOTrainer, self).__init__(brain, trainer_parameters, training, run_id)
super(PPOTrainer, self).__init__(brain, trainer_parameters,
training, run_id)
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',

self.cumulative_rewards = {}
self._reward_buffer = deque(maxlen=reward_buff_cap)
self.episode_steps = {}
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
def __str__(self):
return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(

self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:
self.cumulative_returns_since_policy_update.append(self.
cumulative_rewards.get(agent_id, 0))
self.stats['Environment/Cumulative Reward'].append(
self.cumulative_rewards.get(agent_id, 0))
self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))

def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.
A signal that the Episode has ended. The buffer must be reset.
Get only called when the academy resets.
"""
self.training_buffer.reset_local_buffers()

"""
Uses demonstration_buffer to update the policy.
"""
self.trainer_metrics.end_experience_collection_timer()
self.trainer_metrics.start_policy_update_timer(number_experiences=len(self.training_buffer.update_buffer['actions']),
mean_return = float(np.mean(self.cumulative_returns_since_policy_update)))
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
value_total, policy_total, forward_total, inverse_total = [], [], [], []
advantages = self.training_buffer.update_buffer['advantages'].get_batch()

for k in range(num_epoch):
for _ in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
buffer = self.training_buffer.update_buffer
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):

self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
self.training_buffer.reset_update_buffer()
self.trainer_metrics.end_policy_update()
def discount_rewards(r, gamma=0.99, value_next=0.0):
"""

1
ml-agents/mlagents/trainers/tests/test_learn.py


'--slow': False,
'--no-graphics': False,
'<trainer-config-path>': 'basic_path',
'--debug': False,
}

78
ml-agents/mlagents/trainers/trainer.py


# # Unity ML-Agents Toolkit
import logging
import os
logger = logging.getLogger("mlagents.trainers")
from mlagents.trainers import TrainerMetrics
LOGGER = logging.getLogger("mlagents.trainers")
class UnityTrainerException(UnityException):
"""

self.brain_name = brain.brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)
self.trainer_metrics = TrainerMetrics(path=self.summary_path + '.csv',
brain_name=self.brain_name)
self.cumulative_returns_since_policy_update = []
self.summary_writer = None
self.summary_writer = tf.summary.FileWriter(self.summary_path)
self.policy = None
def __str__(self):

"""
Returns the trainer parameters of the trainer.
"""
raise UnityTrainerException("The parameters property was not implemented.")
raise UnityTrainerException(
"The parameters property was not implemented.")
@property
def graph_scope(self):

raise UnityTrainerException("The graph_scope property was not implemented.")
raise UnityTrainerException(
"The graph_scope property was not implemented.")
@property
def get_max_steps(self):

"""
raise UnityTrainerException("The get_max_steps property was not implemented.")
raise UnityTrainerException(
"The get_max_steps property was not implemented.")
@property
def get_step(self):

"""
raise UnityTrainerException("The get_step property was not implemented.")
raise UnityTrainerException(
"The get_step property was not implemented.")
@property
def get_last_reward(self):

"""
raise UnityTrainerException("The get_last_reward property was not implemented.")
raise UnityTrainerException(
"The get_last_reward property was not implemented.")
def increment_step_and_update_last_reward(self):
"""

:param curr_info: Current BrainInfo.
:return: The ActionInfo given by the policy given the BrainInfo.
"""
self.trainer_metrics.start_experience_collection_timer()
return self.policy.get_action(curr_info)
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,

:param next_info: Next AllBrainInfo.
:param take_action_outputs: The outputs of the take action method.
"""
raise UnityTrainerException("The add_experiences method was not implemented.")
raise UnityTrainerException(
"The add_experiences method was not implemented.")
def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
"""

:param next_info: Dictionary of all next-step brains and corresponding BrainInfo.
"""
raise UnityTrainerException("The process_experiences method was not implemented.")
raise UnityTrainerException(
"The process_experiences method was not implemented.")
A signal that the Episode has ended. The buffer must be reset.
A signal that the Episode has ended. The buffer must be reset.
raise UnityTrainerException("The end_episode method was not implemented.")
raise UnityTrainerException(
"The end_episode method was not implemented.")
def is_ready_update(self):
"""

raise UnityTrainerException("The is_ready_update method was not implemented.")
raise UnityTrainerException(
"The is_ready_update method was not implemented.")
raise UnityTrainerException("The update_model method was not implemented.")
raise UnityTrainerException(
"The update_model method was not implemented.")
def save_model(self):
"""

"""
self.policy.export_model()
def write_summary(self, global_step, lesson_num=0):
def write_training_metrics(self):
"""
Write this trainer's training metrics to a CSV file by delegating to
``self.trainer_metrics.write_training_metrics()``.
"""
self.trainer_metrics.write_training_metrics()
def write_summary(self, global_step, delta_train_start, lesson_num=0):
:param delta_train_start: Time elapsed since training started.
:param lesson_num: Current lesson number in curriculum.
:param global_step: The number of steps the simulation has been going for
"""

mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
logger.info(" {}: {}: Step: {}. Mean Reward: {:0.3f}. Std of Reward: {:0.3f}. {}"
mean_reward = np.mean(
self.stats['Environment/Cumulative Reward'])
LOGGER.info(" {}: {}: Step: {}. "
"Time Elapsed: {:0.3f} s "
"Mean "
"Reward: {"
":0.3f}. Std of Reward: {:0.3f}. {}"
mean_reward, np.std(self.stats['Environment/Cumulative Reward']),
delta_train_start,
mean_reward, np.std(
self.stats['Environment/Cumulative Reward']),
logger.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"
LOGGER.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"
summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
summary.value.add(tag='{}'.format(
key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
self.summary_writer.add_summary(summary, self.get_step)

s = sess.run(s_op)
self.summary_writer.add_summary(s, self.get_step)
except:
logger.info(
LOGGER.info(
"Cannot write text summary for Tensorboard. Tensorflow version must be r1.2 or above.")
pass

20
ml-agents/mlagents/trainers/trainer_controller.py


import numpy as np
import tensorflow as tf
from time import time
from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.envs.exception import UnityEnvironmentException

self.global_step = 0
self.meta_curriculum = meta_curriculum
self.seed = training_seed
self.training_start_time = time()
np.random.seed(self.seed)
tf.set_random_seed(self.seed)

sys.exit()
return True
return False
def _write_training_metrics(self):
"""
Write the CSV metrics of every trainer by calling
``write_training_metrics()`` on each entry of ``self.trainers``.
"""
for brain_name in self.trainers.keys():
self.trainers[brain_name].write_training_metrics()
def _export_graph(self):
"""

.brains_to_curriculums[brain_name]
.min_lesson_length if self.meta_curriculum else 0,
trainer_parameters_dict[brain_name],
self.train_model, self.load_model, self.seed, self.run_id)
self.train_model, self.load_model, self.seed,
self.run_id)
else:
raise UnityEnvironmentException('The trainer config contains '
'an unknown trainer type for '

self._save_model_when_interrupted(steps=self.global_step)
pass
env.close()
self._write_training_metrics()
self._export_graph()
def take_step(self, env, curr_info: AllBrainInfo):

# Perform gradient descent with experience buffer
trainer.update_policy()
# Write training statistics to Tensorboard.
delta_train_start = time() - self.training_start_time
lesson_num=self.meta_curriculum
delta_train_start, lesson_num=self.meta_curriculum
trainer.write_summary(self.global_step)
trainer.write_summary(self.global_step, delta_train_start)
if self.train_model \
and trainer.get_step <= trainer.get_max_steps:
trainer.increment_step_and_update_last_reward()

98
ml-agents/mlagents/trainers/trainer_metrics.py


# # Unity ML-Agents Toolkit
import logging
import csv
from time import time
LOGGER = logging.getLogger("mlagents.trainers")
class TrainerMetrics:
    """
    Helper class to track and write training metrics for a single Brain.

    The "time since start of training" reported in each row is measured
    from the moment an object of this class is initialized. One row is
    recorded per completed policy update and all rows are written to a CSV
    file by ``write_training_metrics``.
    """

    def __init__(self, path: str, brain_name: str):
        """
        :str path: Fully qualified path where CSV is stored.
        :str brain_name: Identifier for the Brain which we are training
        """
        self.path = path
        self.brain_name = brain_name
        # Column headers of the CSV, in the same order as the values
        # assembled by ``_add_row``.
        self.FIELD_NAMES = ['Brain name', 'Time to update policy',
                            'Time since start of training',
                            'Time for last experience collection',
                            'Number of experiences used for training',
                            'Mean return']
        self.rows = []
        self.time_start_experience_collection = None
        self.time_training_start = time()
        self.last_buffer_length = None
        self.last_mean_return = None
        self.time_policy_update_start = None
        self.delta_last_experience_collection = None
        self.delta_policy_update = None

    def start_experience_collection_timer(self) -> None:
        """
        Inform Metrics class that experience collection is starting.

        Intended to be idempotent: only the first call after the previous
        collection ended records a start time; later calls are no-ops.
        """
        if self.time_start_experience_collection is None:
            self.time_start_experience_collection = time()

    def end_experience_collection_timer(self) -> None:
        """
        Inform Metrics class that experience collection is done.

        Stores the elapsed collection time, or 0.0 when no collection was
        started, and clears the start time so the next collection can be
        timed.
        """
        # Test the recorded timestamp, not the bound method of the similar
        # name (which is always truthy and made the else branch unreachable).
        if self.time_start_experience_collection is not None:
            self.delta_last_experience_collection = (
                time() - self.time_start_experience_collection)
        else:
            self.delta_last_experience_collection = 0.0
        self.time_start_experience_collection = None

    def start_policy_update_timer(self, number_experiences: int,
                                  mean_return: float) -> None:
        """
        Inform Metrics class that policy update has started.
        :int number_experiences: Number of experiences in Buffer at this point.
        :float mean_return: Return averaged across all cumulative returns
            since last policy update
        """
        self.last_buffer_length = number_experiences
        self.last_mean_return = mean_return
        self.time_policy_update_start = time()

    def _add_row(self, delta_train_start) -> None:
        """
        Append one CSV row built from the most recently recorded metrics.

        Float values are rendered with three decimals; every other value
        (ints, None) is stored unchanged.
        :param delta_train_start: Time elapsed since training started.
        """
        row = [self.brain_name]
        row.extend(format(c, '.3f') if isinstance(c, float) else c
                   for c in [self.delta_policy_update, delta_train_start,
                             self.delta_last_experience_collection,
                             self.last_buffer_length, self.last_mean_return])
        self.rows.append(row)

    def end_policy_update(self) -> None:
        """
        Inform Metrics class that policy update has ended.

        Computes the duration of the update, logs all metrics at DEBUG
        level and records them as a new row.
        """
        # Explicit None check: a timestamp should not be judged by its
        # truthiness.
        if self.time_policy_update_start is not None:
            self.delta_policy_update = time() - self.time_policy_update_start
        else:
            self.delta_policy_update = 0
        delta_train_start = time() - self.time_training_start
        LOGGER.debug(" Policy Update Training Metrics for {}: "
                     "\n\t\tTime to update Policy: {:0.3f} s \n"
                     "\t\tTime elapsed since training: {:0.3f} s \n"
                     "\t\tTime for experience collection: {:0.3f} s \n"
                     "\t\tBuffer Length: {} \n"
                     "\t\tReturns : {:0.3f}\n"
                     .format(self.brain_name, self.delta_policy_update,
                             delta_train_start,
                             self.delta_last_experience_collection,
                             self.last_buffer_length, self.last_mean_return))
        self._add_row(delta_train_start)

    def write_training_metrics(self) -> None:
        """
        Write Training Metrics to CSV

        The file at ``self.path`` is overwritten with the header followed
        by every row recorded so far.
        """
        # newline='' lets the csv module control line endings itself;
        # without it, extra blank rows appear on platforms using \r\n.
        with open(self.path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.FIELD_NAMES)
            writer.writerows(self.rows)

40
ml-agents/tests/trainers/test_trainer_metrics.py


import unittest.mock as mock
from mlagents.trainers import TrainerMetrics
class TestTrainerMetrics:
    """Unit tests for the TrainerMetrics helper."""

    def test_field_names(self):
        """A new TrainerMetrics exposes the expected CSV column names."""
        expected_header = [
            'Brain name',
            'Time to update policy',
            'Time since start of training',
            'Time for last experience collection',
            'Number of experiences used for training',
            'Mean return',
        ]
        metrics = TrainerMetrics(path='fake', brain_name='fake')
        assert metrics.FIELD_NAMES == expected_header

    @mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
    def test_experience_collection_timer(self):
        """With the clock frozen, a start/end pair measures zero elapsed time."""
        metrics = TrainerMetrics(path='fake', brain_name='fake')
        metrics.start_experience_collection_timer()
        metrics.end_experience_collection_timer()
        assert metrics.delta_last_experience_collection == 0

    @mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
    def test_policy_update_timer(self):
        """A full collection + policy update cycle records exactly one row."""
        brain = 'fake'
        buffer_length = 350
        mean_return = 0.3
        metrics = TrainerMetrics(path='fake', brain_name=brain)

        metrics.start_experience_collection_timer()
        metrics.end_experience_collection_timer()
        metrics.start_policy_update_timer(number_experiences=buffer_length,
                                          mean_return=mean_return)
        metrics.end_policy_update()

        # The frozen clock makes every time delta the int 0, which is stored
        # as-is; only the float mean return is rendered with three decimals.
        expected_row = [brain, 0, 0, 0, 350, '0.300']
        assert metrics.rows[0] == expected_row
正在加载...
取消
保存