Added logging per Brain of time to update policy, time elapsed during training, time to collect experiences, buffer length, average return
/develop-generalizationTraining-TrainerController
eshvk
6 years ago
Current commit
cc9bdf17
14 files changed, 252 insertions and 55 deletions
5   docs/Training-ML-Agents.md
1   ml-agents/mlagents/trainers/__init__.py
3   ml-agents/mlagents/trainers/bc/offline_trainer.py
3   ml-agents/mlagents/trainers/bc/online_trainer.py
12  ml-agents/mlagents/trainers/bc/trainer.py
23  ml-agents/mlagents/trainers/learn.py
23  ml-agents/mlagents/trainers/ppo/trainer.py
1   ml-agents/mlagents/trainers/tests/test_learn.py
78  ml-agents/mlagents/trainers/trainer.py
20  ml-agents/mlagents/trainers/trainer_controller.py
98  ml-agents/mlagents/trainers/trainer_metrics.py
40  ml-agents/tests/trainers/test_trainer_metrics.py
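
Per the commit description, each Brain gets its own CSV of training metrics. As a rough illustration of the output format, a written file could look like the following; the header matches FIELD_NAMES in trainer_metrics.py below, but the data values here are invented:

Brain name,Time to update policy,Time since start of training,Time for last experience collection,Number of experiences used for training,Mean return
Ball3DBrain,0.512,63.406,2.104,1024,0.500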
ml-agents/mlagents/trainers/trainer_metrics.py

# Unity ML-Agents Toolkit
import logging
import csv

from time import time

LOGGER = logging.getLogger("mlagents.trainers")


class TrainerMetrics:
    """
    Helper class to track and write training metrics. Tracks time elapsed
    since the object was initialized.
    """

    def __init__(self, path: str, brain_name: str):
        """
        :param str path: Fully qualified path where the CSV is stored.
        :param str brain_name: Identifier for the Brain being trained.
        """
        self.path = path
        self.brain_name = brain_name
        self.FIELD_NAMES = ['Brain name', 'Time to update policy',
                            'Time since start of training',
                            'Time for last experience collection',
                            'Number of experiences used for training',
                            'Mean return']
        self.rows = []
        self.time_start_experience_collection = None
        self.time_training_start = time()
        self.last_buffer_length = None
        self.last_mean_return = None
        self.time_policy_update_start = None
        self.delta_last_experience_collection = None
        self.delta_policy_update = None

    def start_experience_collection_timer(self):
        """
        Inform the metrics class that experience collection is starting.
        Intended to be idempotent.
        """
        if self.time_start_experience_collection is None:
            self.time_start_experience_collection = time()

    def end_experience_collection_timer(self):
        """
        Inform the metrics class that experience collection is done.
        """
        if self.time_start_experience_collection is not None:
            self.delta_last_experience_collection = time() - self.time_start_experience_collection
        else:
            self.delta_last_experience_collection = 0.0
        self.time_start_experience_collection = None

    def start_policy_update_timer(self, number_experiences: int, mean_return: float):
        """
        Inform the metrics class that a policy update has started.
        :param int number_experiences: Number of experiences in the Buffer at this point.
        :param float mean_return: Return averaged across all cumulative returns since the last policy update.
        """
        self.last_buffer_length = number_experiences
        self.last_mean_return = mean_return
        self.time_policy_update_start = time()

    def _add_row(self, delta_train_start):
        # Format floats to three decimal places; leave ints and strings as-is.
        row = [self.brain_name]
        row.extend(format(c, '.3f') if isinstance(c, float) else c
                   for c in [self.delta_policy_update, delta_train_start,
                             self.delta_last_experience_collection,
                             self.last_buffer_length, self.last_mean_return])
        self.rows.append(row)

    def end_policy_update(self):
        """
        Inform the metrics class that the policy update has ended.
        """
        if self.time_policy_update_start:
            self.delta_policy_update = time() - self.time_policy_update_start
        else:
            self.delta_policy_update = 0
        delta_train_start = time() - self.time_training_start
        LOGGER.debug(" Policy Update Training Metrics for {}: "
                     "\n\t\tTime to update policy: {:0.3f} s \n"
                     "\t\tTime since start of training: {:0.3f} s \n"
                     "\t\tTime for last experience collection: {:0.3f} s \n"
                     "\t\tBuffer length: {} \n"
                     "\t\tMean return: {:0.3f}\n"
                     .format(self.brain_name, self.delta_policy_update,
                             delta_train_start, self.delta_last_experience_collection,
                             self.last_buffer_length, self.last_mean_return))
        self._add_row(delta_train_start)

    def write_training_metrics(self):
        """
        Write the training metrics to CSV.
        """
        with open(self.path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.FIELD_NAMES)
            for row in self.rows:
                writer.writerow(row)
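
For context, here is a minimal usage sketch of the intended call sequence; the loop structure, path, brain name, and buffer numbers are illustrative assumptions, not taken from this commit:

from mlagents.trainers import TrainerMetrics

metrics = TrainerMetrics(path='Ball3DBrain_metrics.csv', brain_name='Ball3DBrain')

for _ in range(3):  # stand-in for the outer training loop
    metrics.start_experience_collection_timer()
    # ... environment stepping / experience collection would happen here ...
    metrics.end_experience_collection_timer()

    metrics.start_policy_update_timer(number_experiences=1024, mean_return=0.5)
    # ... the policy update would happen here ...
    metrics.end_policy_update()  # logs the metrics and buffers a CSV row

metrics.write_training_metrics()  # writes the header plus all buffered rows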
ml-agents/tests/trainers/test_trainer_metrics.py

import unittest.mock as mock
from mlagents.trainers import TrainerMetrics


class TestTrainerMetrics:

    def test_field_names(self):
        field_names = ['Brain name', 'Time to update policy',
                       'Time since start of training',
                       'Time for last experience collection',
                       'Number of experiences used for training',
                       'Mean return']
        mock_path = 'fake'
        mock_brain_name = 'fake'
        trainer_metrics = TrainerMetrics(path=mock_path,
                                         brain_name=mock_brain_name)
        assert trainer_metrics.FIELD_NAMES == field_names

    @mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
    def test_experience_collection_timer(self):
        mock_path = 'fake'
        mock_brain_name = 'fake'
        trainer_metrics = TrainerMetrics(path=mock_path,
                                         brain_name=mock_brain_name)
        trainer_metrics.start_experience_collection_timer()
        trainer_metrics.end_experience_collection_timer()
        assert trainer_metrics.delta_last_experience_collection == 0

    @mock.patch('mlagents.trainers.trainer_metrics.time', mock.MagicMock(return_value=42))
    def test_policy_update_timer(self):
        mock_path = 'fake'
        mock_brain_name = 'fake'
        fake_buffer_length = 350
        fake_mean_return = 0.3
        trainer_metrics = TrainerMetrics(path=mock_path,
                                         brain_name=mock_brain_name)
        trainer_metrics.start_experience_collection_timer()
        trainer_metrics.end_experience_collection_timer()
        trainer_metrics.start_policy_update_timer(number_experiences=fake_buffer_length,
                                                  mean_return=fake_mean_return)
        trainer_metrics.end_policy_update()
        fake_row = [mock_brain_name, 0, 0, 0, 350, '0.300']
        assert trainer_metrics.rows[0] == fake_row