
Use organized tags for tensorboard stats (#1248)

/develop-generalizationTraining-TrainerController
Committed by GitHub, 6 years ago
Commit 840417ff
5 changed files with 507 additions and 442 deletions
  1. docs/Using-Tensorboard.md (39 changes)
  2. docs/images/mlagents-TensorBoard.png (846 changes)
  3. ml-agents/mlagents/trainers/bc/trainer.py (17 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (37 changes)
  5. ml-agents/mlagents/trainers/trainer.py (10 changes)

docs/Using-Tensorboard.md (39 changes)


  ![Example TensorBoard Run](images/mlagents-TensorBoard.png)
- * Lesson - Plots the progress from lesson to lesson. Only interesting when
+ ### Environment Statistics
+ * `Environment/Lesson` - Plots the progress from lesson to lesson. Only interesting when
- * Cumulative Reward - The mean cumulative episode reward over all agents. Should
+ * `Environment/Cumulative Reward` - The mean cumulative episode reward over all agents. Should
+ * `Environment/Episode Length` - The mean length of each episode in the environment for all agents.
- * Entropy - How random the decisions of the model are. Should slowly decrease
+ ### Policy Statistics
+ * `Policy/Entropy` (PPO; BC) - How random the decisions of the model are. Should slowly decrease
- * Episode Length - The mean length of each episode in the environment for all
-   agents.
+ * `Policy/Learning Rate` (PPO; BC) - How large a step the training algorithm takes as it searches
+   for the optimal policy. Should decrease over time.
+ * `Policy/Value Estimate` (PPO) - The mean value estimate for all states visited by the agent. Should increase during a successful training session.
+ * `Policy/Curiosity Reward` (PPO+Curiosity) - This corresponds to the mean cumulative intrinsic reward generated per-episode.
- * Learning Rate - How large a step the training algorithm takes as it searches
-   for the optimal policy. Should decrease over time.
+ ### Learning Loss Functions
- * Policy Loss - The mean magnitude of policy loss function. Correlates to how
+ * `Losses/Policy Loss` (PPO) - The mean magnitude of policy loss function. Correlates to how
- * Value Estimate - The mean value estimate for all states visited by the agent.
-   Should increase during a successful training session.
- * Value Loss - The mean loss of the value function update. Correlates to how
+ * `Losses/Value Loss` (PPO) - The mean loss of the value function update. Correlates to how
- * _(Curiosity-Specific)_ Intrinsic Reward - This corresponds to the mean
-   cumulative intrinsic reward generated per-episode.
- * _(Curiosity-Specific)_ Forward Loss - The mean magnitude of the inverse model
+ * `Losses/Forward Loss` (PPO+Curiosity) - The mean magnitude of the inverse model
- * _(Curiosity-Specific)_ Inverse Loss - The mean magnitude of the forward model
+ * `Losses/Inverse Loss` (PPO+Curiosity) - The mean magnitude of the forward model
+ * `Losses/Cloning Loss` (BC) - The mean magnitude of the behavioral cloning loss. Corresponds to how well the model imitates the demonstration data.
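The slash in each tag is what drives the new layout: TensorBoard groups scalars into collapsible sections by the text before the slash, so everything logged as `Environment/...`, `Policy/...`, or `Losses/...` lands in its own panel. Below is a minimal sketch of writing such tags, assuming TensorFlow 1.x (the API ML-Agents used at the time); the log directory and the values are illustrative only.

```python
import tensorflow as tf

# Minimal sketch (TensorFlow 1.x style). The "Category/Name" tag is what makes
# TensorBoard group curves into sections such as Environment, Policy, Losses.
writer = tf.summary.FileWriter("./summaries/demo_run")  # illustrative path

summary = tf.Summary()
summary.value.add(tag='Environment/Cumulative Reward', simple_value=1.5)
summary.value.add(tag='Policy/Entropy', simple_value=2.3)
writer.add_summary(summary, global_step=1000)
writer.flush()
```

Running this and pointing `tensorboard --logdir ./summaries` at the output directory shows the two scalars nested under separate Environment and Policy sections.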

docs/images/mlagents-TensorBoard.png (846 changes)

Binary image updated (before/after preview). Width: 1050 | Height: 994 | Size: 158 KiB

ml-agents/mlagents/trainers/bc/trainer.py (17 changes)


  from mlagents.envs import AllBrainInfo
  from mlagents.trainers.bc.policy import BCPolicy
  from mlagents.trainers.buffer import Buffer
- from mlagents.trainers.trainer import UnityTrainerException, Trainer
+ from mlagents.trainers.trainer import Trainer
  logger = logging.getLogger("mlagents.trainers")

  self.n_sequences = 1
  self.cumulative_rewards = {}
  self.episode_steps = {}
- self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
+ self.stats = {'Losses/Cloning Loss': [], 'Environment/Episode Length': [],
+ 'Environment/Cumulative Reward': []}
  self.summary_path = trainer_parameters['summary_path']
  self.batches_per_epoch = trainer_parameters['batches_per_epoch']

  Returns the last reward the trainer has had
  :return: the new last reward
  """
- if len(self.stats['cumulative_reward']) > 0:
- return np.mean(self.stats['cumulative_reward'])
+ if len(self.stats['Environment/Cumulative Reward']) > 0:
+ return np.mean(self.stats['Environment/Cumulative Reward'])
  else:
  return 0

  for l in range(len(info_student.agents)):
  if info_student.local_done[l]:
  agent_id = info_student.agents[l]
- self.stats['cumulative_reward'].append(
+ self.stats['Environment/Cumulative Reward'].append(
- self.stats['episode_length'].append(
+ self.stats['Environment/Episode Length'].append(
  self.episode_steps.get(agent_id, 0))
  self.cumulative_rewards[agent_id] = 0
  self.episode_steps[agent_id] = 0

  loss = run_out['policy_loss']
  batch_losses.append(loss)
  if len(batch_losses) > 0:
- self.stats['losses'].append(np.mean(batch_losses))
+ self.stats['Losses/Cloning Loss'].append(np.mean(batch_losses))
- self.stats['losses'].append(0)
+ self.stats['Losses/Cloning Loss'].append(0)
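For orientation, here is a condensed sketch of the BC trainer's bookkeeping with the renamed keys. The dictionary literal mirrors the diff above; the episode values and the `batch_losses` numbers are illustrative stand-ins for what the real trainer collects from its reward counters and from `run_out['policy_loss']`.

```python
import numpy as np

# Keys as renamed in the diff above; the values here are illustrative only.
stats = {'Losses/Cloning Loss': [],
         'Environment/Episode Length': [],
         'Environment/Cumulative Reward': []}

# End of an episode: record reward and length under the organized tags.
stats['Environment/Cumulative Reward'].append(0.8)
stats['Environment/Episode Length'].append(120)

# End of an update: record the mean cloning loss (0 if no batches ran).
batch_losses = [0.42, 0.39, 0.37]
if len(batch_losses) > 0:
    stats['Losses/Cloning Loss'].append(np.mean(batch_losses))
else:
    stats['Losses/Cloning Loss'].append(0)

# get_last_reward() in the diff reduces to a mean over the reward list.
rewards = stats['Environment/Cumulative Reward']
print(np.mean(rewards) if rewards else 0)
```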

ml-agents/mlagents/trainers/ppo/trainer.py (37 changes)


  from mlagents.envs import AllBrainInfo, BrainInfo
  from mlagents.trainers.buffer import Buffer
  from mlagents.trainers.ppo.policy import PPOPolicy
- from mlagents.trainers.trainer import UnityTrainerException, Trainer
+ from mlagents.trainers.trainer import Trainer
  logger = logging.getLogger("mlagents.trainers")

  self.policy = PPOPolicy(seed, brain, trainer_parameters,
  self.is_training, load)
- stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
- 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
+ stats = {'Environment/Cumulative Reward': [], 'Environment/Episode Length': [],
+ 'Policy/Value Estimate': [], 'Policy/Entropy': [], 'Losses/Value Loss': [],
+ 'Losses/Policy Loss': [], 'Policy/Learning Rate': []}
- stats['forward_loss'] = []
- stats['inverse_loss'] = []
- stats['intrinsic_reward'] = []
+ stats['Losses/Forward Loss'] = []
+ stats['Losses/Inverse Loss'] = []
+ stats['Policy/Curiosity Reward'] = []
  self.intrinsic_rewards = {}
  self.stats = stats

  """
  Increment the step count of the trainer and Updates the last reward
  """
- if len(self.stats['cumulative_reward']) > 0:
- mean_reward = np.mean(self.stats['cumulative_reward'])
+ if len(self.stats['Environment/Cumulative Reward']) > 0:
+ mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
  self.policy.update_reward(mean_reward)
  self.policy.increment_step()
  self.step = self.policy.get_current_step()

  return [], [], [], None, None
  run_out = self.policy.evaluate(curr_brain_info)
- self.stats['value_estimate'].append(run_out['value'].mean())
- self.stats['entropy'].append(run_out['entropy'].mean())
- self.stats['learning_rate'].append(run_out['learning_rate'])
+ self.stats['Policy/Value Estimate'].append(run_out['value'].mean())
+ self.stats['Policy/Entropy'].append(run_out['entropy'].mean())
+ self.stats['Policy/Learning Rate'].append(run_out['learning_rate'])
  if self.policy.use_recurrent:
  return run_out['action'], run_out['memory_out'], None, \
  run_out['value'], run_out

  self.training_buffer[agent_id].reset_agent()
  if info.local_done[l]:
- self.stats['cumulative_reward'].append(
+ self.stats['Environment/Cumulative Reward'].append(
- self.stats['episode_length'].append(
+ self.stats['Environment/Episode Length'].append(
- self.stats['intrinsic_reward'].append(
+ self.stats['Policy/Curiosity Reward'].append(
  self.intrinsic_rewards.get(agent_id, 0))
  self.intrinsic_rewards[agent_id] = 0

  if self.use_curiosity:
  inverse_total.append(run_out['inverse_loss'])
  forward_total.append(run_out['forward_loss'])
- self.stats['value_loss'].append(np.mean(value_total))
- self.stats['policy_loss'].append(np.mean(policy_total))
+ self.stats['Losses/Value Loss'].append(np.mean(value_total))
+ self.stats['Losses/Policy Loss'].append(np.mean(policy_total))
- self.stats['forward_loss'].append(np.mean(forward_total))
- self.stats['inverse_loss'].append(np.mean(inverse_total))
+ self.stats['Losses/Forward Loss'].append(np.mean(forward_total))
+ self.stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
  self.training_buffer.reset_update_buffer()
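The same pattern applies on the PPO side. A short sketch of the per-update loss bookkeeping under the reorganized keys, where the `*_total` lists and the `use_curiosity` flag are illustrative placeholders for the per-minibatch values the real trainer accumulates:

```python
import numpy as np

# Illustrative per-minibatch losses; the real trainer fills these from run_out.
use_curiosity = True
value_total, policy_total = [0.90, 0.85], [0.05, 0.04]
forward_total, inverse_total = [0.30, 0.28], [0.60, 0.55]

stats = {'Losses/Value Loss': [], 'Losses/Policy Loss': [],
         'Losses/Forward Loss': [], 'Losses/Inverse Loss': []}

# Record the mean of each loss for this update under its organized tag.
stats['Losses/Value Loss'].append(np.mean(value_total))
stats['Losses/Policy Loss'].append(np.mean(policy_total))
if use_curiosity:
    stats['Losses/Forward Loss'].append(np.mean(forward_total))
    stats['Losses/Inverse Loss'].append(np.mean(inverse_total))
```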

ml-agents/mlagents/trainers/trainer.py (10 changes)


"""
if global_step % self.trainer_parameters['summary_freq'] == 0 and global_step != 0:
is_training = "Training." if self.is_training and self.get_step <= self.get_max_steps else "Not Training."
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
if len(self.stats['Environment/Cumulative Reward']) > 0:
mean_reward = np.mean(self.stats['Environment/Cumulative Reward'])
mean_reward, np.std(self.stats['cumulative_reward']),
mean_reward, np.std(self.stats['Environment/Cumulative Reward']),
is_training))
else:
logger.info(" {}: {}: Step: {}. No episode was completed since last summary. {}"

if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
summary.value.add(tag='Info/{}'.format(key), simple_value=stat_mean)
summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
summary.value.add(tag='Info/Lesson', simple_value=lesson_num)
summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
self.summary_writer.add_summary(summary, self.get_step)
self.summary_writer.flush()
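Tying the renames together, here is a hedged sketch of the summary-writing step after this change, simplified from the `write_summary` logic shown above (the standalone function signature is an assumption for illustration, not the trainer's actual API). Because each stats key is already a fully qualified `Category/Name` tag, it is written as-is rather than being prefixed with `Info/`.

```python
import numpy as np
import tensorflow as tf

def write_summary_sketch(summary_writer, stats, step, lesson_num=0):
    """Simplified stand-in for the trainer's summary-writing step."""
    summary = tf.Summary()
    for key in stats:
        if len(stats[key]) > 0:
            stat_mean = float(np.mean(stats[key]))
            # The key itself ("Environment/...", "Policy/...", "Losses/...")
            # is the TensorBoard tag; no 'Info/' prefix is added any more.
            summary.value.add(tag='{}'.format(key), simple_value=stat_mean)
    summary.value.add(tag='Environment/Lesson', simple_value=lesson_num)
    summary_writer.add_summary(summary, step)
    summary_writer.flush()
```

One side effect of the retagging: runs logged before this change keep their `Info/...` tags, so their curves appear in a different TensorBoard section than runs logged afterwards.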
