
Refactor Trainers to use Policy (#1098)

/develop-generalizationTraining-TrainerController
GitHub, 6 years ago
Current commit: fbf92810
21 files changed, with 864 additions and 546 deletions
  1. MLAgentsSDK/ProjectSettings/EditorBuildSettings.asset (1 change)
  2. MLAgentsSDK/ProjectSettings/ProjectVersion.txt (2 changes)
  3. MLAgentsSDK/ProjectSettings/UnityConnectSettings.asset (4 changes)
  4. ml-agents/mlagents/trainers/__init__.py (3 changes)
  5. ml-agents/mlagents/trainers/bc/__init__.py (1 change)
  6. ml-agents/mlagents/trainers/bc/models.py (94 changes)
  7. ml-agents/mlagents/trainers/bc/trainer.py (143 changes)
  8. ml-agents/mlagents/trainers/buffer.py (13 changes)
  9. ml-agents/mlagents/trainers/models.py (147 changes)
  10. ml-agents/mlagents/trainers/ppo/__init__.py (1 change)
  11. ml-agents/mlagents/trainers/ppo/models.py (57 changes)
  12. ml-agents/mlagents/trainers/ppo/trainer.py (303 changes)
  13. ml-agents/mlagents/trainers/trainer.py (28 changes)
  14. ml-agents/mlagents/trainers/trainer_controller.py (56 changes)
  15. ml-agents/tests/trainers/test_bc.py (50 changes)
  16. ml-agents/tests/trainers/test_buffer.py (23 changes)
  17. ml-agents/tests/trainers/test_ppo.py (52 changes)
  18. ml-agents/tests/trainers/test_trainer_controller.py (1 change)
  19. ml-agents/mlagents/trainers/bc/policy.py (87 changes)
  20. ml-agents/mlagents/trainers/policy.py (146 changes)
  21. ml-agents/mlagents/trainers/ppo/policy.py (198 changes)

MLAgentsSDK/ProjectSettings/EditorBuildSettings.asset (1 change)


m_ObjectHideFlags: 0
serializedVersion: 2
m_Scenes: []
m_configObjects: {}

MLAgentsSDK/ProjectSettings/ProjectVersion.txt (2 changes)


- m_EditorVersion: 2017.1.0f3
+ m_EditorVersion: 2017.1.5f1

MLAgentsSDK/ProjectSettings/UnityConnectSettings.asset (4 changes)


--- !u!310 &1
UnityConnectSettings:
m_ObjectHideFlags: 0
- m_Enabled: 0
+ m_Enabled: 1
m_TestMode: 0
m_TestEventUrl:
m_TestConfigUrl:

m_NativeEventUrl: https://perf-events.cloud.unity3d.com/symbolicate
m_Enabled: 0
m_CaptureEditorExceptions: 1
UnityPurchasingSettings:

m_Enabled: 0
m_InitializeOnStartup: 1
m_TestMode: 0
m_EnabledPlatforms: 4294967295
m_IosGameId:
m_AndroidGameId:
m_GameIds: {}

ml-agents/mlagents/trainers/__init__.py (3 changes)


from .trainer_controller import *
from .bc.models import *
from .bc.trainer import *
from .bc.policy import *
from .ppo.policy import *
from .policy import *
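
The trainers package now also re-exports the new policy modules (bc/policy.py, ppo/policy.py and policy.py from the file list above), so the concrete policy classes can be imported directly. A small illustration; the constructor argument lists are the ones used in the trainer diffs below, everything else is hypothetical:

from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.ppo.policy import PPOPolicy

# The refactored trainers below construct these as, e.g.:
#   self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
#   self.policy = PPOPolicy(seed, brain, trainer_parameters, sess, self.is_training)
# where brain, trainer_parameters and sess are supplied by the TrainerController.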

ml-agents/mlagents/trainers/bc/__init__.py (1 change)


from .models import *
from .trainer import *
from .policy import *

ml-agents/mlagents/trainers/bc/models.py (94 changes)


class BehavioralCloningModel(LearningModel):
def __init__(self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128,
normalize=False, use_recurrent=False):
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
normalize=False, use_recurrent=False, scope='PPO', seed=0):
num_streams = 1
hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
hidden = hidden_streams[0]
self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in,
self.sequence_length)
self.memory_out = tf.identity(self.memory_out, name='recurrent_out')
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, scope, seed)
if brain.vector_action_space_type == "discrete":
policy_branches = []
for size in self.a_size:
policy_branches.append(
tf.layers.dense(
hidden,
size,
activation=None,
use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
self.action_probs = tf.concat(
[tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs")
self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks")
self.sample_action_float, _ = self.create_discrete_action_masking_layer(
tf.concat(policy_branches, axis=1), self.action_masks, self.a_size)
self.sample_action_float = tf.identity(self.sample_action_float, name="action")
self.sample_action = tf.cast(self.sample_action_float, tf.int32)
self.true_action = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action")
self.action_oh = tf.concat([
tf.one_hot(self.true_action[:, i], self.a_size[i]) for i in range(len(self.a_size))], axis=1)
self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
self.action_percent = tf.reduce_mean(tf.cast(
tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action), tf.float32))
else:
self.policy = tf.layers.dense(hidden_reg, self.a_size[0], activation=None, use_bias=False, name='pre_action',
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
self.sample_action = tf.identity(self.clipped_sample_action, name="action")
self.true_action = tf.placeholder(shape=[None, self.a_size[0]], dtype=tf.float32, name="teacher_action")
self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
self.loss = tf.reduce_sum(tf.squared_difference(self.clipped_true_action, self.sample_action))
with tf.variable_scope(scope):
num_streams = 1
hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
hidden = hidden_streams[0]
self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=[], name="dropout_rate")
hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
hidden_reg, self.memory_out = self.create_recurrent_encoder(hidden_reg, self.memory_in,
self.sequence_length)
self.memory_out = tf.identity(self.memory_out, name='recurrent_out')
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
self.update = optimizer.minimize(self.loss)
if brain.vector_action_space_type == "discrete":
policy_branches = []
for size in self.act_size:
policy_branches.append(
tf.layers.dense(
hidden,
size,
activation=None,
use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01)))
self.action_probs = tf.concat(
[tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs")
self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks")
self.sample_action_float, _ = self.create_discrete_action_masking_layer(
tf.concat(policy_branches, axis = 1), self.action_masks, self.act_size)
self.sample_action_float = tf.identity(self.sample_action_float, name="action")
self.sample_action = tf.cast(self.sample_action_float, tf.int32)
self.true_action = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action")
self.action_oh = tf.concat([
tf.one_hot(self.true_action[:, i], self.act_size[i]) for i in range(len(self.act_size))], axis=1)
self.loss = tf.reduce_sum(-tf.log(self.action_probs + 1e-10) * self.action_oh)
self.action_percent = tf.reduce_mean(tf.cast(
tf.equal(tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action), tf.float32))
else:
self.policy = tf.layers.dense(hidden_reg, self.act_size[0], activation=None, use_bias=False, name='pre_action',
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
self.sample_action = tf.identity(self.clipped_sample_action, name="action")
self.true_action = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action")
self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
self.loss = tf.reduce_sum(tf.squared_difference(self.clipped_true_action, self.sample_action))
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
self.update = optimizer.minimize(self.loss)
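
The main structural change in this file is that BehavioralCloningModel now takes scope and seed arguments and builds all of its ops inside tf.variable_scope(scope), so several models can coexist in one TensorFlow graph without variable-name collisions. A minimal TF 1.x sketch of that pattern, using made-up sizes instead of the real BrainParameters plumbing:

import tensorflow as tf

def build_model(scope, seed, obs_size=8, act_size=2):
    # Mirrors the pattern introduced above: seed the graph, then create every
    # placeholder and variable under the model's own named scope.
    tf.set_random_seed(seed)
    with tf.variable_scope(scope):
        vector_in = tf.placeholder(tf.float32, [None, obs_size], name='vector_observation')
        hidden = tf.layers.dense(vector_in, 128, activation=tf.nn.relu)
        action = tf.layers.dense(hidden, act_size, name='pre_action')
    return vector_in, action

graph = tf.Graph()
with graph.as_default():
    # Two models share one graph because their variables live under different scopes.
    build_model('brain_a', seed=0)
    build_model('brain_b', seed=0)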

ml-agents/mlagents/trainers/bc/trainer.py (143 changes)


import tensorflow as tf
from mlagents.envs import AllBrainInfo
from mlagents.trainers.bc.models import BehavioralCloningModel
from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.trainer import UnityTrainerException, Trainer

class BehavioralCloningTrainer(Trainer):
"""The ImitationTrainer is an implementation of the imitation learning."""
def __init__(self, sess, env, brain_name, trainer_parameters, training, seed, run_id):
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
:param env: The UnityEnvironment.
super(BehavioralCloningTrainer, self).__init__(sess, brain, trainer_parameters, training, run_id)
self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent', 'hidden_units',
'num_layers', 'sequence_length', 'memory_size']

raise UnityTrainerException("The hyperparameter {0} could not be found for the Imitation trainer of "
"brain {1}.".format(k, brain_name))
super(BehavioralCloningTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training, run_id)
"brain {1}.".format(k, brain.brain_name))
self.variable_scope = trainer_parameters['graph_scope']
self.policy = BCPolicy(seed, brain, trainer_parameters, sess)
self.brain_name = brain.brain_name
self.use_recurrent = trainer_parameters['use_recurrent']
self.sequence_length = 1
self.m_size = None
if self.use_recurrent:
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
self.n_sequences = max(int(trainer_parameters['batch_size'] / self.sequence_length), 1)
self.n_sequences = max(int(trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
self.is_continuous_action = (env.brains[brain_name].vector_action_space_type == "continuous")
self.use_visual_observations = (env.brains[brain_name].number_visual_observations > 0)
self.use_vector_observations = (env.brains[brain_name].vector_observation_space_size > 0)
with tf.variable_scope(self.variable_scope):
tf.set_random_seed(seed)
self.model = BehavioralCloningModel(
h_size=int(trainer_parameters['hidden_units']),
lr=float(trainer_parameters['learning_rate']),
n_layers=int(trainer_parameters['num_layers']),
m_size=self.m_size,
normalize=False,
use_recurrent=trainer_parameters['use_recurrent'],
brain=self.brain)
self.inference_run_list = [self.model.sample_action]
if self.use_recurrent:
self.inference_run_list += [self.model.memory_out]
return '''Hyperparameters for the Imitation Trainer of brain {0}: \n{1}'''.format(
self.brain_name, '\n'.join(['\t{0}:\t{1}'.format(x, self.trainer_parameters[x]) for x in self.param_keys]))

return self.trainer_parameters
@property
def graph_scope(self):
"""
Returns the graph scope of the trainer.
"""
return self.variable_scope
@property
def get_max_steps(self):
"""
Returns the maximum number of steps. Is used to know when the trainer should be stopped.

Returns the number of steps the trainer has performed
:return: the step count of the trainer
"""
return self.sess.run(self.model.global_step)
return self.policy.get_current_step()
@property
def get_last_reward(self):

"""
Increment the step count of the trainer and Updates the last reward
"""
self.sess.run(self.model.increment_step)
self.policy.increment_step()
Decides actions given state/observation information, and takes them in environment.
Decides actions using policy given current brain info.
:param all_brain_info: AllBrainInfo from environment.
:return: a tuple containing action, memories, values and an object
to be passed to add experiences

agent_brain = all_brain_info[self.brain_name]
feed_dict = {self.model.dropout_rate: 1.0, self.model.sequence_length: 1}
if self.use_visual_observations:
for i, _ in enumerate(agent_brain.visual_observations):
feed_dict[self.model.visual_in[i]] = agent_brain.visual_observations[i]
if self.use_vector_observations:
feed_dict[self.model.vector_in] = agent_brain.vector_observations
if not self.is_continuous_action:
feed_dict[self.model.action_masks] = agent_brain.action_masks
if self.use_recurrent:
if agent_brain.memories.shape[1] == 0:
agent_brain.memories = np.zeros((len(agent_brain.agents), self.m_size))
feed_dict[self.model.memory_in] = agent_brain.memories
agent_action, memories = self.sess.run(self.inference_run_list, feed_dict)
return agent_action, memories, None, None, None
run_out = self.policy.evaluate(agent_brain)
if self.policy.use_recurrent:
return run_out['action'], run_out['memory_out'], None, None, None
agent_action = self.sess.run(self.inference_run_list, feed_dict)
return agent_action, None, None, None, None
return run_out['action'], None, None, None, None
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs):
"""

:param take_action_outputs: The outputs of the take action method.
"""
# Used to collect teacher experience into training buffer
info_teacher = curr_info[self.brain_to_imitate]
next_info_teacher = next_info[self.brain_to_imitate]

info_teacher_record, next_info_teacher_record = "true", "true"
if info_teacher_record == "true" and next_info_teacher_record == "true":
if not stored_info_teacher.local_done[idx]:
if self.use_visual_observations:
for i, _ in enumerate(stored_info_teacher.visual_observations):
self.training_buffer[agent_id]['visual_observations%d' % i]\
.append(stored_info_teacher.visual_observations[i][idx])
if self.use_vector_observations:
self.training_buffer[agent_id]['vector_observations']\
for i in range(self.policy.vis_obs_size):
self.training_buffer[agent_id]['visual_obs%d' % i]\
.append(stored_info_teacher.visual_observations[i][idx])
if self.policy.use_vec_obs:
self.training_buffer[agent_id]['vector_obs']\
if self.use_recurrent:
if self.policy.use_recurrent:
stored_info_teacher.memories = np.zeros((len(stored_info_teacher.agents), self.m_size))
stored_info_teacher.memories = np.zeros((len(stored_info_teacher.agents),
self.policy.m_size))
self.training_buffer[agent_id]['memory'].append(stored_info_teacher.memories[idx])
self.training_buffer[agent_id]['actions'].append(next_info_teacher.
previous_vector_actions[next_idx])

"""
info_teacher = next_info[self.brain_to_imitate]
for l in range(len(info_teacher.agents)):
if ((info_teacher.local_done[l] or
len(self.training_buffer[info_teacher.agents[l]]['actions']) > self.trainer_parameters[
'time_horizon'])
and len(self.training_buffer[info_teacher.agents[l]]['actions']) > 0):
teacher_action_list = len(self.training_buffer[info_teacher.agents[l]]['actions'])
horizon_reached = teacher_action_list > self.trainer_parameters['time_horizon']
teacher_filled = len(self.training_buffer[info_teacher.agents[l]]['actions']) > 0
if ((info_teacher.local_done[l] or horizon_reached) and teacher_filled):
self.training_buffer.append_update_buffer(agent_id, batch_size=None,
training_length=self.sequence_length)
self.training_buffer.append_update_buffer(
agent_id, batch_size=None, training_length=self.policy.sequence_length)
self.training_buffer[agent_id].reset_agent()
info_student = next_info[self.brain_name]

"""
return len(self.training_buffer.update_buffer['actions']) > self.n_sequences
def update_model(self):
def update_policy(self):
Uses training_buffer to update model.
Updates the policy.
for j in range(
min(len(self.training_buffer.update_buffer['actions']) // self.n_sequences, self.batches_per_epoch)):
_buffer = self.training_buffer.update_buffer
start = j * self.n_sequences
end = (j + 1) * self.n_sequences
feed_dict = {self.model.dropout_rate: 0.5,
self.model.batch_size: self.n_sequences,
self.model.sequence_length: self.sequence_length}
if self.is_continuous_action:
feed_dict[self.model.true_action] = np.array(_buffer['actions'][start:end]).\
reshape([-1, self.brain.vector_action_space_size[0]])
else:
feed_dict[self.model.true_action] = np.array(_buffer['actions'][start:end]).reshape(
[-1, len(self.brain.vector_action_space_size)])
if self.use_vector_observations:
feed_dict[self.model.vector_in] = np.array(_buffer['vector_observations'][start:end])\
.reshape([-1, self.brain.vector_observation_space_size * self.brain.num_stacked_vector_observations])
if self.use_visual_observations:
for i, _ in enumerate(self.model.visual_in):
_obs = np.array(_buffer['visual_observations%d' % i][start:end])
feed_dict[self.model.visual_in[i]] = _obs
if not self.is_continuous_action:
feed_dict[self.model.action_masks] = np.ones(
(self.n_sequences, sum(self.brain.vector_action_space_size)))
if self.use_recurrent:
feed_dict[self.model.memory_in] = np.zeros([self.n_sequences, self.m_size])
loss, _ = self.sess.run([self.model.loss, self.model.update], feed_dict=feed_dict)
num_batches = min(len(self.training_buffer.update_buffer['actions']) //
self.n_sequences, self.batches_per_epoch)
for i in range(num_batches):
buffer = self.training_buffer.update_buffer
start = i * self.n_sequences
end = (i + 1) * self.n_sequences
mini_batch = buffer.make_mini_batch(start, end)
run_out = self.policy.update(mini_batch, self.n_sequences)
loss = run_out['policy_loss']
batch_losses.append(loss)
if len(batch_losses) > 0:
self.stats['losses'].append(np.mean(batch_losses))
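
After this change the BC trainer no longer assembles TensorFlow feed dicts itself: take_action() forwards the BrainInfo to BCPolicy.evaluate() and reads run_out['action'] (plus run_out['memory_out'] when recurrent), and update_policy() slices the update buffer and hands each mini-batch to BCPolicy.update(). A schematic of that control flow with a stub standing in for the real policy (shapes and values are made up):

import numpy as np

class StubBCPolicy:
    """Stand-in for BCPolicy; only the call shapes the trainer uses are modelled."""
    use_recurrent = False     # attributes the trainer reads in the diff above
    sequence_length = 1

    def evaluate(self, brain_info):
        # Returns a run_out dict keyed by output name, as in the diff above.
        return {'action': np.zeros((len(brain_info['agents']), 2))}

    def update(self, mini_batch, n_sequences):
        return {'policy_loss': float(np.random.rand())}

policy = StubBCPolicy()
brain_info = {'agents': ['agent-0', 'agent-1']}

# take_action path: the trainer just forwards the brain info to the policy.
action = policy.evaluate(brain_info)['action']

# update_policy path: slice the update buffer and aggregate the reported loss.
update_buffer = {'actions': [[0], [1], [0], [1]]}
n_sequences, batch_losses = 2, []
for i in range(len(update_buffer['actions']) // n_sequences):
    start, end = i * n_sequences, (i + 1) * n_sequences
    mini_batch = {k: np.array(v[start:end]) for k, v in update_buffer.items()}
    batch_losses.append(policy.update(mini_batch, n_sequences)['policy_loss'])
print(np.mean(batch_losses))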

ml-agents/mlagents/trainers/buffer.py (13 changes)


def shuffle(self, key_list=None):
"""
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.
"""

np.random.shuffle(s)
for key in key_list:
self[key][:] = [self[key][i] for i in s]
def make_mini_batch(self, start, end):
"""
Creates a mini-batch from buffer.
:param start: Starting index of buffer.
:param end: Ending index of buffer.
:return: Dict of mini batch.
"""
mini_batch = {}
for key in self:
mini_batch[key] = np.array(self[key][start:end])
return mini_batch
def __init__(self):
self.update_buffer = self.AgentBuffer()
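
The new make_mini_batch(start, end) shown above simply slices every stored field into a dict of numpy arrays, which is what the refactored trainers feed to policy.update(). A self-contained illustration of the same behaviour on a plain dict:

import numpy as np

def make_mini_batch(buf, start, end):
    # Same result shape as AgentBuffer.make_mini_batch in the diff above.
    return {key: np.array(buf[key][start:end]) for key in buf}

buf = {'actions': [[0], [1], [2], [3]], 'rewards': [0.0, 1.0, 0.5, 0.2]}
mini_batch = make_mini_batch(buf, 1, 3)
print(mini_batch['actions'].shape)   # (2, 1)
print(mini_batch['rewards'])         # [1.  0.5]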

ml-agents/mlagents/trainers/models.py (147 changes)


class LearningModel(object):
def __init__(self, m_size, normalize, use_recurrent, brain):
self.brain = brain
self.vector_in = None
self.normalize = False
self.use_recurrent = False
self.global_step, self.increment_step = self.create_global_steps()
self.visual_in = []
self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name='masks')
self.mask = tf.cast(self.mask_input, tf.int32)
self.m_size = m_size
self.normalize = normalize
self.use_recurrent = use_recurrent
self.a_size = brain.vector_action_space_size
self.o_size = brain.vector_observation_space_size * brain.num_stacked_vector_observations
self.v_size = brain.number_visual_observations
def __init__(self, m_size, normalize, use_recurrent, brain, scope, seed):
tf.set_random_seed(seed)
with tf.variable_scope(scope):
self.brain = brain
self.vector_in = None
self.global_step, self.increment_step = self.create_global_steps()
self.visual_in = []
self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name='masks')
self.mask = tf.cast(self.mask_input, tf.int32)
self.m_size = m_size
self.normalize = normalize
self.use_recurrent = use_recurrent
self.act_size = brain.vector_action_space_size
self.vec_obs_size = brain.vector_observation_space_size * \
brain.num_stacked_vector_observations
self.vis_obs_size = brain.number_visual_observations
@staticmethod
def create_global_steps():

else:
c_channels = 3
visual_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32, name=name)
visual_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
name=name)
return visual_in
def create_vector_input(self, name='vector_observation'):

:param o_size: Size of stacked vector observation.
:param vec_obs_size: Size of stacked vector observation.
self.vector_in = tf.placeholder(shape=[None, self.o_size], dtype=tf.float32, name=name)
self.vector_in = tf.placeholder(shape=[None, self.vec_obs_size], dtype=tf.float32,
name=name)
self.running_mean = tf.get_variable("running_mean", [self.o_size], trainable=False, dtype=tf.float32,
self.running_mean = tf.get_variable("running_mean", [self.vec_obs_size],
trainable=False, dtype=tf.float32,
self.running_variance = tf.get_variable("running_variance", [self.o_size], trainable=False,
dtype=tf.float32, initializer=tf.ones_initializer())
self.running_variance = tf.get_variable("running_variance", [self.vec_obs_size],
trainable=False,
dtype=tf.float32,
initializer=tf.ones_initializer())
self.update_mean, self.update_variance = self.create_normalizer_update(self.vector_in)
self.normalized_state = tf.clip_by_value((self.vector_in - self.running_mean) / tf.sqrt(

def create_normalizer_update(self, vector_input):
mean_current_observation = tf.reduce_mean(vector_input, axis=0)
new_mean = self.running_mean + (mean_current_observation - self.running_mean) / \
tf.cast(self.global_step + 1, tf.float32)
tf.cast(tf.add(self.global_step, 1), tf.float32)
new_variance = self.running_variance + (mean_current_observation - new_mean) * \
(mean_current_observation - self.running_mean)
update_mean = tf.assign(self.running_mean, new_mean)

@staticmethod
def create_vector_observation_encoder(observation_input, h_size, activation, num_layers, scope, reuse):
def create_vector_observation_encoder(observation_input, h_size, activation, num_layers, scope,
reuse):
"""
Builds a set of hidden state encoders.
:param reuse: Whether to re-use the weights within the same scope.

with tf.variable_scope(scope):
hidden = observation_input
for i in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, activation=activation, reuse=reuse, name="hidden_{}".format(i),
kernel_initializer=c_layers.variance_scaling_initializer(1.0))
hidden = tf.layers.dense(hidden, h_size, activation=activation, reuse=reuse,
name="hidden_{}".format(i),
kernel_initializer=c_layers.variance_scaling_initializer(
1.0))
def create_visual_observation_encoder(self, image_input, h_size, activation, num_layers, scope, reuse):
def create_visual_observation_encoder(self, image_input, h_size, activation, num_layers, scope,
reuse):
"""
Builds a set of visual (CNN) encoders.
:param reuse: Whether to re-use the weights within the same scope.

activation=tf.nn.elu, reuse=reuse, name="conv_2")
hidden = c_layers.flatten(conv2)
with tf.variable_scope(scope+'/'+'flat_encoding'):
with tf.variable_scope(scope + '/' + 'flat_encoding'):
hidden_flat = self.create_vector_observation_encoder(hidden, h_size, activation,
num_layers, scope, reuse)
return hidden_flat

"""
action_idx = [0] + list(np.cumsum(action_size))
branches_logits = [all_logits[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
branch_masks = [action_masks[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
raw_probs = [tf.multiply(tf.nn.softmax(branches_logits[k]), branch_masks[k]) + (1-branch_masks[k])*1.0e-10
for k in range(len(action_size))]

self.visual_in = []
for i in range(brain.number_visual_observations):
visual_input = self.create_visual_input(brain.camera_resolutions[i], name="visual_observation_" + str(i))
visual_input = self.create_visual_input(brain.camera_resolutions[i],
name="visual_observation_" + str(i))
self.visual_in.append(visual_input)
vector_observation_input = self.create_vector_input()

hidden_state, hidden_visual = None, None
if self.v_size > 0:
if self.vis_obs_size > 0:
encoded_visual = self.create_visual_observation_encoder(self.visual_in[j], h_size,
activation_fn, num_layers,
encoded_visual = self.create_visual_observation_encoder(self.visual_in[j],
h_size,
activation_fn,
num_layers,
"main_graph_{}_encoder{}"
.format(i, j), False)
visual_encoders.append(encoded_visual)

h_size, activation_fn, num_layers,
"main_graph_{}".format(i), False)
h_size, activation_fn,
num_layers,
"main_graph_{}".format(i),
False)
if hidden_state is not None and hidden_visual is not None:
final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
elif hidden_state is None and hidden_visual is not None:

_half_point = int(m_size / 2)
with tf.variable_scope(name):
rnn_cell = tf.contrib.rnn.BasicLSTMCell(_half_point)
lstm_vector_in = tf.contrib.rnn.LSTMStateTuple(memory_in[:, :_half_point], memory_in[:, _half_point:])
lstm_vector_in = tf.contrib.rnn.LSTMStateTuple(memory_in[:, :_half_point],
memory_in[:, _half_point:])
recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(rnn_cell, lstm_input_state,
initial_state=lstm_vector_in)

if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
name='recurrent_in')
hidden_streams[0], self.memory_in[:, :_half_point], self.sequence_length, name='lstm_policy')
hidden_streams[0], self.memory_in[:, :_half_point], self.sequence_length,
name='lstm_policy')
hidden_streams[1], self.memory_in[:, _half_point:], self.sequence_length, name='lstm_value')
self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1, name='recurrent_out')
hidden_streams[1], self.memory_in[:, _half_point:], self.sequence_length,
name='lstm_value')
self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1,
name='recurrent_out')
mu = tf.layers.dense(hidden_policy, self.a_size[0], activation=None,
mu = tf.layers.dense(hidden_policy, self.act_size[0], activation=None,
log_sigma_sq = tf.get_variable("log_sigma_squared", [self.a_size[0]], dtype=tf.float32,
log_sigma_sq = tf.get_variable("log_sigma_squared", [self.act_size[0]], dtype=tf.float32,
initializer=tf.zeros_initializer())
sigma_sq = tf.exp(log_sigma_sq)

value = tf.layers.dense(hidden_value, 1, activation=None)
self.value = tf.identity(value, name="value_estimate")
self.all_old_log_probs = tf.placeholder(shape=[None, self.a_size[0]], dtype=tf.float32,
self.all_old_log_probs = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32,
self.old_log_probs = tf.reduce_sum((tf.identity(self.all_old_log_probs)), axis=1, keepdims=True)
self.old_log_probs = tf.reduce_sum((tf.identity(self.all_old_log_probs)), axis=1,
keepdims=True)
def create_dc_actor_critic(self, h_size, num_layers):
"""

if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.prev_action = tf.placeholder(shape=[None, len(self.a_size)], dtype=tf.int32, name='prev_action')
self.prev_action = tf.placeholder(shape=[None, len(self.act_size)], dtype=tf.int32,
name='prev_action')
tf.one_hot(self.prev_action[:, i], self.a_size[i]) for i in range(len(self.a_size))], axis=1)
tf.one_hot(self.prev_action[:, i], self.act_size[i]) for i in
range(len(self.act_size))], axis=1)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
hidden, memory_out = self.create_recurrent_encoder(hidden, self.memory_in, self.sequence_length)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32,
name='recurrent_in')
hidden, memory_out = self.create_recurrent_encoder(hidden, self.memory_in,
self.sequence_length)
for size in self.a_size:
for size in self.act_size:
self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks")
self.action_masks = tf.placeholder(shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks")
self.all_log_probs, self.action_masks, self.a_size)
self.all_log_probs, self.action_masks, self.act_size)
self.output = tf.identity(output, name="action")

self.action_holder = tf.placeholder(
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder")
self.selected_actions = tf.concat([
tf.one_hot(self.action_holder[:, i], self.a_size[i]) for i in range(len(self.a_size))], axis=1)
tf.one_hot(self.action_holder[:, i], self.act_size[i]) for i in range(len(self.act_size))], axis=1)
shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities')
shape=[None, sum(self.act_size)], dtype=tf.float32, name='old_probabilities')
self.all_old_log_probs, self.action_masks, self.a_size)
self.all_old_log_probs, self.action_masks, self.act_size)
action_idx = [0] + list(np.cumsum(self.a_size))
action_idx = [0] + list(np.cumsum(self.act_size))
labels=tf.nn.softmax(self.all_log_probs[:, action_idx[i]:action_idx[i + 1]]),
logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]])
for i in range(len(self.a_size))], axis=1)), axis=1)
labels=tf.nn.softmax(self.all_log_probs[:, action_idx[i]:action_idx[i + 1]]),
logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]])
for i in range(len(self.act_size))], axis=1)), axis=1)
self.log_probs = tf.reduce_sum((tf.stack([
-tf.nn.softmax_cross_entropy_with_logits_v2(

for i in range(len(self.a_size))], axis=1)), axis=1, keepdims=True)
for i in range(len(self.act_size))], axis=1)), axis=1, keepdims=True)
for i in range(len(self.a_size))], axis=1)), axis=1, keepdims=True)
for i in range(len(self.act_size))], axis=1)), axis=1, keepdims=True)
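
Besides the a_size/o_size/v_size renames to act_size/vec_obs_size/vis_obs_size and the scope/seed constructor, this file's hunks also touch create_normalizer_update, which keeps a running mean and variance of batch means for observation normalization. The arithmetic spelled out in plain numpy (illustrative only; one update per global step, shapes reduced):

import numpy as np

def normalizer_update(running_mean, running_variance, batch, step):
    # Same arithmetic as create_normalizer_update above, with numpy in place of TF ops.
    mean_obs = batch.mean(axis=0)
    new_mean = running_mean + (mean_obs - running_mean) / (step + 1)
    new_variance = running_variance + (mean_obs - new_mean) * (mean_obs - running_mean)
    return new_mean, new_variance

mean, var = np.zeros(3), np.ones(3)
mean, var = normalizer_update(mean, var, np.random.randn(32, 3), step=0)
print(mean.shape, var.shape)   # (3,) (3,)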

ml-agents/mlagents/trainers/ppo/__init__.py (1 change)


from .models import *
from .trainer import *
from .policy import *

ml-agents/mlagents/trainers/ppo/models.py (57 changes)


class PPOModel(LearningModel):
def __init__(self, brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
normalize=False, use_recurrent=False, num_layers=2, m_size=None, use_curiosity=False,
curiosity_strength=0.01, curiosity_enc_size=128):
curiosity_strength=0.01, curiosity_enc_size=128, scope='Model', seed=0):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.

:param num_layers Number of hidden layers between encoded input and policy & value layers
:param m_size: Size of brain memory.
"""
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain)
self.use_curiosity = use_curiosity
if num_layers < 1:
num_layers = 1
self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
if brain.vector_action_space_type == "continuous":
self.create_cc_actor_critic(h_size, num_layers)
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
else:
self.create_dc_actor_critic(h_size, num_layers)
if self.use_curiosity:
self.curiosity_enc_size = curiosity_enc_size
self.curiosity_strength = curiosity_strength
encoded_state, encoded_next_state = self.create_curiosity_encoders()
self.create_inverse_model(encoded_state, encoded_next_state)
self.create_forward_model(encoded_state, encoded_next_state)
self.create_ppo_optimizer(self.log_probs, self.old_log_probs, self.value,
self.entropy, beta, epsilon, lr, max_step)
LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, scope, seed)
with tf.variable_scope(scope):
self.use_curiosity = use_curiosity
if num_layers < 1:
num_layers = 1
self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
if brain.vector_action_space_type == "continuous":
self.create_cc_actor_critic(h_size, num_layers)
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
else:
self.create_dc_actor_critic(h_size, num_layers)
if self.use_curiosity:
self.curiosity_enc_size = curiosity_enc_size
self.curiosity_strength = curiosity_strength
encoded_state, encoded_next_state = self.create_curiosity_encoders()
self.create_inverse_model(encoded_state, encoded_next_state)
self.create_forward_model(encoded_state, encoded_next_state)
self.create_ppo_optimizer(self.log_probs, self.old_log_probs, self.value,
self.entropy, beta, epsilon, lr, max_step)
@staticmethod
def create_reward_encoder():

encoded_state_list = []
encoded_next_state_list = []
if self.v_size > 0:
if self.vis_obs_size > 0:
for i in range(self.v_size):
for i in range(self.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = self.create_visual_input(self.brain.camera_resolutions[i],
name="next_visual_observation_" + str(i))

encoded_state_list.append(hidden_visual)
encoded_next_state_list.append(hidden_next_visual)
if self.o_size > 0:
if self.vec_obs_size > 0:
self.next_vector_in = tf.placeholder(shape=[None, self.o_size], dtype=tf.float32,
self.next_vector_in = tf.placeholder(shape=[None, self.vec_obs_size], dtype=tf.float32,
name='next_vector_observation')
encoded_vector_obs = self.create_vector_observation_encoder(self.vector_in,

combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=self.swish)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(hidden, self.a_size[0], activation=None)
pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
[tf.layers.dense(hidden, self.a_size[i], activation=tf.nn.softmax)
for i in range(len(self.a_size))], axis=1)
[tf.layers.dense(hidden, self.act_size[i], activation=tf.nn.softmax)
for i in range(len(self.act_size))], axis=1)
cross_entropy = tf.reduce_sum(-tf.log(pred_action + 1e-10) * self.selected_actions, axis=1)
self.inverse_loss = tf.reduce_mean(tf.dynamic_partition(cross_entropy, self.mask, 2)[1])

"""
combined_input = tf.concat([encoded_state, self.selected_actions], axis=1)
hidden = tf.layers.dense(combined_input, 256, activation=self.swish)
# We compare against the concatenation of all observation streams, hence `self.v_size + int(self.o_size > 0)`.
pred_next_state = tf.layers.dense(hidden, self.curiosity_enc_size * (self.v_size + int(self.o_size > 0)),
# We compare against the concatenation of all observation streams, hence `self.vis_obs_size + int(self.vec_obs_size > 0)`.
pred_next_state = tf.layers.dense(hidden, self.curiosity_enc_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
activation=None)
squared_difference = 0.5 * tf.reduce_sum(tf.squared_difference(pred_next_state, encoded_next_state), axis=1)
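
The comment above explains why the forward model's output width is curiosity_enc_size * (vis_obs_size + int(vec_obs_size > 0)): one encoding per visual stream plus one for the stacked vector observation, if any. A quick numeric check with hypothetical stream counts:

curiosity_enc_size = 128
vis_obs_size, vec_obs_size = 2, 8        # hypothetical brain: 2 cameras plus a vector observation
pred_width = curiosity_enc_size * (vis_obs_size + int(vec_obs_size > 0))
assert pred_width == 128 * 3             # one 128-wide encoding per observation stream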

ml-agents/mlagents/trainers/ppo/trainer.py (303 changes)


from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.trainer import UnityTrainerException, Trainer
logger = logging.getLogger("mlagents.envs")

"""The PPOTrainer is an implementation of the PPO algorithm."""
action_masking_name = 'action_masks'
def __init__(self, sess, env, brain_name, trainer_parameters, training, seed, run_id):
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
:param env: The UnityEnvironment.
super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id)
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',

for k in self.param_keys:
if k not in trainer_parameters:
raise UnityTrainerException("The hyperparameter {0} could not be found for the PPO trainer of "
"brain {1}.".format(k, brain_name))
"brain {1}.".format(k, brain.brain_name))
super(PPOTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training, run_id)
self.use_curiosity = bool(trainer_parameters['use_curiosity'])
self.use_recurrent = trainer_parameters["use_recurrent"]
self.use_curiosity = bool(trainer_parameters['use_curiosity'])
self.sequence_length = 1
self.has_updated = False
self.m_size = None
if self.use_recurrent:
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
if self.m_size == 0:
raise UnityTrainerException("The memory size for brain {0} is 0 even though the trainer uses recurrent."
.format(brain_name))
elif self.m_size % 4 != 0:
raise UnityTrainerException("The memory size for brain {0} is {1} but it must be divisible by 4."
.format(brain_name, self.m_size))
self.variable_scope = trainer_parameters['graph_scope']
with tf.variable_scope(self.variable_scope):
tf.set_random_seed(seed)
self.model = PPOModel(env.brains[brain_name],
lr=float(trainer_parameters['learning_rate']),
h_size=int(trainer_parameters['hidden_units']),
epsilon=float(trainer_parameters['epsilon']),
beta=float(trainer_parameters['beta']),
max_step=float(trainer_parameters['max_steps']),
normalize=trainer_parameters['normalize'],
use_recurrent=trainer_parameters['use_recurrent'],
num_layers=int(trainer_parameters['num_layers']),
m_size=self.m_size,
use_curiosity=bool(trainer_parameters['use_curiosity']),
curiosity_strength=float(trainer_parameters['curiosity_strength']),
curiosity_enc_size=float(trainer_parameters['curiosity_enc_size']))
self.policy = PPOPolicy(seed, brain, trainer_parameters,
sess, self.is_training)
stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}

self.training_buffer = Buffer()
self.cumulative_rewards = {}
self.episode_steps = {}
self.is_continuous_action = (env.brains[brain_name].vector_action_space_type == "continuous")
self.use_visual_obs = (env.brains[brain_name].number_visual_observations > 0)
self.use_vector_obs = (env.brains[brain_name].vector_observation_space_size > 0)
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)

self.inference_run_list = [self.model.output, self.model.all_log_probs, self.model.value,
self.model.entropy, self.model.learning_rate]
if self.is_continuous_action:
self.inference_run_list.append(self.model.output_pre)
if self.use_recurrent:
self.inference_run_list.extend([self.model.memory_out])
if self.is_training and self.use_vector_obs and self.trainer_parameters['normalize']:
self.inference_run_list.extend([self.model.update_mean, self.model.update_variance])
def __str__(self):
return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(
self.brain_name, '\n'.join(['\t{0}:\t{1}'.format(x, self.trainer_parameters[x]) for x in self.param_keys]))

return self.trainer_parameters
@property
def graph_scope(self):
"""
Returns the graph scope of the trainer.
"""
return self.variable_scope
@property
def get_max_steps(self):
"""
Returns the maximum number of steps. Is used to know when the trainer should be stopped.

"""
return self.step
@property
def get_last_reward(self):
"""
Returns the last reward the trainer has had
:return: the new last reward
"""
return self.sess.run(self.model.last_reward)
def increment_step_and_update_last_reward(self):
"""
Increment the step count of the trainer and Updates the last reward

self.sess.run([self.model.update_reward,
self.model.increment_step],
feed_dict={self.model.new_reward: mean_reward})
else:
self.sess.run(self.model.increment_step)
self.step = self.sess.run(self.model.global_step)
self.policy.update_reward(mean_reward)
self.policy.increment_step()
self.step = self.policy.get_current_step()
def take_action(self, all_brain_info: AllBrainInfo):
"""

if len(curr_brain_info.agents) == 0:
return [], [], [], None, None
feed_dict = {self.model.batch_size: len(curr_brain_info.vector_observations),
self.model.sequence_length: 1}
if self.use_recurrent:
if not self.is_continuous_action:
feed_dict[self.model.prev_action] = curr_brain_info.previous_vector_actions.reshape(
[-1, len(self.brain.vector_action_space_size)])
if curr_brain_info.memories.shape[1] == 0:
curr_brain_info.memories = np.zeros((len(curr_brain_info.agents), self.m_size))
feed_dict[self.model.memory_in] = curr_brain_info.memories
if self.use_visual_obs:
for i, _ in enumerate(curr_brain_info.visual_observations):
feed_dict[self.model.visual_in[i]] = curr_brain_info.visual_observations[i]
if self.use_vector_obs:
feed_dict[self.model.vector_in] = curr_brain_info.vector_observations
if not self.is_continuous_action:
feed_dict[self.model.action_masks] = curr_brain_info.action_masks
values = self.sess.run(self.inference_run_list, feed_dict=feed_dict)
run_out = dict(zip(self.inference_run_list, values))
self.stats['value_estimate'].append(run_out[self.model.value].mean())
self.stats['entropy'].append(run_out[self.model.entropy].mean())
self.stats['learning_rate'].append(run_out[self.model.learning_rate])
if self.use_recurrent:
return run_out[self.model.output], run_out[self.model.memory_out], None, run_out[self.model.value], run_out
run_out = self.policy.evaluate(curr_brain_info)
self.stats['value_estimate'].append(run_out['value'].mean())
self.stats['entropy'].append(run_out['entropy'].mean())
self.stats['learning_rate'].append(run_out['learning_rate'])
if self.policy.use_recurrent:
return run_out['action'], run_out['memory_out'], None, \
run_out['value'], run_out
return run_out[self.model.output], None, None, run_out[self.model.value], run_out
return run_out['action'], None, None, run_out['value'], run_out
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
"""

visual_observations[i].append(agent_brain_info.visual_observations[i][agent_index])
vector_observations.append(agent_brain_info.vector_observations[agent_index])
text_observations.append(agent_brain_info.text_observations[agent_index])
if self.use_recurrent:
memories.append(agent_brain_info.memories[agent_index])
if self.policy.use_recurrent:
if len(agent_brain_info.memories > 0):
memories.append(agent_brain_info.memories[agent_index])
else:
memories.append(self.policy.make_empty_memory(1))
rewards.append(agent_brain_info.rewards[agent_index])
local_dones.append(agent_brain_info.local_done[agent_index])
max_reacheds.append(agent_brain_info.max_reached[agent_index])

curr_info = BrainInfo(visual_observations, vector_observations, text_observations, memories, rewards,
agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds)
if self.policy.use_recurrent:
memories = np.vstack(memories)
curr_info = BrainInfo(visual_observations, vector_observations, text_observations,
memories, rewards, agents, local_dones, prev_vector_actions,
prev_text_actions, max_reacheds)
def generate_intrinsic_rewards(self, curr_info, next_info):
"""
Generates intrinsic reward used for Curiosity-based training.
:BrainInfo curr_info: Current BrainInfo.
:BrainInfo next_info: Next BrainInfo.
:return: Intrinsic rewards for all agents.
"""
if self.use_curiosity:
feed_dict = {self.model.batch_size: len(next_info.vector_observations), self.model.sequence_length: 1}
if self.is_continuous_action:
feed_dict[self.model.output] = next_info.previous_vector_actions
else:
feed_dict[self.model.action_holder] = next_info.previous_vector_actions
if curr_info.agents != next_info.agents:
curr_info = self.construct_curr_info(next_info)
if len(curr_info.agents) == 0:
return []
if self.use_visual_obs:
for i in range(len(curr_info.visual_observations)):
feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i]
feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
if self.use_vector_obs:
feed_dict[self.model.vector_in] = curr_info.vector_observations
feed_dict[self.model.next_vector_in] = next_info.vector_observations
if self.use_recurrent:
if curr_info.memories.shape[1] == 0:
curr_info.memories = np.zeros((len(curr_info.agents), self.m_size))
feed_dict[self.model.memory_in] = curr_info.memories
intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
feed_dict=feed_dict) * float(self.has_updated)
return intrinsic_rewards
else:
return None
def generate_value_estimate(self, brain_info, idx):
"""
Generates value estimates for bootstrapping.
:param brain_info: BrainInfo to be used for bootstrapping.
:param idx: Index in BrainInfo of agent.
:return: Value estimate.
"""
feed_dict = {self.model.batch_size: 1, self.model.sequence_length: 1}
if self.use_visual_obs:
for i in range(len(brain_info.visual_observations)):
feed_dict[self.model.visual_in[i]] = [brain_info.visual_observations[i][idx]]
if self.use_vector_obs:
feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
if self.use_recurrent:
if brain_info.memories.shape[1] == 0:
brain_info.memories = np.zeros(
(len(brain_info.vector_observations), self.m_size))
feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
if not self.is_continuous_action and self.use_recurrent:
feed_dict[self.model.prev_action] = brain_info.previous_vector_actions[idx].reshape(
[-1, len(self.brain.vector_action_space_size)])
value_estimate = self.sess.run(self.model.value, feed_dict)
return value_estimate
def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs):
"""
Adds experiences to each agent's experience history.

self.training_buffer[agent_id].last_brain_info = curr_info
self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs
intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info)
if curr_info.agents != next_info.agents:
curr_to_use = self.construct_curr_info(next_info)
else:
curr_to_use = curr_info
intrinsic_rewards = self.policy.get_intrinsic_rewards(curr_to_use, next_info)
for agent_id in next_info.agents:
stored_info = self.training_buffer[agent_id].last_brain_info

next_idx = next_info.agents.index(agent_id)
if not stored_info.local_done[idx]:
if self.use_visual_obs:
for i, _ in enumerate(stored_info.visual_observations):
self.training_buffer[agent_id]['visual_obs%d' % i].append(
stored_info.visual_observations[i][idx])
self.training_buffer[agent_id]['next_visual_obs%d' % i].append(
next_info.visual_observations[i][next_idx])
if self.use_vector_obs:
for i, _ in enumerate(stored_info.visual_observations):
self.training_buffer[agent_id]['visual_obs%d' % i].append(
stored_info.visual_observations[i][idx])
self.training_buffer[agent_id]['next_visual_obs%d' % i].append(
next_info.visual_observations[i][next_idx])
if self.policy.use_vec_obs:
if self.use_recurrent:
if self.policy.use_recurrent:
stored_info.memories = np.zeros((len(stored_info.agents), self.m_size))
stored_info.memories = np.zeros((len(stored_info.agents), self.policy.m_size))
actions = stored_take_action_outputs[self.model.output]
if self.is_continuous_action:
actions_pre = stored_take_action_outputs[self.model.output_pre]
actions = stored_take_action_outputs['action']
if self.policy.use_continuous_act:
actions_pre = stored_take_action_outputs['pre_action']
self.training_buffer[agent_id][self.action_masking_name].append(stored_info.action_masks[idx])
a_dist = stored_take_action_outputs[self.model.all_log_probs]
value = stored_take_action_outputs[self.model.value]
self.training_buffer[agent_id]['action_mask'].append(
stored_info.action_masks[idx])
a_dist = stored_take_action_outputs['log_probs']
value = stored_take_action_outputs['value']
if self.use_curiosity:
self.training_buffer[agent_id]['rewards'].append(next_info.rewards[next_idx] +
intrinsic_rewards[next_idx])

else:
bootstrapping_info = info
idx = l
value_next = self.generate_value_estimate(bootstrapping_info, idx)
value_next = self.policy.get_value_estimate(bootstrapping_info, idx)
self.training_buffer[agent_id]['advantages'].set(
get_gae(

+ self.training_buffer[agent_id]['value_estimates'].get_batch())
self.training_buffer.append_update_buffer(agent_id, batch_size=None,
training_length=self.sequence_length)
training_length=self.policy.sequence_length)
self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:

:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = len(self.training_buffer.update_buffer['actions'])
return size_of_buffer > max(int(self.trainer_parameters['buffer_size'] / self.sequence_length), 1)
return size_of_buffer > max(int(self.trainer_parameters['buffer_size'] / self.policy.sequence_length), 1)
def update_model(self):
def update_policy(self):
Uses training_buffer to update model.
Uses training_buffer to update the policy.
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.sequence_length), 1)
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1)
value_total, policy_total, forward_total, inverse_total = [], [], [], []
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(

for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):
start = l * n_sequences
end = (l + 1) * n_sequences
feed_dict = {self.model.batch_size: n_sequences,
self.model.sequence_length: self.sequence_length,
self.model.mask_input: np.array(buffer['masks'][start:end]).flatten(),
self.model.returns_holder: np.array(buffer['discounted_returns'][start:end]).flatten(),
self.model.old_value: np.array(buffer['value_estimates'][start:end]).flatten(),
self.model.advantage: np.array(buffer['advantages'][start:end]).reshape([-1, 1]),
self.model.all_old_log_probs: np.array(buffer['action_probs'][start:end]).reshape(
[-1, sum(self.brain.vector_action_space_size)])}
if self.is_continuous_action:
feed_dict[self.model.output_pre] = np.array(buffer['actions_pre'][start:end]).reshape(
[-1, self.brain.vector_action_space_size[0]])
else:
feed_dict[self.model.action_holder] = np.array(buffer['actions'][start:end]).reshape(
[-1, len(self.brain.vector_action_space_size)])
if self.use_recurrent:
feed_dict[self.model.prev_action] = np.array(buffer['prev_action'][start:end]).reshape(
[-1, len(self.brain.vector_action_space_size)])
feed_dict[self.model.action_masks] = np.array(buffer[self.action_masking_name][start:end]).reshape(
[-1, sum(self.brain.vector_action_space_size)]
)
if self.use_vector_obs:
total_observation_length = self.brain.vector_observation_space_size * \
self.brain.num_stacked_vector_observations
feed_dict[self.model.vector_in] = np.array(buffer['vector_obs'][start:end]).reshape(
[-1, total_observation_length])
if self.use_curiosity:
feed_dict[self.model.next_vector_in] = np.array(buffer['next_vector_in'][start:end]) \
.reshape([-1, total_observation_length])
if self.use_visual_obs:
for i, _ in enumerate(self.model.visual_in):
_obs = np.array(buffer['visual_obs%d' % i][start:end])
if self.sequence_length > 1 and self.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
else:
feed_dict[self.model.visual_in[i]] = _obs
if self.use_curiosity:
for i, _ in enumerate(self.model.visual_in):
_obs = np.array(buffer['next_visual_obs%d' % i][start:end])
if self.sequence_length > 1 and self.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.next_visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
else:
feed_dict[self.model.next_visual_in[i]] = _obs
if self.use_recurrent:
mem_in = np.array(buffer['memory'][start:end])[:, 0, :]
feed_dict[self.model.memory_in] = mem_in
run_list = [self.model.value_loss, self.model.policy_loss, self.model.update_batch]
run_out = self.policy.update(buffer.make_mini_batch(start, end), n_sequences)
value_total.append(run_out['value_loss'])
policy_total.append(np.abs(run_out['policy_loss']))
run_list.extend([self.model.forward_loss, self.model.inverse_loss])
values = self.sess.run(run_list, feed_dict=feed_dict)
self.has_updated = True
run_out = dict(zip(run_list, values))
value_total.append(run_out[self.model.value_loss])
policy_total.append(np.abs(run_out[self.model.policy_loss]))
if self.use_curiosity:
inverse_total.append(run_out[self.model.inverse_loss])
forward_total.append(run_out[self.model.forward_loss])
inverse_total.append(run_out['inverse_loss'])
forward_total.append(run_out['forward_loss'])
self.stats['value_loss'].append(np.mean(value_total))
self.stats['policy_loss'].append(np.mean(policy_total))
if self.use_curiosity:
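
As with the BC trainer, PPOTrainer.update_policy now reduces to slicing the update buffer and calling PPOPolicy.update, which returns a dict containing 'value_loss' and 'policy_loss' (plus 'forward_loss' and 'inverse_loss' when curiosity is enabled). Schematically, with a stub in place of the real policy and buffer:

import numpy as np

class StubPPOPolicy:
    """Stand-in for PPOPolicy.update; returns made-up losses with the keys used above."""
    def update(self, mini_batch, n_sequences):
        return {'value_loss': 0.1, 'policy_loss': -0.01}

def update_policy(update_buffer, policy, batch_size, sequence_length, num_epoch=3):
    # Mirrors the refactored loop: the policy owns feed-dict construction,
    # the trainer only batches the buffer and aggregates the losses.
    n_sequences = max(int(batch_size / sequence_length), 1)
    value_total, policy_total = [], []
    for _ in range(num_epoch):
        for l in range(len(update_buffer['actions']) // n_sequences):
            start, end = l * n_sequences, (l + 1) * n_sequences
            mini_batch = {k: np.array(v[start:end]) for k, v in update_buffer.items()}
            run_out = policy.update(mini_batch, n_sequences)
            value_total.append(run_out['value_loss'])
            policy_total.append(np.abs(run_out['policy_loss']))
    return np.mean(value_total), np.mean(policy_total)

print(update_policy({'actions': list(range(8))}, StubPPOPolicy(), batch_size=4, sequence_length=1))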

ml-agents/mlagents/trainers/trainer.py (28 changes)


class Trainer(object):
"""This class is the abstract class for the mlagents.trainers"""
def __init__(self, sess, env, brain_name, trainer_parameters, training, run_id):
def __init__(self, sess, brain_name, trainer_parameters, training, run_id):
:param env: The UnityEnvironment.
self.sess = sess
self.brain = env.brains[self.brain_name]
self.sess = sess
self.stats = {}
self.summary_writer = None

"""
Increment the step count of the trainer and updates the last reward
"""
raise UnityTrainerException("The increment_step_and_update_last_reward method was not implemented.")
raise UnityTrainerException(
"The increment_step_and_update_last_reward method was not implemented.")
def take_action(self, all_brain_info: AllBrainInfo):
"""

"""
raise UnityTrainerException("The take_action method was not implemented.")
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs):
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,
take_action_outputs):
"""
Adds experiences to each agent's experience history.
:param curr_info: Current AllBrainInfo.