|
|
|
|
|
|
|
|
|
|
from mlagents.envs import AllBrainInfo, BrainInfo |
|
|
|
from mlagents.trainers.buffer import Buffer |
|
|
|
from mlagents.trainers.ppo.models import PPOModel |
|
|
|
from mlagents.trainers.ppo.policy import PPOPolicy |
|
|
|
from mlagents.trainers.trainer import UnityTrainerException, Trainer |
|
|
|
|
|
|
|
logger = logging.getLogger("mlagents.envs") |
|
|
|
|
|
|
"""The PPOTrainer is an implementation of the PPO algorithm.""" |
|
|
|
action_masking_name = 'action_masks' |
|
|
|
|
|
|
|
def __init__(self, sess, env, brain_name, trainer_parameters, training, seed, run_id): |
|
|
|
|
|
|
|
def __init__(self, sess, brain, trainer_parameters, training, seed, run_id): |
|
|
|
:param env: The UnityEnvironment. |
|
|
|
super(PPOTrainer, self).__init__(sess, brain.brain_name, trainer_parameters, training, run_id) |
|
|
|
|
|
|
|
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd', |
|
|
|
'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers', |
|
|
|
'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent', |
|
|
|
|
|
|
for k in self.param_keys: |
|
|
|
if k not in trainer_parameters: |
|
|
|
raise UnityTrainerException("The hyperparameter {0} could not be found for the PPO trainer of " |
|
|
|
"brain {1}.".format(k, brain_name)) |
|
|
|
"brain {1}.".format(k, brain.brain_name)) |
|
|
|
super(PPOTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training, run_id) |
|
|
|
self.use_curiosity = bool(trainer_parameters['use_curiosity']) |
|
|
|
self.use_recurrent = trainer_parameters["use_recurrent"] |
|
|
|
self.use_curiosity = bool(trainer_parameters['use_curiosity']) |
|
|
|
self.sequence_length = 1 |
|
|
|
self.has_updated = False |
|
|
|
self.m_size = None |
|
|
|
if self.use_recurrent: |
|
|
|
self.m_size = trainer_parameters["memory_size"] |
|
|
|
self.sequence_length = trainer_parameters["sequence_length"] |
|
|
|
if self.m_size == 0: |
|
|
|
raise UnityTrainerException("The memory size for brain {0} is 0 even though the trainer uses recurrent." |
|
|
|
.format(brain_name)) |
|
|
|
elif self.m_size % 4 != 0: |
|
|
|
raise UnityTrainerException("The memory size for brain {0} is {1} but it must be divisible by 4." |
|
|
|
.format(brain_name, self.m_size)) |
|
|
|
self.variable_scope = trainer_parameters['graph_scope'] |
|
|
|
with tf.variable_scope(self.variable_scope): |
|
|
|
tf.set_random_seed(seed) |
|
|
|
self.model = PPOModel(env.brains[brain_name], |
|
|
|
lr=float(trainer_parameters['learning_rate']), |
|
|
|
h_size=int(trainer_parameters['hidden_units']), |
|
|
|
epsilon=float(trainer_parameters['epsilon']), |
|
|
|
beta=float(trainer_parameters['beta']), |
|
|
|
max_step=float(trainer_parameters['max_steps']), |
|
|
|
normalize=trainer_parameters['normalize'], |
|
|
|
use_recurrent=trainer_parameters['use_recurrent'], |
|
|
|
num_layers=int(trainer_parameters['num_layers']), |
|
|
|
m_size=self.m_size, |
|
|
|
use_curiosity=bool(trainer_parameters['use_curiosity']), |
|
|
|
curiosity_strength=float(trainer_parameters['curiosity_strength']), |
|
|
|
curiosity_enc_size=float(trainer_parameters['curiosity_enc_size'])) |
|
|
|
self.policy = PPOPolicy(seed, brain, trainer_parameters, |
|
|
|
sess, self.is_training) |
|
|
|
|
|
|
|
stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [], |
|
|
|
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []} |
|
|
|
|
|
|
self.training_buffer = Buffer() |
|
|
|
self.cumulative_rewards = {} |
|
|
|
self.episode_steps = {} |
|
|
|
self.is_continuous_action = (env.brains[brain_name].vector_action_space_type == "continuous") |
|
|
|
self.use_visual_obs = (env.brains[brain_name].number_visual_observations > 0) |
|
|
|
self.use_vector_obs = (env.brains[brain_name].vector_observation_space_size > 0) |
|
|
|
self.summary_path = trainer_parameters['summary_path'] |
|
|
|
if not os.path.exists(self.summary_path): |
|
|
|
os.makedirs(self.summary_path) |
|
|
|
|
|
|
self.inference_run_list = [self.model.output, self.model.all_log_probs, self.model.value, |
|
|
|
self.model.entropy, self.model.learning_rate] |
|
|
|
if self.is_continuous_action: |
|
|
|
self.inference_run_list.append(self.model.output_pre) |
|
|
|
if self.use_recurrent: |
|
|
|
self.inference_run_list.extend([self.model.memory_out]) |
|
|
|
if self.is_training and self.use_vector_obs and self.trainer_parameters['normalize']: |
|
|
|
self.inference_run_list.extend([self.model.update_mean, self.model.update_variance]) |
|
|
|
|
|
|
|
def __str__(self):
    """
    Builds a printable summary of this trainer's hyperparameters.
    :return: A header line naming the brain, followed by one tab-indented
    line per entry of self.param_keys showing the key and its value from
    self.trainer_parameters.
    """
    rows = []
    for key in self.param_keys:
        rows.append('\t{0}:\t{1}'.format(key, self.trainer_parameters[key]))
    listing = '\n'.join(rows)
    return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(self.brain_name, listing)
|
|
|
|
|
|
return self.trainer_parameters |
|
|
|
|
|
|
|
@property
def graph_scope(self):
    """
    Exposes the variable scope stored on this trainer.
    :return: The value of self.variable_scope.
    """
    scope = self.variable_scope
    return scope
|
|
|
|
|
|
|
@property |
|
|
|
def get_max_steps(self): |
|
|
|
""" |
|
|
|
Returns the maximum number of steps. Is used to know when the trainer should be stopped. |
|
|
|
|
|
|
""" |
|
|
|
return self.step |
|
|
|
|
|
|
|
@property
def get_last_reward(self):
    """
    Evaluates the model's last_reward node in the trainer's session.
    :return: Whatever self.sess.run yields for self.model.last_reward.
    """
    reward_node = self.model.last_reward
    return self.sess.run(reward_node)
|
|
|
|
|
|
|
def increment_step_and_update_last_reward(self): |
|
|
|
""" |
|
|
|
Increment the step count of the trainer and Updates the last reward |
|
|
|
|
|
|
self.sess.run([self.model.update_reward, |
|
|
|
self.model.increment_step], |
|
|
|
feed_dict={self.model.new_reward: mean_reward}) |
|
|
|
else: |
|
|
|
self.sess.run(self.model.increment_step) |
|
|
|
self.step = self.sess.run(self.model.global_step) |
|
|
|
self.policy.update_reward(mean_reward) |
|
|
|
self.policy.increment_step() |
|
|
|
self.step = self.policy.get_current_step() |
|
|
|
|
|
|
|
def take_action(self, all_brain_info: AllBrainInfo): |
|
|
|
""" |
|
|
|
|
|
|
if len(curr_brain_info.agents) == 0: |
|
|
|
return [], [], [], None, None |
|
|
|
|
|
|
|
feed_dict = {self.model.batch_size: len(curr_brain_info.vector_observations), |
|
|
|
self.model.sequence_length: 1} |
|
|
|
if self.use_recurrent: |
|
|
|
if not self.is_continuous_action: |
|
|
|
feed_dict[self.model.prev_action] = curr_brain_info.previous_vector_actions.reshape( |
|
|
|
[-1, len(self.brain.vector_action_space_size)]) |
|
|
|
if curr_brain_info.memories.shape[1] == 0: |
|
|
|
curr_brain_info.memories = np.zeros((len(curr_brain_info.agents), self.m_size)) |
|
|
|
feed_dict[self.model.memory_in] = curr_brain_info.memories |
|
|
|
if self.use_visual_obs: |
|
|
|
for i, _ in enumerate(curr_brain_info.visual_observations): |
|
|
|
feed_dict[self.model.visual_in[i]] = curr_brain_info.visual_observations[i] |
|
|
|
if self.use_vector_obs: |
|
|
|
feed_dict[self.model.vector_in] = curr_brain_info.vector_observations |
|
|
|
if not self.is_continuous_action: |
|
|
|
feed_dict[self.model.action_masks] = curr_brain_info.action_masks |
|
|
|
|
|
|
|
values = self.sess.run(self.inference_run_list, feed_dict=feed_dict) |
|
|
|
run_out = dict(zip(self.inference_run_list, values)) |
|
|
|
self.stats['value_estimate'].append(run_out[self.model.value].mean()) |
|
|
|
self.stats['entropy'].append(run_out[self.model.entropy].mean()) |
|
|
|
self.stats['learning_rate'].append(run_out[self.model.learning_rate]) |
|
|
|
if self.use_recurrent: |
|
|
|
return run_out[self.model.output], run_out[self.model.memory_out], None, run_out[self.model.value], run_out |
|
|
|
run_out = self.policy.evaluate(curr_brain_info) |
|
|
|
self.stats['value_estimate'].append(run_out['value'].mean()) |
|
|
|
self.stats['entropy'].append(run_out['entropy'].mean()) |
|
|
|
self.stats['learning_rate'].append(run_out['learning_rate']) |
|
|
|
if self.policy.use_recurrent: |
|
|
|
return run_out['action'], run_out['memory_out'], None, \ |
|
|
|
run_out['value'], run_out |
|
|
|
return run_out[self.model.output], None, None, run_out[self.model.value], run_out |
|
|
|
return run_out['action'], None, None, run_out['value'], run_out |
|
|
|
|
|
|
|
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo: |
|
|
|
""" |
|
|
|
|
|
|
visual_observations[i].append(agent_brain_info.visual_observations[i][agent_index]) |
|
|
|
vector_observations.append(agent_brain_info.vector_observations[agent_index]) |
|
|
|
text_observations.append(agent_brain_info.text_observations[agent_index]) |
|
|
|
if self.use_recurrent: |
|
|
|
memories.append(agent_brain_info.memories[agent_index]) |
|
|
|
if self.policy.use_recurrent: |
|
|
|
if len(agent_brain_info.memories > 0): |
|
|
|
memories.append(agent_brain_info.memories[agent_index]) |
|
|
|
else: |
|
|
|
memories.append(self.policy.make_empty_memory(1)) |
|
|
|
rewards.append(agent_brain_info.rewards[agent_index]) |
|
|
|
local_dones.append(agent_brain_info.local_done[agent_index]) |
|
|
|
max_reacheds.append(agent_brain_info.max_reached[agent_index]) |
|
|
|
|
|
|
curr_info = BrainInfo(visual_observations, vector_observations, text_observations, memories, rewards, |
|
|
|
agents, local_dones, prev_vector_actions, prev_text_actions, max_reacheds) |
|
|
|
if self.policy.use_recurrent: |
|
|
|
memories = np.vstack(memories) |
|
|
|
curr_info = BrainInfo(visual_observations, vector_observations, text_observations, |
|
|
|
memories, rewards, agents, local_dones, prev_vector_actions, |
|
|
|
prev_text_actions, max_reacheds) |
|
|
|
def generate_intrinsic_rewards(self, curr_info, next_info):
    """
    Generates intrinsic reward used for Curiosity-based training.
    :BrainInfo curr_info: Current BrainInfo. Replaced by
    self.construct_curr_info(next_info) when its agent list differs from
    next_info's. Its memories attribute is overwritten with zeros when the
    trainer is recurrent and the stored memories have zero columns.
    :BrainInfo next_info: Next BrainInfo.
    :return: None when curiosity is disabled, an empty list when there are no
    agents in the current info, otherwise the session output for
    self.model.intrinsic_reward multiplied by float(self.has_updated).
    """
    # Guard clause: nothing to compute without the curiosity module.
    if not self.use_curiosity:
        return None

    model = self.model
    feed_dict = {model.batch_size: len(next_info.vector_observations),
                 model.sequence_length: 1}

    # The action taken between curr_info and next_info is fed through a
    # different placeholder depending on the action space type.
    action_input = model.output if self.is_continuous_action else model.action_holder
    feed_dict[action_input] = next_info.previous_vector_actions

    if curr_info.agents != next_info.agents:
        curr_info = self.construct_curr_info(next_info)

    if len(curr_info.agents) == 0:
        return []

    if self.use_visual_obs:
        for i, curr_obs in enumerate(curr_info.visual_observations):
            feed_dict[model.visual_in[i]] = curr_obs
            feed_dict[model.next_visual_in[i]] = next_info.visual_observations[i]
    if self.use_vector_obs:
        feed_dict[model.vector_in] = curr_info.vector_observations
        feed_dict[model.next_vector_in] = next_info.vector_observations
    if self.use_recurrent:
        # Zero-width memories are replaced by a zero matrix of width m_size.
        if curr_info.memories.shape[1] == 0:
            curr_info.memories = np.zeros((len(curr_info.agents), self.m_size))
        feed_dict[model.memory_in] = curr_info.memories

    intrinsic_rewards = self.sess.run(model.intrinsic_reward, feed_dict=feed_dict)
    # Multiplying by float(has_updated) zeroes the rewards while the flag is False.
    return intrinsic_rewards * float(self.has_updated)
|
|
|
|
|
|
|
def generate_value_estimate(self, brain_info, idx):
    """
    Generates value estimates for bootstrapping.
    :param brain_info: BrainInfo to be used for bootstrapping. When the trainer
    is recurrent and brain_info.memories has zero columns, the attribute is
    overwritten with a zero matrix of width self.m_size.
    :param idx: Index in BrainInfo of agent.
    :return: Value estimate, i.e. the session output for self.model.value.
    """
    model = self.model
    # Exactly one agent is evaluated, so batch size and sequence length are 1.
    feed_dict = {model.batch_size: 1, model.sequence_length: 1}

    if self.use_visual_obs:
        for i, camera_obs in enumerate(brain_info.visual_observations):
            feed_dict[model.visual_in[i]] = [camera_obs[idx]]
    if self.use_vector_obs:
        feed_dict[model.vector_in] = [brain_info.vector_observations[idx]]
    if self.use_recurrent:
        if brain_info.memories.shape[1] == 0:
            brain_info.memories = np.zeros(
                (len(brain_info.vector_observations), self.m_size))
        feed_dict[model.memory_in] = [brain_info.memories[idx]]
        # The previous action is only fed for recurrent, non-continuous action spaces.
        if not self.is_continuous_action:
            feed_dict[model.prev_action] = brain_info.previous_vector_actions[idx].reshape(
                [-1, len(self.brain.vector_action_space_size)])

    return self.sess.run(model.value, feed_dict)
|
|
|
|
|
|
|
def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs): |
|
|
|
""" |
|
|
|
Adds experiences to each agent's experience history. |
|
|
|
|
|
|
self.training_buffer[agent_id].last_brain_info = curr_info |
|
|
|
self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs |
|
|
|
|
|
|
|
intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info) |
|
|
|
if curr_info.agents != next_info.agents: |
|
|
|
curr_to_use = self.construct_curr_info(next_info) |
|
|
|
else: |
|
|
|
curr_to_use = curr_info |
|
|
|
|
|
|
|
intrinsic_rewards = self.policy.get_intrinsic_rewards(curr_to_use, next_info) |
|
|
|
|
|
|
|
for agent_id in next_info.agents: |
|
|
|
stored_info = self.training_buffer[agent_id].last_brain_info |
|
|
|
|
|
|
next_idx = next_info.agents.index(agent_id) |
|
|
|
if not stored_info.local_done[idx]: |
|
|
|
if self.use_visual_obs: |
|
|
|
for i, _ in enumerate(stored_info.visual_observations): |
|
|
|
self.training_buffer[agent_id]['visual_obs%d' % i].append( |
|
|
|
stored_info.visual_observations[i][idx]) |
|
|
|
self.training_buffer[agent_id]['next_visual_obs%d' % i].append( |
|
|
|
next_info.visual_observations[i][next_idx]) |
|
|
|
if self.use_vector_obs: |
|
|
|
for i, _ in enumerate(stored_info.visual_observations): |
|
|
|
self.training_buffer[agent_id]['visual_obs%d' % i].append( |
|
|
|
stored_info.visual_observations[i][idx]) |
|
|
|
self.training_buffer[agent_id]['next_visual_obs%d' % i].append( |
|
|
|
next_info.visual_observations[i][next_idx]) |
|
|
|
if self.policy.use_vec_obs: |
|
|
|
if self.use_recurrent: |
|
|
|
if self.policy.use_recurrent: |
|
|
|
stored_info.memories = np.zeros((len(stored_info.agents), self.m_size)) |
|
|
|
stored_info.memories = np.zeros((len(stored_info.agents), self.policy.m_size)) |
|
|
|
actions = stored_take_action_outputs[self.model.output] |
|
|
|
if self.is_continuous_action: |
|
|
|
actions_pre = stored_take_action_outputs[self.model.output_pre] |
|
|
|
actions = stored_take_action_outputs['action'] |
|
|
|
if self.policy.use_continuous_act: |
|
|
|
actions_pre = stored_take_action_outputs['pre_action'] |
|
|
|
self.training_buffer[agent_id][self.action_masking_name].append(stored_info.action_masks[idx]) |
|
|
|
a_dist = stored_take_action_outputs[self.model.all_log_probs] |
|
|
|
value = stored_take_action_outputs[self.model.value] |
|
|
|
self.training_buffer[agent_id]['action_mask'].append( |
|
|
|
stored_info.action_masks[idx]) |
|
|
|
a_dist = stored_take_action_outputs['log_probs'] |
|
|
|
value = stored_take_action_outputs['value'] |
|
|
|
|
|
|
|
if self.use_curiosity: |
|
|
|
self.training_buffer[agent_id]['rewards'].append(next_info.rewards[next_idx] + |
|
|
|
intrinsic_rewards[next_idx]) |
|
|
|
|
|
|
else: |
|
|
|
bootstrapping_info = info |
|
|
|
idx = l |
|
|
|
value_next = self.generate_value_estimate(bootstrapping_info, idx) |
|
|
|
value_next = self.policy.get_value_estimate(bootstrapping_info, idx) |
|
|
|
|
|
|
|
self.training_buffer[agent_id]['advantages'].set( |
|
|
|
get_gae( |
|
|
|
|
|
|
+ self.training_buffer[agent_id]['value_estimates'].get_batch()) |
|
|
|
|
|
|
|
self.training_buffer.append_update_buffer(agent_id, batch_size=None, |
|
|
|
training_length=self.sequence_length) |
|
|
|
training_length=self.policy.sequence_length) |
|
|
|
|
|
|
|
self.training_buffer[agent_id].reset_agent() |
|
|
|
if info.local_done[l]: |
|
|
|
|
|
|
:return: A boolean corresponding to whether or not update_model() can be run |
|
|
|
""" |
|
|
|
size_of_buffer = len(self.training_buffer.update_buffer['actions']) |
|
|
|
return size_of_buffer > max(int(self.trainer_parameters['buffer_size'] / self.sequence_length), 1) |
|
|
|
return size_of_buffer > max(int(self.trainer_parameters['buffer_size'] / self.policy.sequence_length), 1) |
|
|
|
def update_model(self): |
|
|
|
def update_policy(self): |
|
|
|
Uses training_buffer to update model. |
|
|
|
Uses training_buffer to update the policy. |
|
|
|
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.sequence_length), 1) |
|
|
|
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.policy.sequence_length), 1) |
|
|
|
value_total, policy_total, forward_total, inverse_total = [], [], [], [] |
|
|
|
advantages = self.training_buffer.update_buffer['advantages'].get_batch() |
|
|
|
self.training_buffer.update_buffer['advantages'].set( |
|
|
|
|
|
|
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences): |
|
|
|
start = l * n_sequences |
|
|
|
end = (l + 1) * n_sequences |
|
|
|
feed_dict = {self.model.batch_size: n_sequences, |
|
|
|
self.model.sequence_length: self.sequence_length, |
|
|
|
self.model.mask_input: np.array(buffer['masks'][start:end]).flatten(), |
|
|
|
self.model.returns_holder: np.array(buffer['discounted_returns'][start:end]).flatten(), |
|
|
|
self.model.old_value: np.array(buffer['value_estimates'][start:end]).flatten(), |
|
|
|
self.model.advantage: np.array(buffer['advantages'][start:end]).reshape([-1, 1]), |
|
|
|
self.model.all_old_log_probs: np.array(buffer['action_probs'][start:end]).reshape( |
|
|
|
[-1, sum(self.brain.vector_action_space_size)])} |
|
|
|
if self.is_continuous_action: |
|
|
|
feed_dict[self.model.output_pre] = np.array(buffer['actions_pre'][start:end]).reshape( |
|
|
|
[-1, self.brain.vector_action_space_size[0]]) |
|
|
|
else: |
|
|
|
feed_dict[self.model.action_holder] = np.array(buffer['actions'][start:end]).reshape( |
|
|
|
[-1, len(self.brain.vector_action_space_size)]) |
|
|
|
if self.use_recurrent: |
|
|
|
feed_dict[self.model.prev_action] = np.array(buffer['prev_action'][start:end]).reshape( |
|
|
|
[-1, len(self.brain.vector_action_space_size)]) |
|
|
|
feed_dict[self.model.action_masks] = np.array(buffer[self.action_masking_name][start:end]).reshape( |
|
|
|
[-1, sum(self.brain.vector_action_space_size)] |
|
|
|
) |
|
|
|
if self.use_vector_obs: |
|
|
|
total_observation_length = self.brain.vector_observation_space_size * \ |
|
|
|
self.brain.num_stacked_vector_observations |
|
|
|
feed_dict[self.model.vector_in] = np.array(buffer['vector_obs'][start:end]).reshape( |
|
|
|
[-1, total_observation_length]) |
|
|
|
if self.use_curiosity: |
|
|
|
feed_dict[self.model.next_vector_in] = np.array(buffer['next_vector_in'][start:end]) \ |
|
|
|
.reshape([-1, total_observation_length]) |
|
|
|
if self.use_visual_obs: |
|
|
|
for i, _ in enumerate(self.model.visual_in): |
|
|
|
_obs = np.array(buffer['visual_obs%d' % i][start:end]) |
|
|
|
if self.sequence_length > 1 and self.use_recurrent: |
|
|
|
(_batch, _seq, _w, _h, _c) = _obs.shape |
|
|
|
feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c]) |
|
|
|
else: |
|
|
|
feed_dict[self.model.visual_in[i]] = _obs |
|
|
|
if self.use_curiosity: |
|
|
|
for i, _ in enumerate(self.model.visual_in): |
|
|
|
_obs = np.array(buffer['next_visual_obs%d' % i][start:end]) |
|
|
|
if self.sequence_length > 1 and self.use_recurrent: |
|
|
|
(_batch, _seq, _w, _h, _c) = _obs.shape |
|
|
|
feed_dict[self.model.next_visual_in[i]] = _obs.reshape([-1, _w, _h, _c]) |
|
|
|
else: |
|
|
|
feed_dict[self.model.next_visual_in[i]] = _obs |
|
|
|
if self.use_recurrent: |
|
|
|
mem_in = np.array(buffer['memory'][start:end])[:, 0, :] |
|
|
|
feed_dict[self.model.memory_in] = mem_in |
|
|
|
|
|
|
|
run_list = [self.model.value_loss, self.model.policy_loss, self.model.update_batch] |
|
|
|
run_out = self.policy.update(buffer.make_mini_batch(start, end), n_sequences) |
|
|
|
value_total.append(run_out['value_loss']) |
|
|
|
policy_total.append(np.abs(run_out['policy_loss'])) |
|
|
|
run_list.extend([self.model.forward_loss, self.model.inverse_loss]) |
|
|
|
values = self.sess.run(run_list, feed_dict=feed_dict) |
|
|
|
self.has_updated = True |
|
|
|
run_out = dict(zip(run_list, values)) |
|
|
|
value_total.append(run_out[self.model.value_loss]) |
|
|
|
policy_total.append(np.abs(run_out[self.model.policy_loss])) |
|
|
|
if self.use_curiosity: |
|
|
|
inverse_total.append(run_out[self.model.inverse_loss]) |
|
|
|
forward_total.append(run_out[self.model.forward_loss]) |
|
|
|
inverse_total.append(run_out['inverse_loss']) |
|
|
|
forward_total.append(run_out['forward_loss']) |
|
|
|
self.stats['value_loss'].append(np.mean(value_total)) |
|
|
|
self.stats['policy_loss'].append(np.mean(policy_total)) |
|
|
|
if self.use_curiosity: |
|
|
|