self.use_recurrent = trainer_parameters["use_recurrent"]
self.use_curiosity = bool(trainer_parameters['use_curiosity'])
self.sequence_length = 1
self.step = 0
if self.use_recurrent:
    if self.m_size == 0:
        raise UnityTrainerException("The memory size for brain {0} is 0 even though the trainer uses recurrent."
                                    .format(brain_name))

self.summary_writer = tf.summary.FileWriter(self.summary_path)
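
# These ops are fetched together in a single sess.run() call at inference time
# (see take_action below); extra ops are appended depending on the action space,
# recurrence, and observation-normalization settings.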
self.inference_run_list = [self.model.output, self.model.all_probs, self.model.value,
                           self.model.entropy, self.model.learning_rate]
if self.is_continuous_action:
    self.inference_run_list.append(self.model.output_pre)
if self.use_recurrent:
    self.inference_run_list.extend([self.model.memory_out])
if (self.is_training and self.is_continuous_observation and
        self.use_vector_obs and self.trainer_parameters['normalize']):
    self.inference_run_list.extend([self.model.update_mean, self.model.update_variance])

def __str__(self):
    return '''Hyperparameters for the PPO Trainer of brain {0}: \n{1}'''.format(
        self.brain_name, '\n'.join(['\t{0}:\t{1}'.format(x, self.trainer_parameters[x]) for x in self.param_keys]))

@property
def get_step(self):
    """
    Returns the number of steps the trainer has performed
    :return: the step count of the trainer
    """
    return self.step

@property
def get_last_reward(self):
    """
    Returns the last reward the trainer has had
    :return: the new last reward
    """
    return self.sess.run(self.model.last_reward)
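    # last_reward is kept as a variable in the TensorFlow graph (rather than on the
    # trainer object), so it can be saved and restored along with the rest of the model.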

def increment_step_and_update_last_reward(self):
    """
    Increment the step count of the trainer and update the last reward.
    """
    if len(self.stats['cumulative_reward']) > 0:
        mean_reward = np.mean(self.stats['cumulative_reward'])
        self.sess.run([self.model.update_reward,
                       self.model.increment_step],
                      feed_dict={self.model.new_reward: mean_reward})
    else:
        self.sess.run(self.model.increment_step)
    self.step = self.sess.run(self.model.global_step)

def running_average(self, data, steps, running_mean, running_variance):
    """
    Computes new running mean and variances.
    :param data: New piece of data.
    :param steps: Total number of data so far.
    :param running_mean: TF op corresponding to stored running mean.
    :param running_variance: TF op corresponding to stored running variance.
    :return: New mean and variance values.
    """
    mean, var = self.sess.run([running_mean, running_variance])
    current_x = np.mean(data, axis=0)
    new_mean = mean + (current_x - mean) / (steps + 1)
    new_variance = var + (current_x - new_mean) * (current_x - mean)
    return new_mean, new_variance
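    # For reference, this is the standard incremental (Welford-style) update:
    #     new_mean     = mean + (x - mean) / (n + 1)
    #     new_variance = M2 + (x - new_mean) * (x - mean)
    # where new_variance accumulates the un-normalized sum of squared deviations;
    # dividing it by the number of samples recovers the variance.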

def take_action(self, all_brain_info: AllBrainInfo):
    """
    Decides actions given state/observation information, and takes them in environment.
    :param all_brain_info: A dictionary of brain names and BrainInfo from environment.
    :return: a tuple containing the action, the memories, the values and an object
    to be passed to add experiences
    """
    curr_brain_info = all_brain_info[self.brain_name]
    steps = self.get_step
    feed_dict = {self.model.batch_size: len(curr_brain_info.vector_observations),
                 self.model.sequence_length: 1}
    if self.use_visual_obs:
        for i, _ in enumerate(curr_brain_info.visual_observations):
            feed_dict[self.model.visual_in[i]] = curr_brain_info.visual_observations[i]
    if self.use_vector_obs:
        feed_dict[self.model.vector_in] = curr_brain_info.vector_observations
    if self.use_recurrent:
        if curr_brain_info.memories.shape[1] == 0:
            curr_brain_info.memories = np.zeros((len(curr_brain_info.agents), self.m_size))
        feed_dict[self.model.memory_in] = curr_brain_info.memories
        if not self.is_continuous_action:
            feed_dict[self.model.prev_action] = curr_brain_info.previous_vector_actions.flatten()
    if (self.is_training and self.is_continuous_observation and
            self.use_vector_obs and self.trainer_parameters['normalize']):
        new_mean, new_variance = self.running_average(
            curr_brain_info.vector_observations, steps, self.model.running_mean, self.model.running_variance)
        feed_dict[self.model.new_mean] = new_mean
        feed_dict[self.model.new_variance] = new_variance
    values = self.sess.run(self.inference_run_list, feed_dict=feed_dict)
    run_out = dict(zip(self.inference_run_list, values))
    if self.use_recurrent:
        return run_out[self.model.output], run_out[self.model.memory_out], None, run_out
    else:
        return run_out[self.model.output], None, None, run_out
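    # run_out is keyed by the fetched ops themselves, so downstream code looks up
    # results as run_out[self.model.output], run_out[self.model.value], and so on.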

def generate_intrinsic_rewards(self, curr_info, next_info):
    """
    Generates intrinsic reward used for Curiosity-based training.
    :param curr_info: Current BrainInfo.
    :param next_info: Next BrainInfo.
    :return: Intrinsic rewards for all agents.
    """
    if self.use_curiosity:
        feed_dict = {self.model.batch_size: len(curr_info.vector_observations),
                     self.model.sequence_length: 1}
        if self.is_continuous_action:
            feed_dict[self.model.output] = next_info.previous_vector_actions.flatten()
        else:
            feed_dict[self.model.action_holder] = next_info.previous_vector_actions.flatten()
        if self.use_visual_obs:
            for i in range(len(curr_info.visual_observations)):
                feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i]
                feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
        if self.use_vector_obs:
            feed_dict[self.model.vector_in] = curr_info.vector_observations
            feed_dict[self.model.next_vector_obs] = next_info.vector_observations
        if self.use_recurrent:
            if not self.is_continuous_action:
                feed_dict[self.model.prev_action] = curr_info.previous_vector_actions.flatten()
            if curr_info.memories.shape[1] == 0:
                curr_info.memories = np.zeros((len(curr_info.agents), self.m_size))
            feed_dict[self.model.memory_in] = curr_info.memories
        intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
                                          feed_dict=feed_dict) * float(self.has_updated)
        return intrinsic_rewards
    else:
        return None
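    # The intrinsic reward is derived from the curiosity module's prediction error
    # for the observed transition; the float(self.has_updated) factor zeroes it out
    # until the model has been updated at least once.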

def generate_value_estimate(self, brain_info, idx):
    """
    Generates value estimates for bootstrapping.
    :param brain_info: BrainInfo to be used for bootstrapping.
    :param idx: Index in BrainInfo of agent.
    :return: Value estimate.
    """
    feed_dict = {self.model.batch_size: 1, self.model.sequence_length: 1}
    if self.use_visual_obs:
        for i in range(len(brain_info.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [brain_info.visual_observations[i][idx]]
    if self.use_vector_obs:
        feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
    if self.use_recurrent:
        if brain_info.memories.shape[1] == 0:
            brain_info.memories = np.zeros(
                (len(brain_info.vector_observations), self.m_size))
        feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
    if not self.is_continuous_action and self.use_recurrent:
        feed_dict[self.model.prev_action] = brain_info.previous_vector_actions[idx].flatten()
    value_estimate = self.sess.run(self.model.value, feed_dict)
    return value_estimate
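    # The estimate returned here is used to bootstrap the discounted return when an
    # agent's trajectory is cut off (for example at the time horizon or at max_steps)
    # rather than ending in a true terminal state.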

def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs):
    """
    Adds experiences to each agent's experience history.
    :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
    :param next_all_info: Dictionary of all next brains and corresponding BrainInfo.
    :param take_action_outputs: The outputs of the take action method.
    """
    curr_info = curr_all_info[self.brain_name]
    next_info = next_all_info[self.brain_name]

    intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info)

    for agent_id in curr_info.agents:
        self.training_buffer[agent_id].last_brain_info = curr_info
        self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs

    for agent_id in next_info.agents:
        stored_info = self.training_buffer[agent_id].last_brain_info
        stored_take_action_outputs = self.training_buffer[agent_id].last_take_action_outputs
        if stored_info is None:
            continue
        idx = stored_info.agents.index(agent_id)
        next_idx = next_info.agents.index(agent_id)
        if not stored_info.local_done[idx]:
            # Append this step's observations, actions, action probabilities, value
            # estimate, and the extrinsic (plus optional intrinsic) reward to the
            # agent's history in the training buffer.
            ...

def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
    """
    Checks agent histories for processing condition, and processes them as necessary.
    Processing involves calculating value and advantage targets for the model updating step.
    :param current_info: Dictionary of all current brains and corresponding BrainInfo.
    :param new_info: Dictionary of all next brains and corresponding BrainInfo.
    """
    info = new_info[self.brain_name]
    for l in range(len(info.agents)):
        agent_actions = self.training_buffer[info.agents[l]]['actions']
        if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
                and len(agent_actions) > 0):
            agent_id = info.agents[l]
            if info.local_done[l] and not info.max_reached[l]:
                value_next = 0.0
            else:
                if info.max_reached[l]:
                    bootstrapping_info = self.training_buffer[agent_id].last_brain_info
                    idx = bootstrapping_info.agents.index(agent_id)
                else:
                    bootstrapping_info = info
                    idx = l
                value_next = self.generate_value_estimate(bootstrapping_info, idx)
            self.training_buffer[agent_id]['advantages'].set(
                get_gae(
                    rewards=self.training_buffer[agent_id]['rewards'].get_batch(),
                    value_estimates=self.training_buffer[agent_id]['value_estimates'].get_batch(),
                    value_next=value_next,
                    gamma=self.trainer_parameters['gamma'],
                    lambd=self.trainer_parameters['lambd']))
            self.training_buffer[agent_id]['discounted_returns'].set(
                self.training_buffer[agent_id]['advantages'].get_batch()
                + self.training_buffer[agent_id]['value_estimates'].get_batch())

            self.training_buffer.append_update_buffer(agent_id, batch_size=None,
                                                      training_length=self.sequence_length)

            self.training_buffer[agent_id].reset_agent()
            if info.local_done[l]:
                # Record episode statistics (cumulative reward, episode length, and
                # intrinsic reward when curiosity is enabled) and reset the per-agent
                # counters for the next episode.
                ...
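    # For reference, get_gae computes the generalized advantage estimate
    #     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #     A_t     = sum_{l>=0} (gamma * lambd)^l * delta_{t+l}
    # with value_next standing in for V(s_T) on the final, possibly non-terminal, step.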

def is_ready_update(self):
    """
    Returns whether or not the trainer has enough elements to run update model
    :return: A boolean corresponding to whether or not update_model() can be run
    """
    size_of_buffer = len(self.training_buffer.update_buffer['actions'])
    return size_of_buffer > max(int(self.trainer_parameters['buffer_size'] / self.sequence_length), 1)
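    # Example: with the default sequence_length of 1 and buffer_size set to 2048,
    # update_model() becomes eligible to run once more than 2048 entries have been
    # accumulated in the update buffer.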

def update_model(self):
    """
    Uses the training buffer to update the policy and value networks.
    """
    num_epoch = self.trainer_parameters['num_epoch']
    n_sequences = max(int(self.trainer_parameters['batch_size'] / self.sequence_length), 1)
    value_total, policy_total, forward_total, inverse_total = [], [], [], []
    buffer = self.training_buffer.update_buffer
    for k in range(num_epoch):
        buffer.shuffle()
        for l in range(len(buffer['actions']) // n_sequences):
            start = l * n_sequences
            end = (l + 1) * n_sequences
            feed_dict = {self.model.batch_size: n_sequences,
                         self.model.sequence_length: self.sequence_length,
                         self.model.mask_input: np.array(buffer['masks'][start:end]).flatten(),
                         self.model.returns_holder: np.array(buffer['discounted_returns'][start:end]).flatten(),
                         self.model.old_value: np.array(buffer['value_estimates'][start:end]).flatten(),
                         self.model.advantage: np.array(buffer['advantages'][start:end]).reshape([-1, 1]),
                         self.model.all_old_probs: np.array(buffer['action_probs'][start:end]).reshape(
                             [-1, self.brain.vector_action_space_size])}
            if self.is_continuous_action:
                feed_dict[self.model.output_pre] = np.array(buffer['actions_pre'][start:end]).reshape(
                    [-1, self.brain.vector_action_space_size])
            else:
                feed_dict[self.model.action_holder] = np.array(buffer['actions'][start:end]).flatten()
                if self.use_recurrent:
                    feed_dict[self.model.prev_action] = np.array(buffer['prev_action'][start:end]).flatten()
            if self.use_vector_obs:
                if self.is_continuous_observation:
                    total_observation_length = self.brain.vector_observation_space_size * \
                                               self.brain.num_stacked_vector_observations
                    feed_dict[self.model.vector_in] = np.array(buffer['vector_obs'][start:end]).reshape(
                        [-1, total_observation_length])
                    if self.use_curiosity:
                        feed_dict[self.model.next_vector_obs] = np.array(buffer['next_vector_obs'][start:end]) \
                            .reshape([-1, total_observation_length])
                else:
                    feed_dict[self.model.vector_in] = np.array(buffer['vector_obs'][start:end]).reshape(
                        [-1, self.brain.num_stacked_vector_observations])
            if self.use_visual_obs:
                for i, _ in enumerate(self.model.visual_in):
                    _obs = np.array(buffer['visual_obs%d' % i][start:end])
                    feed_dict[self.model.visual_in[i]] = _obs.reshape([-1] + list(_obs.shape[2:]))
                    if self.use_curiosity:
                        _obs = np.array(buffer['next_visual_obs%d' % i][start:end])
                        feed_dict[self.model.next_visual_in[i]] = _obs.reshape([-1] + list(_obs.shape[2:]))
            if self.use_recurrent:
                mem_in = np.array(buffer['memory'][start:end])[:, 0, :]
                feed_dict[self.model.memory_in] = mem_in
            run_list = [self.model.value_loss, self.model.policy_loss, self.model.update_batch]
            if self.use_curiosity:
                run_list.extend([self.model.forward_loss, self.model.inverse_loss])
            values = self.sess.run(run_list, feed_dict=feed_dict)
            run_out = dict(zip(run_list, values))
            value_total.append(run_out[self.model.value_loss])
            policy_total.append(np.abs(run_out[self.model.policy_loss]))
            if self.use_curiosity:
                forward_total.append(run_out[self.model.forward_loss])
                inverse_total.append(run_out[self.model.inverse_loss])
    self.stats['value_loss'].append(np.mean(value_total))
    self.stats['policy_loss'].append(np.mean(policy_total))
    if self.use_curiosity:
        self.stats['forward_loss'].append(np.mean(forward_total))
        self.stats['inverse_loss'].append(np.mean(inverse_total))
    self.training_buffer.reset_update_buffer()
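    # Re-iterating over the same collected buffer for several epochs is the standard
    # PPO scheme: the surrogate objective is clipped against the stored old action
    # probabilities (all_old_probs), which keeps repeated updates close to the policy
    # that gathered the data.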

def discount_rewards(r, gamma=0.99, value_next=0.0):
    """