import sys
from typing import Dict, List, TypeVar
from collections import defaultdict, Counter

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter

T = TypeVar("T")
"""
self . experience_buffers : Dict [ str , List [ AgentExperience ] ] = defaultdict ( list )
self . last_brain_info : Dict [ str , BrainInfo ] = { }
# last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while
# grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1).
# Note: this is needed until we switch to AgentExperiences as the data input type.
# We still need some info from the policy (memories, previous actions)
# that really should be gathered by the env-manager.
# Note: In the future this policy reference will be the policy of the env_manager and not the trainer.
# We can in that case just grab the action from the policy rather than having it passed in.
self . policy = policy
self . episode_steps : Counter = Counter ( )
self . episode_rewards : Dict [ str , float ] = defaultdict ( float )
self . behavior_id = behavior_id

    def add_experiences(
        self, curr_info: BrainInfo, previous_action: ActionInfo
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: current BrainInfo.
        :param previous_action: The return value of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
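        # Log the entropy of each decision so the policy's randomness can be tracked over training.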
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)
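        # Remember the outputs of the action that was just taken; they are matched
        # with the next observation when the following BrainInfo arrives.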
        for agent_id in previous_action.agents:
            self.last_take_action_outputs[agent_id] = take_action_outputs
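        # The rewards in curr_info correspond to the actions stored on the previous call.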
        tmp_environment_reward = curr_info.rewards
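        # Build an AgentExperience for every agent present in curr_info.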
        for agent_idx, agent_id in enumerate(curr_info.agents):
            stored_info = self.last_brain_info.get(agent_id, None)
            stored_take_action_outputs = self.last_take_action_outputs.get(
                agent_id, None
            )
            if stored_info is not None and stored_take_action_outputs is not None:
                prev_idx = stored_info.agents.index(agent_id)
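                # The stored BrainInfo holds the observation s_t that the stored action a_t was taken from.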
                obs = []
                if not stored_info.local_done[prev_idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        obs.append(stored_info.visual_observations[i][prev_idx])
                    obs.append(stored_info.vector_observations[prev_idx])
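                    # The done flag, max_step flag, and reward of the current step close out the (s_t, a_t) transition.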
                    done = curr_info.local_done[agent_idx]
                    max_step = curr_info.max_reached[agent_idx]

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][prev_idx]
                    action_pre = stored_take_action_outputs["pre_action"][prev_idx]
                    action_probs = stored_take_action_outputs["log_probs"][prev_idx]
                    action_masks = stored_info.action_masks[prev_idx]
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([agent_id])[0, :]
                    else:
                        memory = None
                    prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=tmp_environment_reward[agent_idx],
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_masks,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[agent_id].append(experience)
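                    # Track the cumulative reward of the ongoing episode.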
                    self.episode_rewards[agent_id] += tmp_environment_reward[agent_idx]
                if (
                    curr_info.local_done[agent_idx]
                    or (
                        len(self.experience_buffers[agent_id])
                        >= self.max_trajectory_length
                    )
                ) and len(self.experience_buffers[agent_id]) > 0:
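                    # The episode ended or the trajectory hit its maximum length: assemble and publish it.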
                    next_obs = []
                    for i, _ in enumerate(curr_info.visual_observations):
                        next_obs.append(curr_info.visual_observations[i][agent_idx])
                    next_obs.append(curr_info.vector_observations[agent_idx])
                    trajectory = Trajectory(
                        steps=self.experience_buffers[agent_id],
                        agent_id=agent_id,
                        next_obs=next_obs,
                        behavior_id=self.behavior_id,
                    )
                    for traj_queue in self.trajectory_queues:
                        traj_queue.put(trajectory)
                    self.experience_buffers[agent_id] = []
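                    # On episode end, report the cumulative reward and reset the per-episode counters.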
                    if curr_info.local_done[agent_idx]:
                        self.stats_reporter.add_stat(
                            "Environment/Cumulative Reward",
                            self.episode_rewards.get(agent_id, 0),
                        )
                        del self.episode_steps[agent_id]
                        del self.episode_rewards[agent_id]
                elif not curr_info.local_done[agent_idx]:
                    self.episode_steps[agent_id] += 1
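            # The current BrainInfo becomes the stored info for this agent's next step.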
            self.last_brain_info[agent_id] = curr_info
        self.policy.save_previous_action(
            previous_action.agents, take_action_outputs["action"]
        )

    def publish_trajectory_queue(