# Per-step value and baseline estimates for the trajectory, plus bootstrap
# estimates for the step after it and a flag for whether the agent died.
(
    baseline_estimates,
    value_estimates,
    value_next,
    baseline_next,
    died,
) = self.optimizer.get_trajectory_value_estimates(
    agent_buffer_trajectory,
    trajectory.next_obs,
    trajectory.done_reached and not trajectory.interrupted,  # assumed done flag
)

# Lambda-return targets for the Q-estimates, the baseline, and the value estimates.
returns_q, returns_b, returns_v = get_team_returns(
    rewards=local_rewards,
    q_estimates=q_estimates,
    baseline_estimates=baseline_estimates[name],
    v_estimates=value_estimates[name],
    value_next=value_next[name],
    baseline_next=baseline_next[name],
    died=died,
    gamma=self.optimizer.reward_signals[name].gamma,
    lambd=self.hyperparameters.lambd,
)
#local_advantage = get_team_gae(
#    value_next=value_next[name],
#    gamma=self.optimizer.reward_signals[name].gamma,
#    lambd=self.hyperparameters.lambd,
#)

    return discounted_r


def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0, baseline=False, died=False):
    returns = np.zeros_like(r)
    if value_next == 0.0:
        # No bootstrap value: seed the backup from the final reward if the
        # agent died, otherwise from the last value estimate.
        if died:
            returns[-1] = r[-1]
        else:
            returns[-1] = value_estimates[-1]
    else:
        returns[-1] = r[-1] + gamma * value_next
    # Backward TD(lambda) recursion (standard form, assumed here).
    for t in reversed(range(0, r.size - 1)):
        returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1]
    return returns

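# Example usage (a sketch with made-up numbers; assumes numpy is imported as
# np at module level): a three-step trajectory whose agent dies on the last
# step, so the backup is seeded from the final reward rather than a bootstrap.
#
#   r = np.array([0.0, 0.0, 1.0], dtype=np.float32)
#   v = np.array([0.4, 0.5, 0.6], dtype=np.float32)
#   ret = lambd_return(r, v, gamma=0.99, lambd=0.8, died=True)
#   # ret[-1] == 1.0; earlier entries mix ret[t + 1] with v[t + 1].
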
def get_team_returns(
    rewards,
    q_estimates,
    baseline_estimates,
    v_estimates,
    value_next=0.0,
    baseline_next=0.0,
    died=False,
    gamma=0.99,
    lambd=0.8,
):
    """Compute lambda-returns for the Q-estimates, the baseline, and the value estimates."""
    rewards = np.array(rewards)
    returns_q = lambd_return(rewards, q_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
    returns_b = lambd_return(
        rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=baseline_next, baseline=True, died=died
    )
    returns_v = lambd_return(rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
    return returns_q, returns_b, returns_v
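

# Example usage (a sketch with made-up numbers; assumes numpy is imported as
# np and that q_estimates / baseline_estimates / v_estimates come from the
# optimizer as per-step arrays of the same length as the rewards):
#
#   rewards = [0.0, 0.1, 1.0]
#   q_est = np.array([0.3, 0.4, 0.5], dtype=np.float32)
#   b_est = np.array([0.2, 0.3, 0.4], dtype=np.float32)
#   v_est = np.array([0.3, 0.4, 0.5], dtype=np.float32)
#   rq, rb, rv = get_team_returns(
#       rewards, q_est, b_est, v_est, value_next=0.25, baseline_next=0.2, died=False
#   )
#   # rq / rv bootstrap their final entry from value_next; rb bootstraps from
#   # baseline_next.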