|
|
|
|
|
|
# f"{name}_marginalized_value_estimates_next" |
|
|
|
#].get_batch() |
|
|
|
|
|
|
|
# Compute team lambda-returns from the locally evaluated rewards and the
# baseline estimates.
returns_v, returns_b = get_team_returns(
    rewards=local_rewards,
    baseline_estimates=baseline_estimates,
    lambd=self.hyperparameters.lambd,
)
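# get_team_returns is defined elsewhere; the helper below is only an
# illustrative sketch (assumed name and signature) of how a TD(lambda)
# return can be built from per-step rewards and value estimates. numpy is
# already imported as np in this module.
def _lambda_return_sketch(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.8):
    # Walk backwards from the bootstrapped final step, blending the one-step
    # TD target with the recursively accumulated return according to lambd.
    returns = np.zeros_like(rewards, dtype=np.float32)
    returns[-1] = rewards[-1] + gamma * value_next
    for t in reversed(range(len(rewards) - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + rewards[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns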
|
|
|
#print("loc", local_rewards[-1]) |
|
|
|
#print("tdlam", returns_v) |
|
|
|
|
|
|
|
# Alternative (currently disabled): GAE-based team advantage; an illustrative
# sketch follows below.
# local_advantage = get_team_gae(
#     rewards=local_rewards,
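# The body of get_team_gae is not shown in this file; the sketch below is an
# assumption (hypothetical name and signature) of a standard GAE recursion
# over per-step rewards and value estimates, for illustration only.
def _gae_sketch(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        # Bootstrap with value_next at the final step, otherwise use the
        # next step's value estimate.
        next_value = value_next if t == len(rewards) - 1 else value_estimates[t + 1]
        delta = rewards[t] + gamma * next_value - value_estimates[t]
        last_gae = delta + gamma * lambd * last_gae
        advantages[t] = last_gae
    return advantages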
# TD(lambda) advantage: lambda-returns minus the baseline estimates.
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
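# Illustrative numbers (not from a real run): for a 3-step trajectory with
# returns_v = [1.0, 0.6, 0.2] and baseline_estimates = [0.7, 0.7, 0.3],
# local_advantage = [0.3, -0.1, -0.1]; a positive entry means the return
# exceeded the baseline estimate at that step.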
|
|
|
# self._stats_reporter.add_stat(
#     f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} GAE Advantage Estimate",
#     np.mean(gae_advantage),
# )
self._stats_reporter.add_stat(
    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
    np.mean(local_advantage),
)
|
|
|
# local_return = local_advantage + q_estimates
# This is later used as the target for the different value estimates:
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
|
|
|