|
|
|
|
|
|
    gamma=self.optimizer.reward_signals[name].gamma,
    lambd=self.hyperparameters.lambd,
)
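# Local advantage: the value estimates minus the marginalized value estimates.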
local_advantage = np.array(local_value_estimates) - np.array(
    m_value_estimates
)
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
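# Store both lambda-returns (returns_q, returns_b) and the advantage for this reward signal.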
agent_buffer_trajectory[f"{name}_returns_q"].set(returns_q) |
|
|
|
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_b) |
|
|
|
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage) |
|
|
|
|
|
|
    int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)

# Normalize advantages
advantages = np.array(self.update_buffer["advantages"].get_batch())
self.update_buffer["advantages"].set(
    list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
)
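# For example, advantages [1.0, 2.0, 3.0] have mean 2.0 and std ~0.8165,
# so they normalize to roughly [-1.22, 0.0, 1.22].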

num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):

discounted_r[t] = running_add
return discounted_r
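
# A minimal, self-contained sketch of the discounted-return helper that the two
# lines above appear to belong to. The function name, signature, and loop
# structure here are assumptions made for illustration, not taken from this code.
import numpy as np


def discount_rewards_sketch(r, gamma=0.99, value_next=0.0):
    """Discounted sum of future rewards, bootstrapped from value_next."""
    discounted_r = np.zeros_like(r)
    running_add = value_next
    # Walk the trajectory backwards, accumulating gamma-discounted rewards.
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r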


returns[t] = gamma * lambd * returns[t + 1] + (1 - lambd) * (
    r[t] + gamma * next_value_estimates[t]
)
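
# A self-contained sketch of a lambda-return helper built around the recurrence
# above. The name, signature, and terminal-step seeding are assumptions made for
# illustration; only the interior update mirrors the line shown above.
import numpy as np


def lambd_return_sketch(r, next_value_estimates, gamma=0.99, lambd=0.95):
    returns = np.zeros_like(r)
    # Assumed terminal step: bootstrap from the final next-state value estimate.
    returns[-1] = r[-1] + gamma * next_value_estimates[-1]
    # Sweep backwards, mixing the one-step target with the continuing return.
    for t in reversed(range(0, r.size - 1)):
        returns[t] = gamma * lambd * returns[t + 1] + (1 - lambd) * (
            r[t] + gamma * next_value_estimates[t]
        )
    return returns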


def get_gae(
    rewards,
    next_value_estimates,
    next_marginalized_value_estimates,
    gamma=0.99,
    lambd=0.95,
):
    """
    Compute lambda-returns against both sets of next-state value estimates:
    returns_q from next_value_estimates and returns_b from
    next_marginalized_value_estimates.
    """
    rewards = np.array(rewards)
    returns_q = lambd_return(rewards, next_value_estimates, gamma=gamma, lambd=lambd)
    returns_b = lambd_return(
        rewards, next_marginalized_value_estimates, gamma=gamma, lambd=lambd
    )
    return returns_q, returns_b