
remove normalize advantages

/develop/coma-noact
Andrew Cohen, 4 years ago
Commit 79c658d2
1 file changed, 16 insertions and 8 deletions
ml-agents/mlagents/trainers/ppo/trainer.py (+16 −8)

     gamma=self.optimizer.reward_signals[name].gamma,
     lambd=self.hyperparameters.lambd,
 )
-local_advantage = np.array(local_value_estimates) - np.array(m_value_estimates)
+local_advantage = np.array(local_value_estimates) - np.array(
+    m_value_estimates
+)
-#agent_buffer_trajectory[f"{name}_returns"].set(local_return)
+# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
 agent_buffer_trajectory[f"{name}_returns_q"].set(returns_q)
 agent_buffer_trajectory[f"{name}_returns_b"].set(returns_b)
 agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)

     int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
 )
-# Normalize advantages
-advantages = np.array(self.update_buffer["advantages"].get_batch())
-self.update_buffer["advantages"].set(
-    list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
-)
+# advantages = np.array(self.update_buffer["advantages"].get_batch())
+# self.update_buffer["advantages"].set(
+#     list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
+# )
 num_epoch = self.hyperparameters.num_epoch
 batch_update_stats = defaultdict(list)
 for _ in range(num_epoch):
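The disabled block is plain advantage normalization: shift the buffered advantages to zero mean and rescale to unit standard deviation (with a small epsilon for numerical stability) before the epoch loop. A standalone sketch of what it did; a plain dict stands in here for the trainer's buffer, which uses get_batch()/set() accessors as the commented-out lines show:

import numpy as np

# Sketch of the normalization this commit disables (toy buffer contents).
update_buffer = {"advantages": [0.2, -0.1, 0.15, 0.4]}

advantages = np.array(update_buffer["advantages"])
# Zero-mean, unit-std rescaling; 1e-10 guards against a zero std.
update_buffer["advantages"] = list(
    (advantages - advantages.mean()) / (advantages.std() + 1e-10)
)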

     discounted_r[t] = running_add
 return discounted_r
-returns[t] = gamma * lambd * returns[t+1] + (1 - lambd) * (r[t] + gamma * next_value_estimates[t])
+returns[t] = gamma * lambd * returns[t + 1] + (1 - lambd) * (
+    r[t] + gamma * next_value_estimates[t]
+)
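The reformatted line is the interior step of a backward TD(λ) recursion. The hunk doesn't show the enclosing function, so the following is only a sketch of what a full lambd_return could look like, under the assumption that the terminal step falls back to a plain one-step bootstrap:

import numpy as np

def lambd_return(r, next_value_estimates, gamma=0.99, lambd=0.95):
    # A sketch, not the branch's actual implementation.
    returns = np.zeros_like(r, dtype=np.float64)
    # Terminal step: one-step bootstrap (assumption; the diff only
    # shows the interior recursion).
    returns[-1] = r[-1] + gamma * next_value_estimates[-1]
    for t in reversed(range(r.size - 1)):
        # Interior steps: exactly the recursion in the hunk above.
        returns[t] = gamma * lambd * returns[t + 1] + (1 - lambd) * (
            r[t] + gamma * next_value_estimates[t]
        )
    return returns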
 def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
     """
 
     """
 rewards = np.array(rewards)
 returns_q = lambd_return(rewards, next_value_estimates, gamma=gamma, lambd=lambd)
-returns_b = lambd_return(rewards, next_marginalized_value_estimates, gamma=gamma, lambd=lambd)
+returns_b = lambd_return(
+    rewards, next_marginalized_value_estimates, gamma=gamma, lambd=lambd
+)
 return returns_q, returns_b
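Reusing the lambd_return sketch above, the call pattern in this hunk computes two λ-return targets from the same reward stream: returns_q bootstrapped from the local value estimates and returns_b from the marginalized baseline estimates. Only the call signature mirrors the diff; the arrays are toy data:

rewards = np.array([0.0, 0.0, 1.0])
next_value_estimates = np.array([0.9, 0.95, 0.0])
next_marginalized_value_estimates = np.array([0.7, 0.8, 0.0])

returns_q = lambd_return(rewards, next_value_estimates, gamma=0.99, lambd=0.95)
returns_b = lambd_return(
    rewards, next_marginalized_value_estimates, gamma=0.99, lambd=0.95
)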