
Use GAE again

/develop/coma2/singlenetwork
Ervin Teng, 4 years ago
Current commit
3aefac39
1 changed file with 45 additions and 41 deletions
ml-agents/mlagents/trainers/ppo/trainer.py (86 changes)

     trajectory.next_obs,
     trajectory.next_collab_obs,
     trajectory.done_reached and not trajectory.interrupted,
-    trajectory.teammate_dones_reached and trajectory.done_reached and not trajectory.interrupted,
+    trajectory.teammate_dones_reached
+    and trajectory.done_reached
+    and not trajectory.interrupted,
 )
 for name, v in value_estimates.items():

     baseline_estimates = agent_buffer_trajectory[
         f"{name}_baseline_estimates"
     ].get_batch()
-    v_estimates = agent_buffer_trajectory[f"{name}_value_estimates"].get_batch()
+    v_estimates = agent_buffer_trajectory[
+        f"{name}_value_estimates"
+    ].get_batch()
-    #next_value_estimates = agent_buffer_trajectory[
-    #].get_batch()
-    #next_m_value_estimates = agent_buffer_trajectory[
-    #].get_batch()
+    # next_value_estimates = agent_buffer_trajectory[
+    # ].get_batch()
+    # next_m_value_estimates = agent_buffer_trajectory[
+    # ].get_batch()
+    # returns_v, returns_b = get_team_returns(
+    #     rewards=local_rewards,
+    #     baseline_estimates=baseline_estimates,
+    #     v_estimates=v_estimates,
+    #     value_next=value_next[name],
+    #     gamma=self.optimizer.reward_signals[name].gamma,
+    #     lambd=self.hyperparameters.lambd,
+    # )
+    # print("loc", local_rewards[-1])
+    # print("tdlam", returns_v)
-    returns_v, returns_b = get_team_returns(
-        baseline_estimates=baseline_estimates,
-        v_estimates=v_estimates,
+    local_advantage = get_team_gae(
+        value_estimates=v_estimates,
+        baseline=baseline_estimates,
-    #print("loc", local_rewards[-1])
-    #print("tdlam", returns_v)
-    #local_advantage = get_team_gae(
-    #    rewards=local_rewards,
-    #    value_estimates=v_estimates,
-    #    baseline=baseline_estimates,
-    #    value_next=value_next[name],
-    #    gamma=self.optimizer.reward_signals[name].gamma,
-    #    lambd=self.hyperparameters.lambd,
-    #)
-    self._stats_reporter.add_stat(
-        f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
-        np.mean(returns_v),
-    )
+    # self._stats_reporter.add_stat(
+    #     f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
+    #     np.mean(returns_v),
+    # )
-    #local_advantage = np.array(returns_v) - baseline_estimates
-    local_advantage = np.array(returns_v) - np.array(baseline_estimates)
-    #self._stats_reporter.add_stat(
-    #)
+    # local_advantage = np.array(returns_v) - baseline_estimates
+    # local_advantage = np.array(returns_v) - np.array(baseline_estimates)
+    # self._stats_reporter.add_stat(
+    # )
     self._stats_reporter.add_stat(
         f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
         np.mean(local_advantage),
     )
     local_return = local_advantage + baseline_estimates
-    #local_return = local_advantage + q_estimates
+    # local_return = local_advantage + q_estimates
-    agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
-    agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
+    agent_buffer_trajectory[f"{name}_returns_b"].set(local_return)
+    agent_buffer_trajectory[f"{name}_returns_v"].set(local_return)
     agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
     tmp_advantages.append(local_advantage)
     tmp_returns.append(local_return)

 )
 global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
 agent_buffer_trajectory["advantages"].set(global_advantages)
 agent_buffer_trajectory["discounted_returns"].set(global_returns)
 # Append to update buffer
 agent_buffer_trajectory.resequence_and_append(

 n_sequences = max(
     int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
 )
-#Normalize advantages
+# Normalize advantages
 list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
 )
 num_epoch = self.hyperparameters.num_epoch
 batch_update_stats = defaultdict(list)

 returns = np.zeros_like(r)
 returns[-1] = r[-1] + gamma * value_next
 for t in reversed(range(0, r.size - 1)):
-    returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1]
+    returns[t] = (
+        gamma * lambd * returns[t + 1]
+        + r[t]
+        + (1 - lambd) * gamma * value_estimates[t + 1]
+    )
-def get_team_gae(rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_team_gae(
+    rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
+):
     """
     Computes generalized advantage estimate for use in updating policy.
     :param rewards: list of rewards for time-steps t to T.
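
Note: the recursion shown near the bottom of this diff, returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1], is the standard TD(lambda) return. The sketch below wraps exactly that recursion in a runnable helper; the function name lambd_return and the example numbers are illustrative assumptions, not taken from this branch.

import numpy as np

def lambd_return(r, value_estimates, gamma=0.99, lambd=0.95, value_next=0.0):
    # Hypothetical helper name; the loop body matches the recursion shown in the diff.
    # r[t] is the reward at step t, value_estimates[t] the critic value for the same step.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        # Blend the recursive lambda-return with the one-step bootstrapped target.
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

# Tiny usage example with made-up numbers.
rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
values = np.array([0.4, 0.6, 0.9], dtype=np.float32)
print(lambd_return(rewards, values, gamma=0.99, lambd=0.95, value_next=0.0))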

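Note: the core change in this commit is computing local_advantage with get_team_gae(rewards, value_estimates, baseline, ...) instead of get_team_returns, then writing local_return = local_advantage + baseline_estimates back to the buffer as {name}_returns_b and {name}_returns_v. The body of get_team_gae is not visible here beyond its signature and the start of its docstring; the sketch below is one plausible reading of the call site, modeled on the usual GAE recurrence with the per-agent baseline subtracted in place of V(s_t). Both function bodies are assumptions for illustration, not the repository's implementation.

import numpy as np

def discount_rewards(r, gamma=0.99, value_next=0.0):
    # Assumed helper: discounted suffix sums of r, bootstrapped with value_next.
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

def get_team_gae_sketch(
    rewards, value_estimates, baseline, value_next=0.0, gamma=0.99, lambd=0.95
):
    # Assumed body: TD errors bootstrapped from the centralized value estimates,
    # with the counterfactual baseline subtracted instead of the value at step t.
    value_estimates = np.append(value_estimates, value_next)
    delta_t = rewards + gamma * value_estimates[1:] - baseline
    advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
    return advantage

Under that reading, local_return = local_advantage + baseline_estimates is the matching return target for the baseline head, which is what the diff now stores in place of returns_v.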
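
Note: after the per-reward-signal loop, the trainer averages the per-signal advantages and returns elementwise into global arrays, and at update time normalizes advantages as (advantages - mean) / (std + 1e-10), as shown in the diff. A small self-contained illustration of those two steps, with hypothetical numbers:

import numpy as np

# Hypothetical per-signal arrays for a 4-step trajectory (e.g. extrinsic and curiosity).
tmp_advantages = [np.array([0.2, -0.1, 0.4, 0.0]), np.array([0.1, 0.0, 0.2, -0.2])]
tmp_returns = [np.array([1.0, 0.8, 1.2, 0.5]), np.array([0.3, 0.2, 0.4, 0.1])]

# Elementwise mean across reward signals, matching the global_returns line in the diff.
global_advantages = list(np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0))
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))

# Advantage normalization applied before the PPO update.
advantages = np.array(global_advantages, dtype=np.float32)
normalized = list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
print(global_returns, normalized)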