
Integrate teammate dones

/develop/coma-withq
Andrew Cohen, 4 years ago
Current commit 95253b47
2 files changed, 13 insertions and 49 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (48 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)


        next_obs: List[np.ndarray],
        next_critic_obs: List[List[np.ndarray]],
        done: bool,
        all_dones: bool,
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        n_obs = len(self.policy.behavior_spec.sensor_specs)

        # )
        # These aren't used in COMAttention
        for name, estimate in q_estimates.items():
            q_estimates[name] = ModelUtils.to_numpy(estimate)
        for name, estimate in baseline_estimates.items():
            baseline_estimates[name] = ModelUtils.to_numpy(estimate)

        for name, estimate in boot_value_estimates.items():
            boot_value_estimates[name] = ModelUtils.to_numpy(estimate)
        died = False
        if done:
            if all_dones:
                died = True
            if len(next_critic_obs) == 0:
                # No next critic observations to bootstrap from: zero the
                # bootstrap value for every reward signal.
                for k in boot_value_estimates:
                    boot_value_estimates[k][-1] = 0.0
            # else:
            #     print(len(next_critic_obs))
            #     print(baseline_estimates)

        return (
            q_estimates,
            baseline_estimates,
            died,
        )
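
Condensed from the hunk above, the new teammate-done handling can be sketched as a standalone helper. This is a minimal illustration, not the actual ml-agents method: the function name integrate_teammate_dones and the has_next_critic_obs flag are assumptions standing in for the surrounding optimizer code.

```python
import numpy as np
from typing import Dict


def integrate_teammate_dones(
    boot_value_estimates: Dict[str, np.ndarray],
    done: bool,
    all_dones: bool,
    has_next_critic_obs: bool,
) -> bool:
    """Hypothetical sketch of the logic in the hunk above: the trajectory
    counts as 'died' only when this agent and all of its teammates are done,
    and the bootstrap value is zeroed when no next critic observation is
    available."""
    died = False
    if done:
        if all_dones:
            died = True
        if not has_next_critic_obs:
            for name in boot_value_estimates:
                # Whole team terminated: there is no future value to bootstrap.
                boot_value_estimates[name][-1] = 0.0
    return died
```

Downstream, the died flag lets the trainer decide whether to bootstrap the lambda returns from the critic's next-step value or to treat the trajectory as fully terminal.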

ml-agents/mlagents/trainers/ppo/trainer.py (48 changes)


        # Get all value estimates
        (
            q_estimates,
            value_estimates,
            died,
        ) = ...(
            trajectory.teammate_dones_reached and trajectory.done_reached and not trajectory.interrupted,
        )
        for name, v in q_estimates.items():
            agent_buffer_trajectory[f"{name}_q_estimates"].extend(v)
        for name, v in value_estimates.items():
            agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Q Estimate",
                np.mean(q_estimates[name]),
            )
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
                np.mean(value_estimates[name]),
            )
        # for name, v in value_next.items():
        #     agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
        #     agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
        #         marg_value_next[name]
        #     )

        # Evaluate all reward functions
        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory["environment_rewards"]
        )

        for name in self.optimizer.reward_signals:
            local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
            q_estimates = agent_buffer_trajectory[
                f"{name}_q_estimates"
            ].get_batch()
            baseline_estimates = agent_buffer_trajectory[
                f"{name}_baseline_estimates"
            ].get_batch()

            # print(local_rewards[-1])
            # print(died)
            # print(value_next[name])
            returns_q, returns_b, returns_v = get_team_returns(
                q_estimates=q_estimates,
                died=died,
                gamma=self.optimizer.reward_signals[name].gamma,
                lambd=self.hyperparameters.lambd,
            )

            # local_advantage = np.array(q_estimates) - np.array(
            local_advantage = np.array(returns_v) - np.array(
                baseline_estimates
            )

            local_return = local_advantage + baseline_estimates
            # This is later used as the target for the different value estimates
            # agent_buffer_trajectory[f"{name}_returns"].set(local_return)
            agent_buffer_trajectory[f"{name}_returns_q"].set(returns_v)
            agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
            agent_buffer_trajectory[f"{name}_returns_v"].set(returns_v)
            agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)

    return discounted_r


def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0, baseline=False, died=False):
    returns = np.zeros_like(r)
    if baseline:
        # this is incorrect
        if died:
            returns[-1] = r[-1]
        else:
            returns[-1] = value_estimates[-1]
    else:
        returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = gamma * lambd * returns[t + 1] + r[t] + (1 - lambd) * gamma * value_estimates[t + 1]
    return returns

def get_team_returns(
    rewards,
    q_estimates,
    baseline_estimates,
    v_estimates,
    value_next=0.0,
    died=False,
    gamma=0.99,
    lambd=0.8,
):
    """
    :return: list of advantage estimates for time-steps t to T.
    """
    rewards = np.array(rewards)
    returns_q = lambd_return(rewards, q_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
    returns_b = lambd_return(
        rewards, baseline_estimates, gamma=gamma, lambd=lambd, baseline=True, died=died
    )
    returns_v = lambd_return(rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
    return returns_q, returns_b, returns_v
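
To make the return computation concrete, here is a self-contained usage sketch of the lambd_return recursion and the advantage it feeds. The toy rewards, values, and baseline below are illustrative numbers only, and the simplified signature omits the baseline/died flags from the hunk above.

```python
import numpy as np


def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Same recursion as in the trainer hunk: blend the one-step bootstrapped
    # target with the discounted lambda-return of the next step.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns


rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
values = np.array([0.5, 0.6, 0.7], dtype=np.float32)
baseline = np.array([0.4, 0.5, 0.6], dtype=np.float32)

# Interrupted trajectory: bootstrap the tail from the critic's next-step value.
print(lambd_return(rewards, values, value_next=0.7))
# Whole team done ("died"): nothing left to bootstrap from.
returns_v = lambd_return(rewards, values, value_next=0.0)
print(returns_v)
# Advantage as in the trainer hunk: lambda return minus the counterfactual baseline.
print(returns_v - baseline)
```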