
try again on cloud

/develop/coma-withq
Andrew Cohen, 4 years ago
Current commit: 687f411b
4 files changed, 24 insertions(+), 15 deletions(-)
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (9 changed lines)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (12 changed lines)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (14 changed lines)
  4. ml-agents/mlagents/trainers/torch/networks.py (4 changed lines)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (9 changed lines)


# print(baseline_estimates)
# print(value_estimates)
# print(boot_value_baseline[k][-1])
# if done and not all_dones:
#     print("agent finished but team going")
# elif all_dones:
#     print("alldone")
# else:
#     print("neither")
# print("final", boot_value_estimates)
# print("value", value_estimates)
# print("base", baseline_estimates)
return (
    value_estimates,
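The commented-out prints above trace the three end-of-trajectory cases that matter when bootstrapping team value estimates: this agent finished while the team keeps going, everyone finished, or neither. As an illustration only (the actual handling lives outside the lines shown here, so the return values below are an assumption, not taken from the commit), the bootstrap target usually differs across those cases:

def bootstrap_target(last_value_estimate: float, done: bool, all_dones: bool) -> float:
    # Hypothetical sketch of the branch the debug prints walk through.
    if done and not all_dones:
        # "agent finished but team going": teammates are still acting, so the
        # group value estimate can still serve as a bootstrap.
        return last_value_estimate
    elif all_dones:
        # "alldone": the whole group terminated; there is nothing to bootstrap from.
        return 0.0
    else:
        # "neither": the trajectory was cut off mid-episode; bootstrap as usual.
        return last_value_estimate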

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (12 changed lines)


 for name, head in values.items():
     old_val_tensor = old_values[name]
     returns_tensor = returns[name]
-    clipped_value_estimate = old_val_tensor + torch.clamp(
-        head - old_val_tensor, -1 * epsilon, epsilon
-    )
+    # clipped_value_estimate = old_val_tensor + torch.clamp(
+    #     head - old_val_tensor, -1 * epsilon, epsilon
+    # )
-    v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
-    value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
-    # value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
+    # v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
+    # value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
+    value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
     value_losses.append(value_loss)
 value_loss = torch.mean(torch.stack(value_losses))
 return value_loss
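The net effect of this hunk is to drop the PPO-style clipped value loss and train on the plain squared error only, assuming v_opt_a is the unclipped squared error (returns_tensor - head) ** 2 computed just above the shown lines. A small standalone sketch of the two variants, with a simple masked_mean stand-in for ModelUtils.masked_mean:

import torch


def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    # Stand-in for ModelUtils.masked_mean: average only over unmasked steps
    # (assumes 1-D tensors for simplicity).
    masks = masks.float()
    return (tensor * masks).sum() / torch.clamp(masks.sum(), min=1.0)


def value_loss_unclipped(head, returns_tensor, loss_masks):
    # What the hunk leaves active: plain squared error on the value head.
    v_opt_a = (returns_tensor - head) ** 2
    return masked_mean(v_opt_a, loss_masks)


def value_loss_clipped(head, old_val_tensor, returns_tensor, loss_masks, epsilon=0.2):
    # The variant the commit comments out: clip how far the new value estimate
    # may move from the old one, then take the worse (larger) of the two errors.
    clipped_value_estimate = old_val_tensor + torch.clamp(
        head - old_val_tensor, -1 * epsilon, epsilon
    )
    v_opt_a = (returns_tensor - head) ** 2
    v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
    return masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)


# Example with dummy data:
head = torch.randn(8)
old_val_tensor = head + 0.1 * torch.randn(8)
returns_tensor = torch.randn(8)
loss_masks = torch.ones(8)
print(value_loss_unclipped(head, returns_tensor, loss_masks))
print(value_loss_clipped(head, old_val_tensor, returns_tensor, loss_masks))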

ml-agents/mlagents/trainers/ppo/trainer.py (14 changed lines)


# f"{name}_marginalized_value_estimates_next"
#].get_batch()
#print(local_rewards[-1])
#print(died)
#print(value_next[name])
returns_v, returns_b = get_team_returns(
rewards=local_rewards,
baseline_estimates=baseline_estimates,

lambd=self.hyperparameters.lambd,
)
#print("loc", local_rewards[-1])
#print("tdlam", returns_v)
#local_advantage = get_team_gae(
# rewards=local_rewards,

local_advantage = np.array(returns_v) - np.array(
baseline_estimates
)
#local_advantage = np.array(returns_v) - baseline_estimates
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
#self._stats_reporter.add_stat(
# f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} GAE Advantage Estimate",
# np.mean(gae_advantage),

f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
np.mean(local_advantage),
)
#local_return = local_advantage + q_estimates
#local_return = local_advantage + q_estimates
# This is later use as target for the different value estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_returns_b"].set(returns_v)
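In this hunk the quantity logged as the "TD Advantage Estimate" is the team lambda-return minus the per-agent baseline, i.e. local_advantage = np.array(returns_v) - np.array(baseline_estimates). A self-contained sketch of that computation; lambda_returns below is a generic TD(lambda) stand-in, not the actual get_team_returns, whose full argument list is elided above:

import numpy as np


def lambda_returns(rewards, value_estimates, bootstrap_value, gamma=0.99, lambd=0.95):
    # Generic TD(lambda) returns:
    # G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1})
    rewards = np.asarray(rewards, dtype=np.float64)
    value_estimates = np.asarray(value_estimates, dtype=np.float64)
    returns = np.zeros_like(rewards)
    next_return = bootstrap_value   # G_T, bootstrapped from the final value estimate
    next_value = bootstrap_value    # V(s_T)
    for t in reversed(range(len(rewards))):
        returns[t] = rewards[t] + gamma * ((1.0 - lambd) * next_value + lambd * next_return)
        next_return = returns[t]
        next_value = value_estimates[t]
    return returns


# Example with dummy per-step data:
rewards = [0.0, 0.0, 1.0]
value_estimates = [0.3, 0.5, 0.8]       # V(s_t) per step
baseline_estimates = [0.2, 0.4, 0.7]    # counterfactual baseline per step
returns_v = lambda_returns(rewards, value_estimates, bootstrap_value=0.0)
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
print(returns_v, local_advantage)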

ml-agents/mlagents/trainers/torch/networks.py (4 changed lines)


     sequence_length=sequence_length,
 )
-value_outputs, critic_mem_out = self.critic.q_net(
+q_out, critic_mem_out = self.critic.q_net(
     all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
 )

     memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
 else:
     memories_out = None
-return value_outputs, baseline_outputs, memories_out
+return q_out, baseline_outputs, memories_out

 def get_stats_and_value(
     self,