
cloud run with coma2 of held out zombie test env

/develop/coma-noact
Andrew Cohen, 4 years ago
Commit 7f491ae7
4 files changed, 25 insertions and 20 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (6 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (23 changes)
  4. ml-agents/mlagents/trainers/torch/networks.py (2 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)


memory = torch.zeros([1, 1, self.policy.m_size])
- q_estimates, baseline_estimates, mem = self.policy.actor_critic.target_critic_pass(
+ q_estimates, baseline_estimates, mem = self.policy.actor_critic.critic_pass(
    current_obs,
    actions,
    memory,

value_estimates[name] = ModelUtils.to_numpy(estimate)
# the baseline and V should not use the same done flag
boot_value_baseline = {}
boot_value_baseline[name] = ModelUtils.to_numpy(estimate)
died = False
boot_value_baseline[k][-1] = 0.0
died = True
# else:
#     print(len(next_critic_obs))
#     print(baseline_estimates)
#     print(value_estimates)
#     print(boot_value_baseline[k][-1])
return (
    q_estimates,

    boot_value_baseline,
    died
)
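
For context, the hunk above keeps a separate bootstrap value for the baseline and zeroes it when the agent itself died, independently of the done flag used for V. A minimal, self-contained sketch of that idea (the function name and the agent_died flag are illustrative, not the ml-agents code):

    import numpy as np

    # Illustrative only: zero the baseline's bootstrap value when the agent died,
    # so the baseline and V are not bootstrapped off the same done flag.
    def zero_baseline_bootstrap(baseline_estimates, agent_died):
        boot_value_baseline = {
            k: np.asarray(v, dtype=np.float32).copy() for k, v in baseline_estimates.items()
        }
        died = False
        if agent_died:
            for k in boot_value_baseline:
                boot_value_baseline[k][-1] = 0.0  # a dead agent's baseline is not bootstrapped
            died = True
        return boot_value_baseline, died

    # Example: the extrinsic baseline keeps its last estimate only if the agent survived.
    boot, died = zero_baseline_bootstrap({"extrinsic": [0.4, 0.6, 0.9]}, agent_died=True)
    print(boot["extrinsic"], died)  # [0.4 0.6 0. ] True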

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (6 changes)


self.optimizer.step()
- ModelUtils.soft_update(
-     self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
- )
+ # ModelUtils.soft_update(
+ #     self.policy.actor_critic.critic, self.policy.actor_critic.target, 1.0
+ # )
update_stats = {
    # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
    # TODO: After PyTorch is default, change to something more correct.
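
The ModelUtils.soft_update(critic, target, 1.0) call touched above blends the online critic's parameters into the target network; with tau = 1.0 it amounts to a hard copy. A minimal sketch of a Polyak-style update of this kind (illustrative, not the ml-agents helper itself):

    import torch
    import torch.nn as nn

    # Polyak ("soft") update: target <- tau * source + (1 - tau) * target.
    # With tau = 1.0 this is a hard copy of the online network into the target.
    def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
        with torch.no_grad():
            for src, tgt in zip(source.parameters(), target.parameters()):
                tgt.data.mul_(1.0 - tau)
                tgt.data.add_(tau * src.data)

    critic = nn.Linear(4, 1)
    target = nn.Linear(4, 1)
    soft_update(critic, target, tau=1.0)  # target now equals critic exactly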

ml-agents/mlagents/trainers/ppo/trainer.py (23 changes)


    baseline_estimates,
    value_estimates,
    value_next,
    baseline_next,
    died,
) = self.optimizer.get_trajectory_value_estimates(
    agent_buffer_trajectory,
    trajectory.next_obs,

#     f"{name}_marginalized_value_estimates_next"
# ].get_batch()
# print(local_rewards[-1])
# print(died)
# print(value_next[name])
returns_q, returns_b, returns_v = get_team_returns(
    rewards=local_rewards,
    q_estimates=q_estimates,

    baseline_next=baseline_next[name],
    died=died,
- # gae_advantage = get_team_gae(
+ # local_advantage = get_team_gae(
- #     value_next=bootstrap_value,
+ #     value_next=value_next[name],
  #     gamma=self.optimizer.reward_signals[name].gamma,
  #     lambd=self.hyperparameters.lambd,
  # )

return discounted_r

- def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0, baseline=False):
+ def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0, baseline=False, died=False):
- if value_next == 0.0:
+ if died:
    returns[-1] = r[-1]
else:
    returns[-1] = value_estimates[-1]

baseline_estimates,
v_estimates,
value_next=0.0,
baseline_next=0.0,
died=False,
gamma=0.99,
lambd=0.8,
):

rewards = np.array(rewards)
returns_q = lambd_return(rewards, q_estimates, gamma=gamma, lambd=lambd, value_next=value_next)
returns_b = lambd_return(
-     rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=baseline_next, baseline=True
+     rewards, baseline_estimates, gamma=gamma, lambd=lambd, baseline=True, died=died
# if rewards[-1] > 0:
#     print(returns_v)
#     print(rewards)
return returns_q, returns_b, returns_v
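
The fragments above only show the terminal handling of lambd_return; the recursion itself is elided in this hunk. Below is a self-contained sketch of a TD(lambda) return with the same baseline/died terminal cases. Only the last-step handling is taken from the diff; the backward recursion is the standard lambda-return form, and everything else is an assumption:

    import numpy as np

    def lambd_return_sketch(r, value_estimates, gamma=0.99, lambd=0.8,
                            value_next=0.0, baseline=False, died=False):
        returns = np.zeros_like(r, dtype=np.float32)
        if baseline:
            # Terminal handling shown in the diff: a dead agent's baseline return
            # is just the last reward, otherwise bootstrap from the last estimate.
            returns[-1] = r[-1] if died else value_estimates[-1]
        else:
            returns[-1] = r[-1] + gamma * value_next
        # Standard lambda-return recursion (assumed, not shown in the diff).
        for t in reversed(range(len(r) - 1)):
            returns[t] = r[t] + gamma * (
                lambd * returns[t + 1] + (1.0 - lambd) * value_estimates[t + 1]
            )
        return returns

    # Example: a 3-step trajectory where the agent dies on the last step.
    print(lambd_return_sketch(np.array([0.0, 0.0, 1.0]),
                              np.array([0.5, 0.7, 0.9]),
                              baseline=True, died=True))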

ml-agents/mlagents/trainers/torch/networks.py (2 changes)


    team_obs=team_obs,
    team_act=team_act,
)
- value_outputs, _ = self.critic_value(inputs, memories=critic_mem, sequence_length=sequence_length, team_obs=team_obs)
+ value_outputs, _ = self.target_critic_value(inputs, memories=critic_mem, sequence_length=sequence_length, team_obs=team_obs)
return log_probs, entropies, q_outputs, baseline_outputs, value_outputs
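
A target_critic_value like the one swapped in above is typically a frozen copy of the online value head that is refreshed separately, for example by the soft/hard update touched in optimizer_torch.py. A minimal sketch of that pattern, with hypothetical names rather than the actual ml-agents network classes:

    import copy
    import torch
    import torch.nn as nn

    class ValueWithTarget(nn.Module):
        """Illustrative value head plus a frozen target copy used for bootstrapping."""

        def __init__(self, encoding_size: int):
            super().__init__()
            self.critic_value = nn.Linear(encoding_size, 1)
            self.target_critic_value = copy.deepcopy(self.critic_value)
            for p in self.target_critic_value.parameters():
                p.requires_grad = False  # the target is only updated by copying

        def forward(self, inputs: torch.Tensor):
            value_outputs = self.critic_value(inputs)
            with torch.no_grad():
                target_value_outputs = self.target_critic_value(inputs)
            return value_outputs, target_value_outputs

    head = ValueWithTarget(8)
    online, target = head(torch.randn(2, 8))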
