
might be right

/develop/coma-noact
Andrew Cohen, 4 years ago
Current commit: 6e1826f8

6 files changed, 99 insertions(+), 51 deletions(-)
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (71 lines changed)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (5 lines changed)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 lines changed)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (5 lines changed)
  5. ml-agents/mlagents/trainers/torch/agent_action.py (19 lines changed)
  6. ml-agents/mlagents/trainers/torch/networks.py (20 lines changed)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (71 lines changed)


         next_critic_obs: List[List[np.ndarray]],
         done: bool,
     ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
         next_obs = ObsUtil.from_buffer_next(batch, n_obs)
+        team_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        next_team_obs = TeamObsUtil.from_buffer_next(batch, n_obs)
         memory = torch.zeros([1, 1, self.policy.m_size])
+        team_obs = [
+            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+            for _teammate_obs in team_obs
+        ]
+        next_team_obs = [
+            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+            for _teammate_obs in next_team_obs
+        ]
-        next_obs = [obs.unsqueeze(0) for obs in next_obs]
+        next_actions = AgentAction.from_dict_next(batch)
+        team_actions = AgentAction.from_team_dict(batch)
+        next_team_actions = AgentAction.from_team_dict_next(batch)
-        critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
-        critic_obs = [
-            [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
-            for _teammate_obs in critic_obs
-        ]
-        next_critic_obs = [
-            ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
-        ]
+        # next_obs = [obs.unsqueeze(0) for obs in next_obs]
+        # critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        # critic_obs = [
+        #     [ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
+        #     for _teammate_obs in critic_obs
+        # ]
+        # next_critic_obs = [
+        #     ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_critic_obs
+        # ]
-        next_critic_obs = [
-            [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
-        ]
+        # next_critic_obs = [
+        #     [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_critic_obs
+        # ]
-        value_estimates, marg_val_estimates, next_memory = self.policy.actor_critic.critic_pass(
+        value_estimates, marg_val_estimates, mem = self.policy.actor_critic.critic_pass(
-            critic_obs=critic_obs,
+            team_obs=team_obs,
+            team_act=team_actions,
         )
+        next_value_estimates, next_marg_val_estimates, next_mem = self.policy.actor_critic.critic_pass(
+            next_obs,
+            next_actions,
+            memory,
+            sequence_length=batch.num_experiences,
+            team_obs=next_team_obs,
+            team_act=next_team_actions,
+        )
-        # # Actions is a hack here, we need the next actions

-        # These aren't used in COMAttention
-        next_value_estimate, next_marg_val_estimate = {}, {}
-            next_value_estimate[name] = 0.0
+        for name, estimate in next_value_estimates.items():
+            next_value_estimates[name] = ModelUtils.to_numpy(estimate)
-            next_marg_val_estimate[name] = 0.0
+        for name, estimate in next_marg_val_estimates.items():
+            next_marg_val_estimates[name] = ModelUtils.to_numpy(estimate)
-            for k in next_value_estimate:
+            for k in next_value_estimates:
-                next_value_estimate[k] = 0.0
+                next_value_estimates[k] = 0.0
-            next_value_estimate,
-            next_marg_val_estimate,
+            next_value_estimates,
+            next_marg_val_estimates,
         )
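The hunk above appears to switch from zero-filled next-step estimates to an actual critic pass over the next observations and next team actions, with the results converted to numpy and zeroed when the trajectory terminates. A minimal, self-contained sketch of that post-processing pattern follows; the function and variable names are illustrative, not the trainer's API.

    from typing import Dict

    import numpy as np
    import torch


    def postprocess_next_estimates(
        next_value_estimates: Dict[str, torch.Tensor],
        next_marg_value_estimates: Dict[str, torch.Tensor],
        done: bool,
    ):
        # Detach each per-reward-signal estimate and convert it to numpy.
        next_values = {k: v.detach().numpy() for k, v in next_value_estimates.items()}
        next_marg_values = {
            k: v.detach().numpy() for k, v in next_marg_value_estimates.items()
        }
        # A terminal step has no successor state, so the bootstrap values are zero.
        if done:
            next_values = {k: np.zeros_like(v) for k, v in next_values.items()}
            next_marg_values = {k: np.zeros_like(v) for k, v in next_marg_values.items()}
        return next_values, next_marg_values


    # Toy usage with a single reward signal.
    vals = {"extrinsic": torch.tensor([0.5])}
    margs = {"extrinsic": torch.tensor([0.3])}
    print(postprocess_next_estimates(vals, margs, done=True))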

ml-agents/mlagents/trainers/policy/torch_policy.py (5 lines changed)


         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         seq_len: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[AgentAction]] = None,

-            obs, actions, masks, memories, seq_len, critic_obs
+            obs, actions, masks, memories, seq_len, team_obs, team_act
         )
         return log_probs, entropies, value_heads, marg_vals

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 lines changed)


         value_losses = []
         for name, head in values.items():
             old_val_tensor = old_values[name]
-            returns_tensor = returns[name]
-            clipped_value_estimate = old_val_tensor + torch.clamp(
-                head - old_val_tensor, -1 * epsilon, epsilon
-            )
-            v_opt_a = (returns_tensor - head) ** 2
-            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
-            value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
+            returns_tensor = returns[name] + 0.99 * old_val_tensor
+            # clipped_value_estimate = old_val_tensor + torch.clamp(
+            #     head - old_val_tensor, -1 * epsilon, epsilon
+            # )
+            value_loss = (returns_tensor - head) ** 2
+            # v_opt_a = (returns_tensor - head) ** 2
+            # v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
+            # value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
             value_losses.append(value_loss)
         value_loss = torch.mean(torch.stack(value_losses))
         return value_loss
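For reference, the removed lines are PPO's standard clipped (trust-region) value loss, while the added lines regress each value head directly onto the stored returns plus 0.99 times the stored estimate, which now holds the next-step value, so 0.99 looks like a hard-coded discount. A self-contained sketch of both variants for a single head; the function names and the defaults are mine, not the trainer's.

    import torch


    def clipped_value_loss(head, old_val, returns, loss_masks, epsilon=0.2):
        # PPO-style trust-region value loss: penalize whichever of the raw or
        # clipped estimate is farther from the return, averaged over valid steps.
        clipped = old_val + torch.clamp(head - old_val, -epsilon, epsilon)
        v_opt_a = (returns - head) ** 2
        v_opt_b = (returns - clipped) ** 2
        return (torch.max(v_opt_a, v_opt_b) * loss_masks).sum() / loss_masks.sum()


    def bootstrapped_mse_loss(head, next_val, returns, gamma=0.99):
        # The variant introduced in the hunk above: regress the head toward
        # returns + gamma * next-step value, with a plain (unmasked) mean.
        target = returns + gamma * next_val
        return ((target - head) ** 2).mean()


    # Dummy batch of 5 value predictions.
    head = torch.rand(5)
    old_val = torch.rand(5)
    returns = torch.rand(5)
    masks = torch.ones(5)
    print(clipped_value_loss(head, old_val, returns, masks))
    print(bootstrapped_mse_loss(head, old_val, returns))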

         old_marg_values = {}
         for name in self.reward_signals:
             old_values[name] = ModelUtils.list_to_tensor(
-                batch[f"{name}_value_estimates"]
+                batch[f"{name}_value_estimates_next"]
-                batch[f"{name}_marginalized_value_estimates"]
+                batch[f"{name}_marginalized_value_estimates_next"]
             )
             returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])

         # Convert to tensors
         current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
-        critic_obs = TeamObsUtil.from_buffer(batch, n_obs)
-        critic_obs = [
+        team_obs = TeamObsUtil.from_buffer(batch, n_obs)
+        team_obs = [
-            for _teammate_obs in critic_obs
+            for _teammate_obs in team_obs
-        next_team_actions = AgentAction.from_team_dict_next(batch)
+        # next_team_actions = AgentAction.from_team_dict_next(batch)
         memories = [
             ModelUtils.list_to_tensor(batch["memory"][i])

             masks=act_masks,
             actions=actions,
             memories=memories,
-            critic_obs=critic_obs,
+            team_obs=team_obs,
+            team_act=team_actions,
             seq_len=self.policy.sequence_length,
         )
         old_log_probs = ActionLogProbs.from_dict(batch).flatten()
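As in the optimizer hunk above, the per-teammate observations pulled from the buffer are nested lists of numpy arrays that get converted to tensors one sensor at a time before being handed to evaluate_actions as team_obs. A small self-contained sketch of that conversion; to_tensor stands in for ModelUtils.list_to_tensor and the shapes are arbitrary.

    import numpy as np
    import torch


    def to_tensor(arr) -> torch.Tensor:
        # Stand-in for ModelUtils.list_to_tensor: wrap an array as a float tensor.
        return torch.as_tensor(np.asarray(arr), dtype=torch.float32)


    # Buffer-style data: 2 teammates, each with one sensor of shape (batch=3, obs=4).
    team_obs_np = [[np.zeros((3, 4))] for _ in range(2)]

    # The same nested comprehension pattern used in the hunk above:
    # outer list over teammates, inner list over that teammate's sensors.
    team_obs = [
        [to_tensor(obs) for obs in _teammate_obs] for _teammate_obs in team_obs_np
    ]
    print(len(team_obs), team_obs[0][0].shape)  # 2 torch.Size([3, 4])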

ml-agents/mlagents/trainers/ppo/trainer.py (5 lines changed)


                 f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
                 np.mean(v),
             )
+        for name, v in value_next.items():
+            agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
+            agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
+                marg_value_next[name]
+            )
         # Evaluate all reward functions
         self.collected_rewards["environment"][agent_id] += np.sum(
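These added lines store the next-step value and marginalized value estimates in the trajectory buffer under the same per-reward-signal keys that the PPO optimizer above reads back. A toy sketch of that bookkeeping, with a plain dict of lists standing in for the real AgentBuffer.

    from collections import defaultdict

    import numpy as np

    # A plain dict of lists standing in for the trajectory AgentBuffer.
    agent_buffer_trajectory = defaultdict(list)

    # Per-reward-signal next-step estimates, as a critic pass might produce them.
    value_next = {"extrinsic": np.array([0.4, 0.2])}
    marg_value_next = {"extrinsic": np.array([0.1, 0.0])}

    for name, v in value_next.items():
        agent_buffer_trajectory[f"{name}_value_estimates_next"].extend(v)
        agent_buffer_trajectory[f"{name}_marginalized_value_estimates_next"].extend(
            marg_value_next[name]
        )

    print(dict(agent_buffer_trajectory))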

ml-agents/mlagents/trainers/torch/agent_action.py (19 lines changed)


         return AgentAction(continuous, discrete)

+    @staticmethod
+    def from_dict_next(buff: Dict[str, np.ndarray]) -> "AgentAction":
+        """
+        A static method that accesses the next continuous and discrete action fields in an
+        AgentBuffer and constructs the corresponding AgentAction from the retrieved np arrays.
+        """
+        continuous: torch.Tensor = None
+        discrete: List[torch.Tensor] = None  # type: ignore
+        if "next_continuous_action" in buff:
+            continuous = ModelUtils.list_to_tensor(buff["next_continuous_action"])
+        if "next_discrete_action" in buff:
+            discrete_tensor = ModelUtils.list_to_tensor(
+                buff["next_discrete_action"], dtype=torch.long
+            )
+            discrete = [
+                discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
+            ]
+        return AgentAction(continuous, discrete)

     @staticmethod
     def _from_team_dict(
         buff: Dict[str, np.ndarray], cont_action_key: str, disc_action_key: str
     ):
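The new from_dict_next mirrors the existing from_dict but reads the next-step action keys. A standalone sketch of the same lookup against a plain dict; this re-implementation is for illustration only and does not use the real AgentBuffer or ModelUtils.

    from typing import Dict, List, Optional

    import numpy as np
    import torch


    def next_actions_from_buffer(buff: Dict[str, np.ndarray]):
        # Read the next-step action arrays, if present, and turn them into tensors.
        continuous: Optional[torch.Tensor] = None
        discrete: Optional[List[torch.Tensor]] = None
        if "next_continuous_action" in buff:
            continuous = torch.as_tensor(
                buff["next_continuous_action"], dtype=torch.float32
            )
        if "next_discrete_action" in buff:
            discrete_tensor = torch.as_tensor(
                buff["next_discrete_action"], dtype=torch.long
            )
            # One tensor per discrete action branch.
            discrete = [
                discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
            ]
        return continuous, discrete


    buff = {"next_discrete_action": np.array([[1, 0], [2, 1]])}
    cont, disc = next_actions_from_buffer(buff)
    print(cont, [d.tolist() for d in disc])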

ml-agents/mlagents/trainers/torch/networks.py (20 lines changed)


         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         sequence_length: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[List[torch.Tensor]]] = None,
     ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
         encoding, memories = self.network_body(
             inputs, memories=memories, sequence_length=sequence_length

         masks: Optional[torch.Tensor] = None,
         memories: Optional[torch.Tensor] = None,
         sequence_length: int = 1,
-        critic_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_obs: Optional[List[List[torch.Tensor]]] = None,
+        team_act: Optional[List[List[torch.Tensor]]] = None,
     ) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
         actor_mem, critic_mem = self._get_actor_critic_mem(memories)
         encoding, actor_mem_outs = self.network_body(

-        all_net_inputs = [inputs]
-        if critic_obs is not None and critic_obs:
-            all_net_inputs.extend(critic_obs)
-        critic_obs = []
-        mar_value_outputs, _ = self.critic(
-            all_net_inputs, [], [], memories=critic_mem, sequence_length=sequence_length
-        )
-        value_outputs, critic_mem_outs = self.critic(
-            [inputs],
-            critic_obs,
+        value_outputs, mar_value_outputs, _ = self.critic_pass(
+            inputs,
+            team_obs=team_obs,
+            team_act=team_act,
         )
         return log_probs, entropies, value_outputs, mar_value_outputs
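The mar_value_outputs head is what connects this branch to COMA: judging by the names, the critic produces, alongside the usual value estimate, a baseline that marginalizes over the agent's own action while conditioning on the team. For background only, a heavily simplified counterfactual baseline for one agent with a discrete action space is sketched below; this is the textbook COMA formulation, not what critic_pass computes in these networks.

    import torch


    def counterfactual_baseline(
        q_values: torch.Tensor, policy_probs: torch.Tensor
    ) -> torch.Tensor:
        # COMA-style marginalization: average the per-action Q-values under the
        # agent's own policy, with teammates' actions held fixed inside q_values.
        # q_values:     (batch, num_actions) critic outputs for this agent's actions
        # policy_probs: (batch, num_actions) this agent's action probabilities
        return (policy_probs * q_values).sum(dim=-1)


    q = torch.tensor([[1.0, 2.0, 3.0]])
    pi = torch.tensor([[0.2, 0.3, 0.5]])
    print(counterfactual_baseline(q, pi))  # tensor([2.3000])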
