
Add clipping, use same network for value

/develop/coma2/singlenetwork
Ervin Teng, 4 years ago
commit bca6c92c
2 files changed, 18 insertions and 29 deletions
  1. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (15 changes)
  2. ml-agents/mlagents/trainers/torch/networks.py (32 changes)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (15 changes)


from typing import Dict, cast
import itertools
import numpy as np
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import AgentBuffer

from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, TeamObsUtil
from mlagents.trainers.torch.networks import CentralizedValueNetwork
class TorchPPOOptimizer(TorchOptimizer):

for name, head in values.items():
    old_val_tensor = old_values[name]
    returns_tensor = returns[name]
-   # clipped_value_estimate = old_val_tensor + torch.clamp(
-   #     head - old_val_tensor, -1 * epsilon, epsilon
-   # )
+   clipped_value_estimate = old_val_tensor + torch.clamp(
+       head - old_val_tensor, -1 * epsilon, epsilon
+   )
-   # v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
-   # value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
-   value_loss = ModelUtils.masked_mean(v_opt_a, loss_masks)
+   v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
+   value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
    value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
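
For reference, the clipped objective this hunk re-enables can be written as a standalone function. This is a minimal sketch assuming a single value stream and no loss masking; clipped_value_loss and its argument names are illustrative, not part of the ml-agents API.

import torch

def clipped_value_loss(
    values: torch.Tensor,      # current estimates from the value head
    old_values: torch.Tensor,  # estimates recorded when the rollout was collected
    returns: torch.Tensor,     # empirical returns used as regression targets
    epsilon: float,            # clip range, analogous to the PPO policy clip
) -> torch.Tensor:
    # Keep the new estimate within +/- epsilon of the old one.
    clipped = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
    v_opt_a = (returns - values) ** 2   # unclipped squared error
    v_opt_b = (returns - clipped) ** 2  # squared error of the clipped estimate
    # Pessimistic maximum, as in the v_opt_a/v_opt_b lines above: clipping
    # can only increase the loss, discouraging large value updates per step.
    return torch.max(v_opt_a, v_opt_b).mean()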

ml-agents/mlagents/trainers/torch/networks.py (32 changes)


    )
    return encoding, memories

def forward(
    self,
    f_enc: torch.Tensor,
    memories: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
-   encoding, memories = self.network_body.value(
-       obs, memories, sequence_length
-   )
+   encoding, memories = self.network_body.value(obs, memories, sequence_length)
    output = self.value_heads(encoding)
    return output, memories
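
The forward pass above follows the usual two-stage pattern: a network body encodes the observations, then per-reward-stream value heads map that encoding to one scalar each. A self-contained sketch of the pattern, with illustrative names rather than the actual ml-agents classes:

from typing import Dict, List
import torch
from torch import nn

class TinyValueNet(nn.Module):
    def __init__(self, obs_size: int, hidden: int, stream_names: List[str]):
        super().__init__()
        # Stand-in for the network body that produces the encoding.
        self.body = nn.Sequential(nn.Linear(obs_size, hidden), nn.ReLU())
        # One scalar head per reward stream, mirroring value_heads above.
        self.heads = nn.ModuleDict(
            {name: nn.Linear(hidden, 1) for name in stream_names}
        )

    def forward(self, obs: torch.Tensor) -> Dict[str, torch.Tensor]:
        encoding = self.body(obs)
        return {name: head(encoding) for name, head in self.heads.items()}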

self.critic = CentralizedValueNetwork(
    stream_names, sensor_specs, network_settings, action_spec=action_spec
)
self.target = CentralizedValueNetwork(
    stream_names, sensor_specs, network_settings, action_spec=action_spec
)

@property
def memory_size(self) -> int:

if team_obs is not None and team_obs:
    all_obs.extend(team_obs)
-   value_outputs, _ = self.target.value(
-       all_obs,
-       memories=critic_mem,
-       sequence_length=sequence_length,
+   value_outputs, critic_mem_out = self.critic.value(
+       all_obs, memories=critic_mem, sequence_length=sequence_length
    )
# if mar_value_outputs is None:

if team_obs is not None and team_obs:
    all_obs.extend(team_obs)
-   value_outputs, _ = self.critic.value(
-       all_obs,
-       memories=critic_mem,
-       sequence_length=sequence_length,
+   value_outputs, critic_mem_out = self.critic.value(
+       all_obs, memories=critic_mem, sequence_length=sequence_length
    )
# if mar_value_outputs is None:

if team_act is not None and team_act:
    all_acts.extend(team_act)
-   baseline_outputs, _ = self.target.baseline(
+   baseline_outputs, _ = self.critic.baseline(
        inputs,
        team_obs,
        team_act,

-   value_outputs, critic_mem_out = self.target.q_net(
+   value_outputs, critic_mem_out = self.critic.q_net(
        all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
    )

    team_obs=team_obs,
    team_act=team_act,
)
-   value_outputs, _ = self.target_critic_value(inputs, memories=critic_mem, sequence_length=sequence_length, team_obs=team_obs)
+   value_outputs, _ = self.target_critic_value(
+       inputs,
+       memories=critic_mem,
+       sequence_length=sequence_length,
+       team_obs=team_obs,
+   )
return log_probs, entropies, q_outputs, baseline_outputs, value_outputs
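
Taken together, these hunks route every estimate in the forward pass through the single critic network, with the structurally identical target network reserved for bootstrapped value targets (the target_critic_value call above). A hypothetical helper showing the resulting call pattern; it reuses the .value and .q_net signatures visible in the hunks, while the surrounding trainer class and argument handling are elided.

from typing import List, Optional

from mlagents.torch_utils import torch

def critic_pass(
    critic,  # the single CentralizedValueNetwork serving value, baseline, and Q
    all_obs: List[torch.Tensor],
    all_acts: torch.Tensor,
    critic_mem: Optional[torch.Tensor] = None,
    sequence_length: int = 1,
):
    # Both estimates come from the same network ("use same network for
    # value"); the target copy is no longer queried on this path.
    value_outputs, critic_mem_out = critic.value(
        all_obs, memories=critic_mem, sequence_length=sequence_length
    )
    q_outputs, _ = critic.q_net(
        all_obs, all_acts, memories=critic_mem, sequence_length=sequence_length
    )
    return value_outputs, q_outputs, critic_mem_out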
