
Running COMA (not sure if learning)

/develop/action-slice
Ervin Teng, 4 years ago
Current commit 9bc88c41
9 files changed, with 466 insertions and 178 deletions
  1. ml-agents/mlagents/trainers/buffer.py (5 lines changed)
  2. ml-agents/mlagents/trainers/coma/optimizer_torch.py (202 lines changed)
  3. ml-agents/mlagents/trainers/coma/trainer.py (103 lines changed)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (26 lines changed)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (46 lines changed)
  6. ml-agents/mlagents/trainers/settings.py (7 lines changed)
  7. ml-agents/mlagents/trainers/torch/agent_action.py (82 lines changed)
  8. ml-agents/mlagents/trainers/torch/networks.py (162 lines changed)
  9. ml-agents/mlagents/trainers/trainer/trainer_factory.py (11 lines changed)

ml-agents/mlagents/trainers/buffer.py (5 lines changed)


VALUE_ESTIMATES = "value_estimates"
RETURNS = "returns"
ADVANTAGE = "advantage"
BASELINES = "baselines"
AgentBufferKey = Union[

@staticmethod
def advantage_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.ADVANTAGE, name
@staticmethod
def baseline_estimates_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.BASELINES, name
class AgentBufferField(list):
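For orientation, the two new helpers mirror the existing value_estimates_key and returns_key accessors: each returns a (RewardSignalKeyPrefix, name) tuple that acts as an AgentBufferKey for a per-reward-signal field. A minimal usage sketch, assuming RewardSignalUtil and RewardSignalKeyPrefix are exported from mlagents.trainers.buffer as in mainline ML-Agents; the "extrinsic" name and the values are illustrative only:

from mlagents.trainers.buffer import AgentBuffer, RewardSignalKeyPrefix, RewardSignalUtil

# The helpers just build tuple keys scoped to a reward signal name.
key = RewardSignalUtil.baseline_estimates_key("extrinsic")
assert key == (RewardSignalKeyPrefix.BASELINES, "extrinsic")

# Per-signal baselines and advantages are then stored and read like any other
# buffer field, which is how the COMA trainer further down uses them.
buffer = AgentBuffer()
buffer[key].extend([1.0, 1.1, 0.9])
buffer[RewardSignalUtil.advantage_key("extrinsic")].extend([0.1, -0.2, 0.3])
baselines = buffer[key].get_batch()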

ml-agents/mlagents/trainers/coma/optimizer_torch.py (202 lines changed)


from typing import Dict, cast
from typing import Dict, cast, List, Tuple, Optional
import numpy as np
from mlagents_envs.base_env import ObservationSpec, ActionSpec
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings

from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil
from mlagents.trainers.settings import NetworkSettings
class TorchCOMAOptimizer(TorchOptimizer):

else:
encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream)
self.value_heads = ValueHeads(stream_names, encoding_size, 1)
@property
def memory_size(self) -> int:
return self.network_body.memory_size
def update_normalization(self, buffer: AgentBuffer) -> None:
self.network_body.update_normalization(buffer)
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body(
obs_only=self_obs,
obs=obs,
actions=actions,
memories=memories,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.forward(
encoding, memories, sequence_length
)
return value_outputs, critic_mem_out
def critic_pass(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body(
obs_only=obs,
obs=None,
actions=None,
memories=memories,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.forward(
encoding, memories, sequence_length
)
return value_outputs, critic_mem_out
def forward(
self,
encoding: torch.Tensor,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
output = self.value_heads(encoding)
return output, memories
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
"""

reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
self._critic = COMAValueNetwork(
self._critic = TorchCOMAOptimizer.COMAValueNetwork(
policy.behavior_spec.action_spec,
action_spec=policy.behavior_spec.action_spec,
params = list(self.policy.actor.parameters()) + list(
self.value_net.parameters()
)
params = list(self.policy.actor.parameters()) + list(self.critic.parameters())
self.hyperparameters: PPOSettings = cast(
PPOSettings, trainer_settings.hyperparameters
)

value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def policy_policy_loss(
def ppo_policy_loss(
self,
advantages: torch.Tensor,
log_probs: torch.Tensor,

decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns = {}
old_values = {}
old_baseline_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.value_estimates_key(name)]

)
old_baseline_values[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.baseline_estimates_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)

group_obs = GroupObsUtil.from_buffer(batch, n_obs)
group_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
for _groupmate_obs in group_obs
]
group_actions = AgentAction.group_from_buffer(batch)
memories = [
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])

memories=memories,
seq_len=self.policy.sequence_length,
)
all_obs = [current_obs] + group_obs
current_obs, memories=memories, sequence_length=self.policy.sequence_length
all_obs, memories=memories, sequence_length=self.policy.sequence_length
)
baselines, _ = self.critic.baseline(
[current_obs],
group_obs,
group_actions,
memories=memories,
sequence_length=self.policy.sequence_length,
value_loss = self.ppo_value_loss(
baseline_loss = self.coma_value_loss(
baselines, old_baseline_values, returns, decay_eps, loss_masks
)
value_loss = self.coma_value_loss(
values, old_values, returns, decay_eps, loss_masks
)
policy_loss = self.ppo_policy_loss(

)
loss = (
policy_loss
+ 0.5 * value_loss
+ 0.5 * (value_loss + baseline_loss)
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)

for reward_provider in self.reward_signals.values():
modules.update(reward_provider.get_modules())
return modules
def get_trajectory_value_estimates(
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_group_obs: List[List[np.ndarray]],
done: bool,
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, np.ndarray]]:
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
team_obs = GroupObsUtil.from_buffer(batch, n_obs)
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
team_actions = AgentAction.group_from_buffer(batch)
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
next_group_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_group_obs
]
# Expand dimensions of next critic obs
next_group_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_group_obs
]
memory = torch.zeros([1, 1, self.policy.m_size])
all_obs = [current_obs] + team_obs if team_obs is not None else [current_obs]
value_estimates, mem = self.critic.critic_pass(
all_obs, memory, sequence_length=batch.num_experiences
)
baseline_estimates, mem = self.critic.baseline(
[current_obs],
team_obs,
team_actions,
memory,
sequence_length=batch.num_experiences,
)
all_next_obs = (
[next_obs] + next_group_obs if next_group_obs is not None else [next_obs]
)
next_value_estimates, mem = self.critic.critic_pass(
all_next_obs, mem, sequence_length=batch.num_experiences
)
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in value_estimates.items():
value_estimates[name] = ModelUtils.to_numpy(estimate)
# the baseline and V should not be on the same done flag
for name, estimate in next_value_estimates.items():
next_value_estimates[name] = ModelUtils.to_numpy(estimate)
if done:
for k in next_value_estimates:
if not self.reward_signals[k].ignore_done:
next_value_estimates[k][-1] = 0.0
return (value_estimates, baseline_estimates, next_value_estimates)
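The update keeps PPO's clipped surrogate and entropy terms; the headline change is that the single 0.5 * value_loss term becomes 0.5 * (value_loss + baseline_loss), so the centralized value estimate and the per-agent baseline estimate are both regressed toward the same lambda-return targets. A minimal sketch of that composition with dummy tensors (all numbers are illustrative, and ModelUtils.masked_mean is inlined rather than imported):

import torch

policy_loss = torch.tensor(0.12)      # clipped PPO surrogate
value_loss = torch.tensor(0.40)       # coma_value_loss(values, old_values, returns, ...)
baseline_loss = torch.tensor(0.35)    # coma_value_loss(baselines, old_baseline_values, returns, ...)
entropy = torch.tensor([1.2, 1.1, 1.3])
loss_masks = torch.tensor([1.0, 1.0, 0.0])  # 1 = real step, 0 = padding
decay_bet = 5.0e-3                    # decayed entropy coefficient

masked_entropy = (entropy * loss_masks).sum() / loss_masks.sum()
loss = policy_loss + 0.5 * (value_loss + baseline_loss) - decay_bet * masked_entropy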

ml-agents/mlagents/trainers/coma/trainer.py (103 lines changed)


self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
(
value_estimates,
baseline_estimates,
value_next,
) = self.optimizer.get_trajectory_value_estimates(
trajectory.done_reached and not trajectory.interrupted,
trajectory.next_group_obs,
trajectory.teammate_dones_reached
and trajectory.done_reached
and not trajectory.interrupted,
)
for name, v in value_estimates.items():

agent_buffer_trajectory[
RewardSignalUtil.baseline_estimates_key(name)
].extend(baseline_estimates[name])
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),
)
np.mean(v),
np.mean(value_estimates[name]),
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
)

tmp_advantages = []
tmp_returns = []
for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_value_estimates = agent_buffer_trajectory[
baseline_estimates = agent_buffer_trajectory[
RewardSignalUtil.baseline_estimates_key(name)
].get_batch()
v_estimates = agent_buffer_trajectory[
local_advantage = get_gae(
returns_v, returns_b = get_team_returns(
value_estimates=local_value_estimates,
value_next=bootstrap_value,
baseline_estimates=baseline_estimates,
v_estimates=v_estimates,
value_next=value_next[name],
local_return = local_advantage + local_value_estimates
# This is later used as the target for the different value estimates
agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
local_return
test_v, _ = get_team_returns(
rewards=local_rewards,
baseline_estimates=baseline_estimates,
v_estimates=v_estimates,
value_next=value_next[name],
gamma=self.optimizer.reward_signals[name].gamma,
lambd=1,
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Sum Rewards",
np.mean(test_v),
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Lam",
np.mean(returns_v),
)
local_advantage = np.array(returns_v) - np.array(baseline_estimates)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} TD Advantage Estimate",
np.mean(local_advantage),
local_return = local_advantage + baseline_estimates
# local_return = local_advantage + q_estimates
# This is later used as the target for the different value estimates
# agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(returns_v)
agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
local_advantage
)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

return policy
def create_coma_optimizer(self) -> TorchCOMAOptimizer:
return TorchCOMAptimizer( # type: ignore
return TorchCOMAOptimizer( # type: ignore
cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore

"""
self.policy = policy
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = self.create_ppo_optimizer()
self.optimizer = self.create_coma_optimizer()
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

+ (1 - lambd) * gamma * value_estimates[t + 1]
)
return returns
def get_team_returns(
rewards,
baseline_estimates,
v_estimates,
value_next=0.0,
died=False,
gamma=0.99,
lambd=0.8,
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_b = lambda_return(
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
returns_v = lambda_return(
rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
return returns_v, returns_b
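To make the return/advantage bookkeeping above concrete, here is a small self-contained numpy walk-through of the lambda-return recursion (its tail is visible further up) and of the local_advantage = returns_v - baseline_estimates step; every reward and estimate value is made up for illustration:

import numpy as np

def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    # Same recursion get_team_returns applies to both the value and baseline streams.
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
v_estimates = np.array([0.4, 0.6, 0.9], dtype=np.float32)         # V(s_t) from critic_pass
baseline_estimates = np.array([0.5, 0.7, 0.8], dtype=np.float32)  # b_t from critic.baseline

returns_v = lambda_return(rewards, v_estimates, value_next=0.0)
returns_b = lambda_return(rewards, baseline_estimates, value_next=0.0)

# As in the trajectory processing above: the advantage is taken against the baseline,
# while returns_v is stored as the regression target for both critic passes.
local_advantage = returns_v - baseline_estimates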

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (26 lines changed)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.networks import ValueNetwork
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils

# Create the graph here to give more granular control of the TF graph to the Optimizer.
super().__init__(policy, trainer_settings)
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
if policy.shared_critic:
self.value_net = policy.actor
else:
self.value_net = ValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
)
params = list(self.policy.actor.parameters()) + list(
self.value_net.parameters()
)
params = list(self.policy.actor_critic.parameters())
self.hyperparameters: PPOSettings = cast(
PPOSettings, trainer_settings.hyperparameters
)

}
self.stream_names = list(self.reward_signals.keys())
@property
def critic(self):
return self.value_net
def ppo_value_loss(
self,

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy = self.policy.evaluate_actions(
log_probs, entropy, values = self.policy.evaluate_actions(
)
values, _ = self.critic.critic_pass(
current_obs, memories=memories, sequence_length=self.policy.sequence_length
)
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
log_probs = log_probs.flatten()

ml-agents/mlagents/trainers/ppo/trainer.py (46 lines changed)


return self.policy
def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
def discount_rewards(r, gamma=0.99, value_next=0.0):
Computes lambda return.
Computes discounted sum of future rewards for use in updating value estimate.
:param value_estimates: List of value estimates.
:param lambd: n_step return weighting factor.
:return: lambda return as a list
:return: discounted sum of future rewards as list.
discounted_r = np.zeros_like(r)
running_add = value_next
for t in reversed(range(0, r.size)):
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):

)
return returns
def get_team_returns(
rewards,
baseline_estimates,
v_estimates,
value_next=0.0,
died=False,
gamma=0.99,
lambd=0.8,
):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighing factor.
:return: list of advantage estimates for time-steps t to T.
"""
rewards = np.array(rewards)
returns_b = lambd_return(
rewards, baseline_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
returns_v = lambd_return(
rewards, v_estimates, gamma=gamma, lambd=lambd, value_next=value_next
)
return returns_v, returns_b
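One property worth noting about the pair of functions above: with lambd=1 the value estimates drop out of the recursion, and the lambda return reduces to the plain discounted, bootstrapped reward sum that discount_rewards computes (this is also what the lambd=1 "Sum Rewards" diagnostic in the COMA trainer relies on). A quick self-contained check, restating both functions with made-up numbers:

import numpy as np

def discount_rewards(r, gamma=0.99, value_next=0.0):
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

def lambd_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
    returns = np.zeros_like(r)
    returns[-1] = r[-1] + gamma * value_next
    for t in reversed(range(0, r.size - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + r[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns

r = np.array([0.0, 1.0, 0.5], dtype=np.float32)
values = np.array([0.2, 0.3, 0.4], dtype=np.float32)

# With lambd=1 the value estimates do not influence the result.
assert np.allclose(
    lambd_return(r, values, gamma=0.9, lambd=1.0, value_next=2.0),
    discount_rewards(r, gamma=0.9, value_next=2.0),
)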

ml-agents/mlagents/trainers/settings.py (7 lines changed)


class TrainerType(Enum):
PPO: str = "ppo"
SAC: str = "sac"
COMA: str = "coma"
_mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings}
_mapping = {
TrainerType.PPO: PPOSettings,
TrainerType.SAC: SACSettings,
TrainerType.COMA: PPOSettings,
}
return _mapping[self]
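Mapping TrainerType.COMA to PPOSettings means a coma section in the trainer configuration is parsed with exactly the PPO hyperparameter schema; this commit adds no COMA-specific settings class. A quick sketch, assuming the accessor whose body is shown above is TrainerType.to_settings() as in mainline ML-Agents:

from mlagents.trainers.settings import PPOSettings, TrainerType

# "coma" round-trips through the enum and resolves to the PPO hyperparameter dataclass.
assert TrainerType("coma") is TrainerType.COMA
assert TrainerType.COMA.to_settings() is PPOSettings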

ml-agents/mlagents/trainers/torch/agent_action.py (82 lines changed)


from typing import List, Optional, NamedTuple
import itertools
import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.buffer import AgentBuffer, BufferKey, AgentBufferField
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple

discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return AgentAction(continuous, discrete)
@staticmethod
def _padded_time_to_batch(
agent_buffer_field: AgentBufferField, dtype: torch.dtype = torch.float32
) -> List[torch.Tensor]:
"""
Pad actions and convert to tensor. Note that data is padded by 0's, not NaNs
as the observations are.
"""
action_shape = None
for _action in agent_buffer_field:
if _action:
action_shape = _action[0].shape
break
# If there were no actions at all in this field
if action_shape is None:
return []
new_list = list(
map(
lambda x: ModelUtils.list_to_tensor(x, dtype=dtype),
itertools.zip_longest(
*agent_buffer_field, fillvalue=np.full(action_shape, 0)
),
)
)
return new_list
@staticmethod
def _group_from_buffer(
buff: AgentBuffer, cont_action_key: BufferKey, disc_action_key: BufferKey
) -> List["AgentAction"]:
continuous_tensors: List[torch.Tensor] = []
discrete_tensors: List[torch.Tensor] = [] # type: ignore
if cont_action_key in buff:
continuous_tensors = AgentAction._padded_time_to_batch(
buff[cont_action_key]
)
if disc_action_key in buff:
discrete_tensors = AgentAction._padded_time_to_batch(
buff[disc_action_key], dtype=torch.long
)
actions_list = []
for _cont, _disc in itertools.zip_longest(
continuous_tensors, discrete_tensors, fillvalue=None
):
if _disc is not None:
_disc = [_disc[..., i] for i in range(_disc.shape[-1])]
actions_list.append(AgentAction(_cont, _disc))
return actions_list
@staticmethod
def group_from_buffer(buff: AgentBuffer) -> List["AgentAction"]:
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
return AgentAction._group_from_buffer(
buff, BufferKey.GROUP_CONTINUOUS_ACTION, BufferKey.GROUP_DISCRETE_ACTION
)
@staticmethod
def group_from_buffer_next(buff: AgentBuffer) -> List["AgentAction"]:
"""
A static method that accesses next continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.
"""
return AgentAction._group_from_buffer(
buff, BufferKey.GROUP_NEXT_CONT_ACTION, BufferKey.GROUP_NEXT_DISC_ACTION
)
def to_flat(self, discrete_branches: List[int]) -> torch.Tensor:
discrete_oh = ModelUtils.actions_to_onehot(
self.discrete_tensor, discrete_branches
)
discrete_oh = torch.cat(discrete_oh, dim=1)
return torch.cat([self.continuous_tensor, discrete_oh], dim=-1)
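The padding convention here differs from the observation path: absent groupmates contribute zero-valued actions rather than NaNs, and itertools.zip_longest is what transposes the per-timestep lists of groupmate actions into one zero-padded sequence per groupmate. A self-contained sketch of that transposition with toy data (shapes and values are illustrative):

import itertools
import numpy as np
import torch

# Per-timestep groupmate actions: timestep 0 has two groupmates,
# timestep 1 has only one (the second agent is gone).
field = [
    [np.array([0.1, 0.2]), np.array([0.3, 0.4])],
    [np.array([0.5, 0.6])],
]
action_shape = field[0][0].shape

# Same zip_longest trick as _padded_time_to_batch: one padded sequence per groupmate.
padded = [
    torch.as_tensor(np.stack(seq), dtype=torch.float32)
    for seq in itertools.zip_longest(*field, fillvalue=np.zeros(action_shape))
]
# padded[1][1] is the zero action standing in for the missing second groupmate.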

ml-agents/mlagents/trainers/torch/networks.py (162 lines changed)


return encoding, memories
class MultiInputNetworkBody(torch.nn.Module, Critic):
class MultiInputNetworkBody(torch.nn.Module):
def __init__(
self,
observation_specs: List[ObservationSpec],

else 0
)
self.processors, _input_size = ModelUtils.create_input_processors(
sensor_specs,
observation_specs,
self.h_size,
network_settings.vis_encode_type,
normalize=self.normalize,

+ sum(self.action_spec.discrete_branches)
+ self.action_spec.continuous_size
)
self.obs_encoder = EntityEmbedding(
0, obs_only_ent_size, None, self.h_size, concat_self=False
)
self.obs_action_encoder = EntityEmbedding(
0, q_ent_size, None, self.h_size, concat_self=False
)
self.obs_encoder = EntityEmbedding(obs_only_ent_size, None, self.h_size)
self.obs_action_encoder = EntityEmbedding(q_ent_size, None, self.h_size)
self.self_attn = ResidualSelfAttention(self.h_size)

if self.use_lstm:
self.lstm = LSTM(self.h_size, self.m_size)
else:
self.lstm = None # type: ignorek
self.lstm = None # type: ignore
@property
def memory_size(self) -> int:
return self.lstm.memory_size if self.use_lstm else 0
def update_normalization(self, buffer: AgentBuffer) -> None:
obs = ObsUtil.from_buffer(buffer, len(self.processors))
for vec_input, enc in zip(obs, self.processors):
if isinstance(enc, VectorInput):
enc.update_normalization(torch.as_tensor(vec_input))
def copy_normalization(self, other_network: "MultiInputNetworkBody") -> None:
if self.normalize:
for n1, n2 in zip(self.processors, other_network.processors):
if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
n1.copy_normalization(n2)
def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor:
"""
Get attention masks by grabbing an arbitrary obs across all the agents
Since these are raw obs, the padded values are still NaN
"""
only_first_obs = [_all_obs[0] for _all_obs in obs_tensors]
obs_for_mask = torch.stack(only_first_obs, dim=1)
# Get the mask from nans
attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
return attn_mask
def forward(
self,
obs_only: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(
self,
obs_only: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: Optional[List[AgentAction]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
self_attn_inputs = []
concat_f_inp = []
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
self_attn_masks = []
self_attn_inputs = []
concat_f_inp = []
if actions is not None:
for inputs, action in zip(obs, actions):
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
cat_encodes = [
torch.cat(encodes, dim=-1),
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
if concat_f_inp:
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs))
self_attn_inputs.append(self.obs_action_encoder(None, f_inp))
concat_encoded_obs = []
for inputs in obs_only:
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
self_attn_masks.append(self._get_masks_from_nans())
self_attn_inputs.append(self.obs_encoder(None, g_inp))
concat_encoded_obs = []
for inputs in obs_only:
encodes = []
for idx, processor in enumerate(self.processors):
obs_input = inputs[idx]
obs_input[obs_input.isnan()] = 0.0 # Remove NaNs
processed_obs = processor(obs_input)
encodes.append(processed_obs)
concat_encoded_obs.append(torch.cat(encodes, dim=-1))
g_inp = torch.stack(concat_encoded_obs, dim=1)
self_attn_masks.append(self._get_masks_from_nans(obs_only))
self_attn_inputs.append(self.obs_encoder(None, g_inp))
encoded_entity = torch.cat(self_attn_inputs, dim=1)
encoded_state = self.self_attn(encoded_entity, self_attn_masks)
encoding = self.linear_encoder(encoded_state)
if self.use_lstm:
# Resize to (batch, sequence length, encoding size)
encoding = encoding.reshape([-1, sequence_length, self.h_size])
encoding, memories = self.lstm(encoding, memories)
encoding = encoding.reshape([-1, self.m_size // 2])
return encoding, memories
class Critic(abc.ABC):
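A note on the padding convention used throughout MultiInputNetworkBody.forward above: absent groupmates arrive as NaN-filled observations, _get_masks_from_nans turns those NaNs into attention masks (1.0 marks a padded entry), and only afterwards are the NaNs zeroed so the encoders see finite inputs. A self-contained sketch of that masking step with toy tensors (shapes and values are illustrative):

import torch

# First observation of two agents across a batch of 3; agent 2 is absent
# (NaN-padded) in the last batch entry.
agent_1_obs = torch.tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
agent_2_obs = torch.tensor([[1.0, 1.1], [1.2, 1.3], [float("nan"), float("nan")]])

obs_for_mask = torch.stack([agent_1_obs, agent_2_obs], dim=1)  # (batch, agents, obs)
attn_mask = torch.any(obs_for_mask.isnan(), dim=2).type(torch.FloatTensor)
# attn_mask == [[0., 0.], [0., 0.], [0., 1.]]

# Before encoding, forward() replaces the NaNs with zeros in place.
agent_2_obs[agent_2_obs.isnan()] = 0.0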

ml-agents/mlagents/trainers/trainer/trainer_factory.py (11 lines changed)


import os
from typing import Dict
from mlagents.trainers.coma.trainer import COMATrainer
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager

if trainer_type == TrainerType.PPO:
trainer = PPOTrainer(
brain_name,
min_lesson_length,
trainer_settings,
train_model,
load_model,
seed,
trainer_artifact_path,
)
elif trainer_type == TrainerType.COMA:
trainer = COMATrainer(
brain_name,
min_lesson_length,
trainer_settings,
