Compare commits

...
This merge request has changes that conflict with the target branch.
/ml-agents/mlagents/trainers/settings.py
/ml-agents/mlagents/trainers/stats.py
/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
/ml-agents/mlagents/trainers/trainer/trainer_factory.py
/ml-agents/mlagents/trainers/tests/mock_brain.py
/ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
/ml-agents/mlagents/trainers/buffer.py
/ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
/ml-agents/mlagents/trainers/torch/utils.py
/ml-agents/mlagents/trainers/torch/networks.py
/ml-agents/mlagents/trainers/trajectory.py

8 commits

Author SHA1 Message Date
Ervin Teng 0207f95e Don't delete when agents don't die 4 years ago
Ervin Teng a9fb37aa Fix reporting of group rewards, CLI print of group 4 years ago
Ervin Teng 58122103 Fix warning message formatting again 4 years ago
Ervin Teng 252c1f36 Fix warning message format 4 years ago
Andrew Cohen 853b44d5 torch coma tests: lstm, cur, gail 4 years ago
Andrew Cohen 21d7ab85 add torch no_grad to coma LSTM value computation 4 years ago
Ervin Teng 8902c058 Merge branch 'main' into develop-coma2-trainer 4 years ago
GitHub d2635e58 Action slice (#5047) 4 years ago
21 files changed, with 1,691 insertions and 94 deletions
  1. 17 ml-agents/mlagents/trainers/settings.py
  2. 11 ml-agents/mlagents/trainers/trainer/trainer_factory.py
  3. 6 ml-agents/mlagents/trainers/stats.py
  4. 2 ml-agents/mlagents/trainers/tests/mock_brain.py
  5. 27 ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
  6. 11 ml-agents/mlagents/trainers/tests/torch/test_agent_action.py
  7. 2 ml-agents/mlagents/trainers/trajectory.py
  8. 97 ml-agents/mlagents/trainers/torch/utils.py
  9. 33 ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
  10. 221 ml-agents/mlagents/trainers/torch/networks.py
  11. 16 ml-agents/mlagents/trainers/torch/agent_action.py
  12. 6 ml-agents/mlagents/trainers/buffer.py
  13. 28 ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
  14. 26 config/ppo/PushBlockCollab.yaml
  15. 11 com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta
  16. 290 ml-agents/mlagents/trainers/tests/torch/test_coma.py
  17. 0 ml-agents/mlagents/trainers/coma/__init__.py
  18. 660 ml-agents/mlagents/trainers/coma/optimizer_torch.py
  19. 310 ml-agents/mlagents/trainers/coma/trainer.py

17 ml-agents/mlagents/trainers/settings.py


def to_settings(self) -> type:
_mapping = {
RewardSignalType.EXTRINSIC: RewardSignalSettings,
RewardSignalType.EXTRINSIC: ExtrinsicSettings,
RewardSignalType.GAIL: GAILSettings,
RewardSignalType.CURIOSITY: CuriositySettings,
RewardSignalType.RND: RNDSettings,

"encoding_size"
]
return d_final
@attr.s(auto_attribs=True)
class ExtrinsicSettings(RewardSignalSettings):
# For use with COMA2. Add groupmate rewards to the final extrinsic reward.
add_groupmate_rewards = False
@attr.s(auto_attribs=True)

class TrainerType(Enum):
PPO: str = "ppo"
SAC: str = "sac"
COMA: str = "coma"
_mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings}
_mapping = {
TrainerType.PPO: PPOSettings,
TrainerType.SAC: SACSettings,
TrainerType.COMA: PPOSettings,
}
return _mapping[self]

network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
factory=lambda: {RewardSignalType.EXTRINSIC: ExtrinsicSettings()}
)
init_path: Optional[str] = None
keep_checkpoints: int = 5
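The settings.py change above splits the extrinsic reward signal into its own ExtrinsicSettings class so COMA can opt into groupmate rewards. A minimal, self-contained sketch of that shape (the gamma/strength fields and their defaults here are assumptions for illustration, not copied from the file):

import attr
from enum import Enum

class RewardSignalType(Enum):
    EXTRINSIC = "extrinsic"

@attr.s(auto_attribs=True)
class RewardSignalSettings:
    gamma: float = 0.99
    strength: float = 1.0

@attr.s(auto_attribs=True)
class ExtrinsicSettings(RewardSignalSettings):
    # COMA2 only: fold groupmate rewards into the extrinsic reward.
    add_groupmate_rewards: bool = False

# The default reward_signals entry now resolves to ExtrinsicSettings.
defaults = {RewardSignalType.EXTRINSIC: ExtrinsicSettings()}
print(defaults[RewardSignalType.EXTRINSIC].add_groupmate_rewards)  # False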

11 ml-agents/mlagents/trainers/trainer/trainer_factory.py


from mlagents.trainers.trainer import Trainer
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.coma.trainer import COMATrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
from mlagents.trainers.settings import TrainerSettings, TrainerType

if trainer_type == TrainerType.PPO:
trainer = PPOTrainer(
brain_name,
min_lesson_length,
trainer_settings,
train_model,
load_model,
seed,
trainer_artifact_path,
)
elif trainer_type == TrainerType.COMA:
trainer = COMATrainer(
brain_name,
min_lesson_length,
trainer_settings,

6 ml-agents/mlagents/trainers/stats.py


log_info.append(f"Rank: {self.rank}")
log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}")
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
if "Environment/Group Cumulative Reward" in values:
group_stats_summary = values["Environment/Group Cumulative Reward"]
log_info.append(f"Mean Group Reward: {group_stats_summary.mean:0.3f}")
else:
log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
log_info.append(is_training)
if self.self_play and "Self-play/ELO" in values:

2 ml-agents/mlagents/trainers/tests/mock_brain.py


behavior_spec: BehaviorSpec,
memory_size: int = 10,
exclude_key_list: List[str] = None,
num_other_agents_in_group: int = 0,
) -> AgentBuffer:
trajectory = make_fake_trajectory(
length,

num_other_agents_in_group=num_other_agents_in_group,
)
buffer = trajectory.to_agentbuffer()
# If a key_list was given, remove those keys

27 ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py


from mlagents.trainers.buffer import BufferKey
import numpy as np
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,
)

],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
settings = RewardSignalSettings()
settings = ExtrinsicSettings()
settings.gamma = 0.2
extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
assert extrinsic_rp.gamma == 0.2

],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:
settings = RewardSignalSettings()
settings = ExtrinsicSettings()
extrinsic_rp = create_reward_provider(
RewardSignalType.EXTRINSIC, behavior_spec, settings
)

)
def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
buffer = create_agent_buffer(behavior_spec, 1000, reward)
settings = RewardSignalSettings()
settings = ExtrinsicSettings()
# Test group rewards. Rewards should be double of the environment rewards, but shouldn't count
# the groupmate rewards.
buffer[BufferKey.GROUP_REWARD] = buffer[BufferKey.ENVIRONMENT_REWARDS]
# 2 agents with identical rewards
buffer[BufferKey.GROUPMATE_REWARDS].set(
[np.ones(1, dtype=np.float32) * reward] * 2
for _ in range(buffer.num_experiences)
)
generated_rewards = extrinsic_rp.evaluate(buffer)
assert (generated_rewards == 2 * reward).all()
# Test groupmate rewards. Total reward should be indiv_reward + 2 * teammate_reward + group_reward
settings.add_groupmate_rewards = True
extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
generated_rewards = extrinsic_rp.evaluate(buffer)
assert (generated_rewards == 4 * reward).all()

11 ml-agents/mlagents/trainers/tests/torch/test_agent_action.py


assert (agent_1_act.discrete_tensor[3:] == 0).all()
def test_slice():
# Both continuous and discrete
aa = AgentAction(
torch.tensor([[1.0], [1.0], [1.0]]),
[torch.tensor([2, 1, 0]), torch.tensor([1, 2, 0])],
)
saa = aa.slice(0, 2)
assert saa.continuous_tensor.shape == (2, 1)
assert saa.discrete_tensor.shape == (2, 2)
def test_to_flat():
# Both continuous and discrete
aa = AgentAction(

2 ml-agents/mlagents/trainers/trajectory.py


return self.steps[-1].done
@property
def teammate_dones_reached(self) -> bool:
def all_group_dones_reached(self) -> bool:
"""
Returns true if all teammates are done at the end of the trajectory.
Combine with done_reached to check if the whole team is done.
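As coma/trainer.py further down combines this property with done_reached and interrupted, a hypothetical one-liner capturing that check:

def whole_group_finished(done_reached: bool, all_group_dones_reached: bool, interrupted: bool) -> bool:
    # Hypothetical helper: mirrors the condition built in _process_trajectory in coma/trainer.py.
    return done_reached and all_group_dones_reached and not interrupted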

97 ml-agents/mlagents/trainers/torch/utils.py


from typing import List, Optional, Tuple
from mlagents.torch_utils import torch, nn
from mlagents.trainers.torch.layers import LinearEncoder, Initialization
import numpy as np
from mlagents.trainers.torch.encoders import (

VectorInput,
)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.torch.attention import EntityEmbedding
from mlagents.trainers.torch.attention import (
EntityEmbedding,
ResidualSelfAttention,
get_zero_entities_mask,
)
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import ObservationSpec, DimensionProperty

alpha=tau,
out=target_param.data,
)
@staticmethod
def create_residual_self_attention(
input_processors: nn.ModuleList, embedding_sizes: List[int], hidden_size: int
) -> Tuple[Optional[ResidualSelfAttention], Optional[LinearEncoder]]:
"""
Creates an RSA if there are variable length observations found in the input processors.
:param input_processors: A ModuleList of input processors as returned by the function
create_input_processors().
:param embedding_sizes: A List of embedding sizes as returned by create_input_processors().
:param hidden_size: The hidden size to use for the RSA.
:returns: A Tuple of the RSA itself and a self encoder. Returns None for both
if no variable length inputs are detected.
"""
rsa, x_self_encoder = None, None
entity_num_max: int = 0
var_processors = [p for p in input_processors if isinstance(p, EntityEmbedding)]
for processor in var_processors:
entity_max: int = processor.entity_num_max_elements
# Only adds entity max if it was known at construction
if entity_max > 0:
entity_num_max += entity_max
if len(var_processors) > 0:
if sum(embedding_sizes):
x_self_encoder = LinearEncoder(
sum(embedding_sizes),
1,
hidden_size,
kernel_init=Initialization.Normal,
kernel_gain=(0.125 / hidden_size) ** 0.5,
)
rsa = ResidualSelfAttention(hidden_size, entity_num_max)
return rsa, x_self_encoder
@staticmethod
def encode_observations(
inputs: List[torch.Tensor],
processors: nn.ModuleList,
rsa: Optional[ResidualSelfAttention],
x_self_encoder: Optional[LinearEncoder],
) -> torch.Tensor:
"""
Helper method to encode observations using a list of processors and an RSA.
:param inputs: List of Tensors corresponding to a set of obs.
:param processors: a ModuleList of the input processors to be applied to these obs.
:param rsa: Optionally, an RSA to use for variable length obs.
:param x_self_encoder: Optionally, an encoder to use for x_self (in this case, the non-variable inputs.).
"""
encodes = []
var_len_processor_inputs: List[Tuple[nn.Module, torch.Tensor]] = []
for idx, processor in enumerate(processors):
if not isinstance(processor, EntityEmbedding):
# The input can be encoded without having to process other inputs
obs_input = inputs[idx]
processed_obs = processor(obs_input)
encodes.append(processed_obs)
else:
var_len_processor_inputs.append((processor, inputs[idx]))
if len(encodes) != 0:
encoded_self = torch.cat(encodes, dim=1)
input_exist = True
else:
input_exist = False
if len(var_len_processor_inputs) > 0 and rsa is not None:
# Some inputs need to be processed with a variable length encoder
masks = get_zero_entities_mask([p_i[1] for p_i in var_len_processor_inputs])
embeddings: List[torch.Tensor] = []
processed_self = (
x_self_encoder(encoded_self)
if input_exist and x_self_encoder is not None
else None
)
for processor, var_len_input in var_len_processor_inputs:
embeddings.append(processor(processed_self, var_len_input))
qkv = torch.cat(embeddings, dim=1)
attention_embedding = rsa(qkv, masks)
if not input_exist:
encoded_self = torch.cat([attention_embedding], dim=1)
input_exist = True
else:
encoded_self = torch.cat([encoded_self, attention_embedding], dim=1)
if not input_exist:
raise UnityTrainerException(
"The trainer was unable to process any of the provided inputs. "
"Make sure the trained agents has at least one sensor attached to them."
)
return encoded_self
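To make the dispatch in encode_observations concrete, here is a heavily simplified, runnable analogue: fixed-size observations go through their own encoders and are concatenated, while variable-length observations are masked and pooled. The Linear layers and the masked mean pooling are stand-ins for the real EntityEmbedding and ResidualSelfAttention modules, not the library code:

import torch
from torch import nn

hidden = 8
vec_encoder = nn.Linear(4, hidden)   # stand-in for a fixed-size obs processor
var_encoder = nn.Linear(6, hidden)   # stand-in for an EntityEmbedding

def encode(vec_obs, var_obs):
    # vec_obs: (batch, 4); var_obs: (batch, num_entities, 6), all-zero rows are padding
    encodes = [vec_encoder(vec_obs)]                      # fixed-size path
    mask = (var_obs.abs().sum(dim=2, keepdim=True) == 0)  # (batch, entities, 1), True = padded
    embedded = var_encoder(var_obs).masked_fill(mask, 0.0)
    valid = (~mask).sum(dim=1).clamp(min=1)
    pooled = embedded.sum(dim=1) / valid                  # masked mean pooling stands in for the RSA
    encodes.append(pooled)
    return torch.cat(encodes, dim=1)                      # concatenated self-encoding

out = encode(torch.randn(2, 4), torch.cat([torch.randn(2, 3, 6), torch.zeros(2, 2, 6)], dim=1))
print(out.shape)  # torch.Size([2, 16])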

33 ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py


from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import ExtrinsicSettings
"""
Evaluates extrinsic reward. For single-agent, this equals the individual reward
given to the agent. For the COMA2 algorithm, we want not only the individual reward
but also the team and the individual rewards of the other agents.
"""
def __init__(self, specs: BehaviorSpec, settings: ExtrinsicSettings) -> None:
super().__init__(specs, settings)
self._add_groupmate_rewards = settings.add_groupmate_rewards
return np.array(mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32)
indiv_rewards = np.array(
mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32
)
total_rewards = indiv_rewards
if (
BufferKey.GROUPMATE_REWARDS in mini_batch
and BufferKey.GROUP_REWARD in mini_batch
):
if self._add_groupmate_rewards:
groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
groupmate_rewards_sum = np.array(
[sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
)
total_rewards += groupmate_rewards_sum
group_rewards = np.array(
mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32
)
# Add all the group rewards to the individual rewards
total_rewards += group_rewards
return total_rewards
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}
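A toy numpy check of the reward composition implemented in evaluate above, with made-up numbers: total = individual + sum(groupmate) + group when add_groupmate_rewards is enabled:

import numpy as np

indiv = np.array([1.0, 1.0], dtype=np.float32)        # ENVIRONMENT_REWARDS
groupmates = [[0.5, 0.5], [0.5, 0.5]]                 # GROUPMATE_REWARDS, two teammates per step
group = np.array([0.25, 0.25], dtype=np.float32)      # GROUP_REWARD

total = indiv.copy()
add_groupmate_rewards = True
if add_groupmate_rewards:
    total += np.array([sum(r) for r in groupmates], dtype=np.float32)
total += group
print(total)  # [2.25 2.25] = indiv + sum(groupmate) + group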

221 ml-agents/mlagents/trainers/torch/networks.py


from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.layers import LSTM, LinearEncoder, Initialization
from mlagents.trainers.torch.layers import LSTM, LinearEncoder
from mlagents.trainers.torch.attention import (
EntityEmbedding,
ResidualSelfAttention,
get_zero_entities_mask,
)
from mlagents.trainers.torch.attention import EntityEmbedding, ResidualSelfAttention
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]

normalize=self.normalize,
)
entity_num_max: int = 0
var_processors = [p for p in self.processors if isinstance(p, EntityEmbedding)]
for processor in var_processors:
entity_max: int = processor.entity_num_max_elements
# Only adds entity max if it was known at construction
if entity_max > 0:
entity_num_max += entity_max
if len(var_processors) > 0:
if sum(self.embedding_sizes):
self.x_self_encoder = LinearEncoder(
sum(self.embedding_sizes),
1,
self.h_size,
kernel_init=Initialization.Normal,
kernel_gain=(0.125 / self.h_size) ** 0.5,
)
self.rsa = ResidualSelfAttention(self.h_size, entity_num_max)
self.rsa, self.x_self_encoder = ModelUtils.create_residual_self_attention(
self.processors, self.embedding_sizes, self.h_size
)
if self.rsa is not None:
total_enc_size = sum(self.embedding_sizes) + self.h_size
else:
total_enc_size = sum(self.embedding_sizes)

memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encodes = []
var_len_processor_inputs: List[Tuple[nn.Module, torch.Tensor]] = []
encoded_self = ModelUtils.encode_observations(
inputs, self.processors, self.rsa, self.x_self_encoder
)
if actions is not None:
encoded_self = torch.cat([encoded_self, actions], dim=1)
encoding = self.linear_encoder(encoded_self)
if self.use_lstm:
# Resize to (batch, sequence length, encoding size)
encoding = encoding.reshape([-1, sequence_length, self.h_size])
encoding, memories = self.lstm(encoding, memories)
encoding = encoding.reshape([-1, self.m_size // 2])
return encoding, memories
class MultiInputNetworkBody(torch.nn.Module):
def __init__(
self,
observation_specs: List[ObservationSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
):
super().__init__()
self.normalize = network_settings.normalize
self.use_lstm = network_settings.memory is not None
# Scale network depending on num agents
self.h_size = network_settings.hidden_units
self.m_size = (
network_settings.memory.memory_size
if network_settings.memory is not None
else 0
)
self.processors, _input_size = ModelUtils.create_input_processors(
observation_specs,
self.h_size,
network_settings.vis_encode_type,
normalize=self.normalize,
)
self.action_spec = action_spec
# This RSA and input are for variable length obs, not for multi-agent.
(
self.input_rsa,
self.input_x_self_encoder,
) = ModelUtils.create_residual_self_attention(
self.processors, _input_size, self.h_size
)
if self.input_rsa is not None:
_input_size.append(self.h_size)
# Modules for multi-agent self-attention
obs_only_ent_size = sum(_input_size)
q_ent_size = (
sum(_input_size)
+ sum(self.action_spec.discrete_branches)
+ self.action_spec.continuous_size
)
self.obs_encoder = EntityEmbedding(obs_only_ent_size, None, self.h_size)
self.obs_action_encoder = EntityEmbedding(q_ent_size, None, self.h_size)
self.self_attn = ResidualSelfAttention(self.h_size)
self.linear_encoder = LinearEncoder(
self.h_size,
network_settings.num_layers,
self.h_size,
kernel_gain=(0.125 / self.h_size) ** 0.5,
)
for idx, processor in enumerate(self.processors):
if not isinstance(processor, EntityEmbedding):
# The input can be encoded without having to process other inputs
obs_input = inputs[idx]
processed_obs = processor(obs_input)
encodes.append(processed_obs)
else:
var_len_processor_inputs.append((processor, inputs[idx]))
if len(encodes) != 0:
encoded_self = torch.cat(encodes, dim=1)
input_exist = True
if self.use_lstm:
self.lstm = LSTM(self.h_size, self.m_size)
input_exist = False
if len(var_len_processor_inputs) > 0:
# Some inputs need to be processed with a variable length encoder
masks = get_zero_entities_mask([p_i[1] for p_i in var_len_processor_inputs])
embeddings: List[torch.Tensor] = []
processed_self = self.x_self_encoder(encoded_self) if input_exist else None
for processor, var_len_input in var_len_processor_inputs:
embeddings.append(processor(processed_self, var_len_input))
qkv = torch.cat(embeddings, dim=1)
attention_embedding = self.rsa(qkv, masks)
if not input_exist:
encoded_self = torch.cat([attention_embedding], dim=1)
input_exist = True
else:
encoded_self = torch.cat([encoded_self, attention_embedding], dim=1)
self.lstm = None # type: ignore
@property
def memory_size(self) -> int:
return self.lstm.memory_size if self.use_lstm else 0
if not input_exist:
raise Exception(
"The trainer was unable to process any of the provided inputs. "
"Make sure the trained agents has at least one sensor attached to them."
)
def update_normalization(self, buffer: AgentBuffer) -> None:
obs = ObsUtil.from_buffer(buffer, len(self.processors))
for vec_input, enc in zip(obs, self.processors):
if isinstance(enc, VectorInput):
enc.update_normalization(torch.as_tensor(vec_input))
def copy_normalization(self, other_network: "MultiInputNetworkBody") -> None:
if self.normalize:
for n1, n2 in zip(self.processors, other_network.processors):
if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
n1.copy_normalization(n2)
if actions is not None:
encoded_self = torch.cat([encoded_self, actions], dim=1)
encoding = self.linear_encoder(encoded_self)
def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor:
"""
Get attention masks by grabbing an arbitrary obs across all the agents
Since these are raw obs, the padded values are still NaN
"""
only_first_obs = [_all_obs[0] for _all_obs in obs_tensors]
# Just get the first element in each obs regardless of its dimension. This will speed up
# searching for NaNs.
only_first_obs_flat = torch.stack(
[_obs.flatten(start_dim=1)[:, 0] for _obs in only_first_obs], dim=1
)
# Get the mask from NaNs
attn_mask = only_first_obs_flat.isnan().type(torch.FloatTensor)
return attn_mask
def _remove_nans_from_obs(
self, all_obs: List[List[torch.Tensor]], attention_mask: torch.Tensor
) -> None:
"""
Helper function to remove NaNs from observations using an attention mask.
"""
for i_agent, single_agent_obs in enumerate(all_obs):
for obs in single_agent_obs:
obs[
attention_mask.type(torch.BoolTensor)[:, i_agent], ::
] = 0.0 # Remove NaNs fast
def forward(
self,
obs_only: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
self_attn_masks = []
self_attn_inputs = []
concat_f_inp = []
if obs:
obs_attn_mask = self._get_masks_from_nans(obs)
self._remove_nans_from_obs(obs, obs_attn_mask)
for inputs, action in zip(obs, actions):
encoded = ModelUtils.encode_observations(
inputs, self.processors, self.input_rsa, self.input_x_self_encoder
)
cat_encodes = [
encoded,
action.to_flat(self.action_spec.discrete_branches),
]
concat_f_inp.append(torch.cat(cat_encodes, dim=1))
f_inp = torch.stack(concat_f_inp, dim=1)
self_attn_masks.append(obs_attn_mask)
self_attn_inputs.append(self.obs_action_encoder(None, f_inp))
concat_encoded_obs = []
if obs_only:
obs_only_attn_mask = self._get_masks_from_nans(obs_only)
self._remove_nans_from_obs(obs_only, obs_only_attn_mask)
for inputs in obs_only:
encoded = ModelUtils.encode_observations(
inputs, self.processors, self.input_rsa, self.input_x_self_encoder
)
concat_encoded_obs.append(encoded)
g_inp = torch.stack(concat_encoded_obs, dim=1)
self_attn_masks.append(obs_only_attn_mask)
self_attn_inputs.append(self.obs_encoder(None, g_inp))
encoded_entity = torch.cat(self_attn_inputs, dim=1)
encoded_state = self.self_attn(encoded_entity, self_attn_masks)
encoding = self.linear_encoder(encoded_state)
if self.use_lstm:
# Resize to (batch, sequence length, encoding size)
encoding = encoding.reshape([-1, sequence_length, self.h_size])
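The NaN-padding trick behind _get_masks_from_nans and _remove_nans_from_obs can be shown in isolation (shapes and values are made up; as in the code, only the first element of each obs is inspected):

import torch

# One obs per groupmate slot; batch row 1 of slot 1 is padding (NaN).
obs_agent_0 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
obs_agent_1 = torch.tensor([[5.0, 6.0], [float("nan"), float("nan")]])
obs_tensors = [[obs_agent_0], [obs_agent_1]]  # List[List[Tensor]]

only_first_obs = [agent_obs[0] for agent_obs in obs_tensors]
only_first_obs_flat = torch.stack(
    [o.flatten(start_dim=1)[:, 0] for o in only_first_obs], dim=1
)
attn_mask = only_first_obs_flat.isnan().float()  # 1.0 where the groupmate slot is padded
print(attn_mask)  # tensor([[0., 0.], [0., 1.]])

# Zero out padded slots so downstream math stays finite, as _remove_nans_from_obs does.
for i_agent, agent_obs in enumerate(obs_tensors):
    for o in agent_obs:
        o[attn_mask.bool()[:, i_agent], :] = 0.0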

16 ml-agents/mlagents/trainers/torch/agent_action.py


else:
return torch.empty(0)
def slice(self, start: int, end: int) -> "AgentAction":
"""
Returns an AgentAction with the continuous and discrete tensors sliced
from index start to index end.
"""
_cont = None
_disc_list = []
if self.continuous_tensor is not None:
_cont = self.continuous_tensor[start:end]
if self.discrete_list is not None and len(self.discrete_list) > 0:
for _disc in self.discrete_list:
_disc_list.append(_disc[start:end])
return AgentAction(_cont, _disc_list)
def to_action_tuple(self, clip: bool = False) -> ActionTuple:
"""
Returns an ActionTuple

:return: Tensor of flattened actions.
"""
# if there are any discrete actions, create one-hot
if self.discrete_list is not None and self.discrete_list:
if self.discrete_list is not None and len(self.discrete_list) > 0:
discrete_oh = ModelUtils.actions_to_onehot(
self.discrete_tensor, discrete_branches
)
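A small sketch of what slice() does for a batch of hybrid actions, using plain tensors instead of the real AgentAction class; it reproduces the shapes asserted in test_slice above:

import torch

continuous = torch.tensor([[1.0], [2.0], [3.0]])                     # (batch=3, continuous_size=1)
discrete_list = [torch.tensor([2, 1, 0]), torch.tensor([1, 2, 0])]   # 2 branches, (batch,) each

def slice_action(start: int, end: int):
    # Slice every tensor along the batch dimension so branches stay aligned.
    cont = continuous[start:end]
    disc = [branch[start:end] for branch in discrete_list]
    return cont, disc

cont, disc = slice_action(0, 2)
print(cont.shape)                       # torch.Size([2, 1])
print(torch.stack(disc, dim=-1).shape)  # torch.Size([2, 2])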

6 ml-agents/mlagents/trainers/buffer.py


MASKS = "masks"
MEMORY = "memory"
CRITIC_MEMORY = "critic_memory"
BASELINE_MEMORY = "coma_baseline_memory"
PREV_ACTION = "prev_action"
ADVANTAGES = "advantages"

VALUE_ESTIMATES = "value_estimates"
RETURNS = "returns"
ADVANTAGE = "advantage"
BASELINES = "baselines"
AgentBufferKey = Union[

@staticmethod
def advantage_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.ADVANTAGE, name
@staticmethod
def baseline_estimates_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.BASELINES, name
class AgentBufferField(list):
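The new keys extend the typed buffer-key scheme in which per-reward-signal entries are (prefix, name) tuples. An illustrative sketch of that pattern (only a few members are reproduced, and the standalone helper stands in for the staticmethod shown above):

from enum import Enum
from typing import Tuple, Union

class BufferKey(Enum):
    CRITIC_MEMORY = "critic_memory"
    BASELINE_MEMORY = "coma_baseline_memory"

class RewardSignalKeyPrefix(Enum):
    ADVANTAGE = "advantage"
    BASELINES = "baselines"

AgentBufferKey = Union[BufferKey, Tuple[RewardSignalKeyPrefix, str]]

def baseline_estimates_key(name: str) -> AgentBufferKey:
    # Per-signal keys pair a prefix with the reward signal name.
    return RewardSignalKeyPrefix.BASELINES, name

print(baseline_estimates_key("extrinsic"))  # (<RewardSignalKeyPrefix.BASELINES: 'baselines'>, 'extrinsic')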

28 ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.settings import (
TrainerSettings,
RewardSignalSettings,
RewardSignalType,
)
from mlagents.trainers.torch.utils import ModelUtils

def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass
def create_reward_signals(self, reward_signal_configs):
def create_reward_signals(
self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
) -> None:
"""
Create reward signals
:param reward_signal_configs: Reward signal config.

)
def _evaluate_by_sequence(
self, tensor_obs: List[torch.Tensor], initial_memory: np.ndarray
self, tensor_obs: List[torch.Tensor], initial_memory: torch.Tensor
) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]:
"""
Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the

# Compute values for the potentially truncated initial sequence
seq_obs = []
first_seq_len = self.policy.sequence_length
first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
if leftover > 0:
first_seq_len = leftover
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)

seq_obs = []
for _ in range(self.policy.sequence_length):
all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
seq_obs.append(_obs[start:end])
values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=self.policy.sequence_length
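The leftover/start/end bookkeeping above splits a trajectory into LSTM sequences, with the first, possibly shorter, chunk absorbing the padding. A pure-Python check of those index formulas with made-up numbers:

import math

sequence_length = 4
num_experiences = 10
leftover = num_experiences % sequence_length            # 2
first_seq_len = leftover if leftover > 0 else sequence_length

chunks = [(0, first_seq_len)]
for seq_num in range(1, math.ceil(num_experiences / sequence_length)):
    start = seq_num * sequence_length - (sequence_length - leftover)
    end = (seq_num + 1) * sequence_length - (sequence_length - leftover)
    chunks.append((start, end))

print(chunks)  # [(0, 2), (2, 6), (6, 10)]: a short first chunk, then full sequences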

26 config/ppo/PushBlockCollab.yaml


behaviors:
  PushBlock:
    trainer_type: coma
    hyperparameters:
      batch_size: 1024
      buffer_size: 10240
      learning_rate: 0.0003
      beta: 0.01
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: false
      hidden_units: 256
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 20000000
    time_horizon: 64
    summary_freq: 60000
    threaded: true
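The config above is launched like any other trainer config, for example (the run id is arbitrary):

mlagents-learn config/ppo/PushBlockCollab.yaml --run-id=pushblock_coma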

11 com.unity.ml-agents/Runtime/MultiAgentGroupIdCounter.cs.meta


fileFormatVersion: 2
guid: 5661ffdb6c7704e84bc785572dcd5bd1
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

290 ml-agents/mlagents/trainers/tests/torch/test_coma.py


import pytest
import numpy as np
import attr
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ( # noqa: F401
ppo_dummy_config,
curiosity_dummy_config,
gail_dummy_config,
)
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
@pytest.fixture
def dummy_config():
# coma has the same hyperparameters as ppo for now
return ppo_dummy_config()
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 4
CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))
def create_test_coma_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
mock_specs = mb.setup_test_behavior_specs(
use_discrete,
use_visual,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
)
trainer_settings = attr.evolve(dummy_config)
trainer_settings.reward_signals = {
RewardSignalType.EXTRINSIC: ExtrinsicSettings(strength=1.0, gamma=0.99)
}
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
optimizer = TorchCOMAOptimizer(policy, trainer_settings)
return optimizer
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_coma_optimizer_update(dummy_config, rnn, visual, discrete):
# Test evaluate
optimizer = create_test_coma_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES,
optimizer.policy.behavior_spec,
memory_size=optimizer.policy.m_size,
num_other_agents_in_group=NUM_AGENTS,
)
# Mock out reward signal eval
copy_buffer_fields(
update_buffer,
BufferKey.ENVIRONMENT_REWARDS,
[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.baseline_estimates_key("extrinsic"),
],
)
# Copy memories to critic memories
copy_buffer_fields(
update_buffer,
BufferKey.MEMORY,
[BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY],
)
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# Make sure we have the right stats
required_stats = [
"Losses/Policy Loss",
"Losses/Value Loss",
"Policy/Learning Rate",
"Policy/Epsilon",
"Policy/Beta",
]
for stat in required_stats:
assert stat in return_stats.keys()
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_coma_get_value_estimates(dummy_config, rnn, visual, discrete):
optimizer = create_test_coma_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,
observation_specs=optimizer.policy.behavior_spec.observation_specs,
action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
max_step_complete=True,
num_other_agents_in_group=NUM_AGENTS,
)
(
value_estimates,
baseline_estimates,
value_next,
value_memories,
baseline_memories,
) = optimizer.get_trajectory_and_baseline_value_estimates(
trajectory.to_agentbuffer(),
trajectory.next_obs,
trajectory.next_group_obs,
done=False,
)
for key, val in value_estimates.items():
assert type(key) is str
assert len(val) == 15
for key, val in baseline_estimates.items():
assert type(key) is str
assert len(val) == 15
if value_memories is not None:
assert len(value_memories) == 15
assert len(baseline_memories) == 15
(
value_estimates,
baseline_estimates,
value_next,
value_memories,
baseline_memories,
) = optimizer.get_trajectory_and_baseline_value_estimates(
trajectory.to_agentbuffer(),
trajectory.next_obs,
trajectory.next_group_obs,
done=True,
)
for key, val in value_next.items():
assert type(key) is str
assert val == 0.0
# Check if we ignore terminal states properly
optimizer.reward_signals["extrinsic"].use_terminal_states = False
(
value_estimates,
baseline_estimates,
value_next,
value_memories,
baseline_memories,
) = optimizer.get_trajectory_and_baseline_value_estimates(
trajectory.to_agentbuffer(),
trajectory.next_obs,
trajectory.next_group_obs,
done=False,
)
for key, val in value_next.items():
assert type(key) is str
assert val != 0.0
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811
):
# Test evaluate
dummy_config.reward_signals = curiosity_dummy_config
optimizer = create_test_coma_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES,
optimizer.policy.behavior_spec,
memory_size=optimizer.policy.m_size,
)
# Mock out reward signal eval
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.baseline_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("curiosity"),
RewardSignalUtil.value_estimates_key("curiosity"),
RewardSignalUtil.baseline_estimates_key("curiosity"),
],
)
# Copy memories to critic memories
copy_buffer_fields(
update_buffer,
BufferKey.MEMORY,
[BufferKey.CRITIC_MEMORY, BufferKey.BASELINE_MEMORY],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
dummy_config.reward_signals = gail_dummy_config
config = ppo_dummy_config()
optimizer = create_test_coma_optimizer(
config, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.baseline_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
RewardSignalUtil.baseline_estimates_key("gail"),
],
)
update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
update_buffer[BufferKey.CONTINUOUS_ACTION]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
# Mock out reward signal eval
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.baseline_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
RewardSignalUtil.baseline_estimates_key("gail"),
],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
if __name__ == "__main__":
pytest.main()

0 ml-agents/mlagents/trainers/coma/__init__.py

660 ml-agents/mlagents/trainers/coma/optimizer_torch.py


from typing import Dict, cast, List, Tuple, Optional
import numpy as np
import math
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import (
AgentBuffer,
BufferKey,
RewardSignalUtil,
AgentBufferField,
)
from mlagents_envs.timers import timed
from mlagents_envs.base_env import ObservationSpec, ActionSpec
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import (
ExtrinsicSettings,
RewardSignalSettings,
RewardSignalType,
TrainerSettings,
PPOSettings,
)
from mlagents.trainers.torch.networks import Critic, MultiInputNetworkBody
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil
from mlagents.trainers.settings import NetworkSettings
from mlagents_envs.logging_util import get_logger
logger = get_logger(__name__)
class TorchCOMAOptimizer(TorchOptimizer):
class COMAValueNetwork(torch.nn.Module, Critic):
def __init__(
self,
stream_names: List[str],
observation_specs: List[ObservationSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
):
torch.nn.Module.__init__(self)
self.network_body = MultiInputNetworkBody(
observation_specs, network_settings, action_spec
)
if network_settings.memory is not None:
encoding_size = network_settings.memory.memory_size // 2
else:
encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, 1)
@property
def memory_size(self) -> int:
return self.network_body.memory_size
def update_normalization(self, buffer: AgentBuffer) -> None:
self.network_body.update_normalization(buffer)
def baseline(
self,
self_obs: List[List[torch.Tensor]],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body(
obs_only=self_obs,
obs=obs,
actions=actions,
memories=memories,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.forward(
encoding, memories, sequence_length
)
return value_outputs, critic_mem_out
def critic_pass(
self,
obs: List[List[torch.Tensor]],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
encoding, memories = self.network_body(
obs_only=obs,
obs=[],
actions=[],
memories=memories,
sequence_length=sequence_length,
)
value_outputs, critic_mem_out = self.forward(
encoding, memories, sequence_length
)
return value_outputs, critic_mem_out
def forward(
self,
encoding: torch.Tensor,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
output = self.value_heads(encoding)
return output, memories
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The COMA2 optimizer has a value estimator and a loss function.
:param policy: A TorchPolicy object that will be updated by this Optimizer.
:param trainer_settings: Trainer settings that specify the
properties of the trainer.
"""
# Create the graph here to give more granular control of the TF graph to the Optimizer.
super().__init__(policy, trainer_settings)
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
self._critic = TorchCOMAOptimizer.COMAValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=policy.behavior_spec.action_spec,
)
params = list(self.policy.actor.parameters()) + list(self.critic.parameters())
self.hyperparameters: PPOSettings = cast(
PPOSettings, trainer_settings.hyperparameters
)
self.decay_learning_rate = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.learning_rate,
1e-10,
self.trainer_settings.max_steps,
)
self.decay_epsilon = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.epsilon,
0.1,
self.trainer_settings.max_steps,
)
self.decay_beta = ModelUtils.DecayedValue(
self.hyperparameters.learning_rate_schedule,
self.hyperparameters.beta,
1e-5,
self.trainer_settings.max_steps,
)
self.optimizer = torch.optim.Adam(
params, lr=self.trainer_settings.hyperparameters.learning_rate
)
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.stream_names = list(self.reward_signals.keys())
self.value_memory_dict: Dict[str, torch.Tensor] = {}
self.baseline_memory_dict: Dict[str, torch.Tensor] = {}
def create_reward_signals(
self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings]
) -> None:
"""
Create reward signals. Override default to provide warnings for Curiosity and
GAIL, and make sure Extrinsic adds team rewards.
:param reward_signal_configs: Reward signal config.
"""
for reward_signal, settings in reward_signal_configs.items():
if reward_signal != RewardSignalType.EXTRINSIC:
logger.warning(
f"Reward signal {reward_signal.value.capitalize()} is not supported with the COMA2 trainer; "
"results may be unexpected."
)
elif isinstance(settings, ExtrinsicSettings):
settings.add_groupmate_rewards = True
super().create_reward_signals(reward_signal_configs)
@property
def critic(self):
return self._critic
def coma_value_loss(
self,
values: Dict[str, torch.Tensor],
old_values: Dict[str, torch.Tensor],
returns: Dict[str, torch.Tensor],
epsilon: float,
loss_masks: torch.Tensor,
) -> torch.Tensor:
"""
Evaluates value loss for PPO.
:param values: Value output of the current network.
:param old_values: Value stored with experiences in buffer.
:param returns: Computed returns.
:param epsilon: Clipping value for value estimate.
:param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
"""
value_losses = []
for name, head in values.items():
old_val_tensor = old_values[name]
returns_tensor = returns[name]
clipped_value_estimate = old_val_tensor + torch.clamp(
head - old_val_tensor, -1 * epsilon, epsilon
)
v_opt_a = (returns_tensor - head) ** 2
v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def ppo_policy_loss(
self,
advantages: torch.Tensor,
log_probs: torch.Tensor,
old_log_probs: torch.Tensor,
loss_masks: torch.Tensor,
) -> torch.Tensor:
"""
Evaluate PPO policy loss.
:param advantages: Computed advantages.
:param log_probs: Current policy probabilities
:param old_log_probs: Past policy probabilities
:param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
"""
advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
)
policy_loss = -1 * ModelUtils.masked_mean(
torch.min(p_opt_a, p_opt_b), loss_masks
)
return policy_loss
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Performs update on model.
:param batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
# Get decayed parameters
decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
returns = {}
old_values = {}
old_baseline_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.value_estimates_key(name)]
)
returns[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.returns_key(name)]
)
old_baseline_values[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.baseline_estimates_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
group_obs = GroupObsUtil.from_buffer(batch, n_obs)
group_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
for _groupmate_obs in group_obs
]
act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
actions = AgentAction.from_buffer(batch)
group_actions = AgentAction.group_from_buffer(batch)
memories = [
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
value_memories = [
ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
for i in range(
0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
)
]
baseline_memories = [
ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i])
for i in range(
0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length
)
]
if len(value_memories) > 0:
value_memories = torch.stack(value_memories).unsqueeze(0)
baseline_memories = torch.stack(baseline_memories).unsqueeze(0)
log_probs, entropy = self.policy.evaluate_actions(
current_obs,
masks=act_masks,
actions=actions,
memories=memories,
seq_len=self.policy.sequence_length,
)
all_obs = [current_obs] + group_obs
values, _ = self.critic.critic_pass(
all_obs,
memories=value_memories,
sequence_length=self.policy.sequence_length,
)
baselines, _ = self.critic.baseline(
[current_obs],
group_obs,
group_actions,
memories=baseline_memories,
sequence_length=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
baseline_loss = self.coma_value_loss(
baselines, old_baseline_values, returns, decay_eps, loss_masks
)
value_loss = self.coma_value_loss(
values, old_values, returns, decay_eps, loss_masks
)
policy_loss = self.ppo_policy_loss(
ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
log_probs,
old_log_probs,
loss_masks,
)
loss = (
policy_loss
+ 0.5 * (value_loss + 0.5 * baseline_loss)
- decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
)
# Set optimizer learning rate
ModelUtils.update_learning_rate(self.optimizer, decay_lr)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
update_stats = {
# NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
# TODO: After PyTorch is default, change to something more correct.
"Losses/Policy Loss": torch.abs(policy_loss).item(),
"Losses/Value Loss": value_loss.item(),
"Losses/Baseline Loss": baseline_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,
}
for reward_provider in self.reward_signals.values():
update_stats.update(reward_provider.update(batch))
return update_stats
def get_modules(self):
modules = {"Optimizer": self.optimizer}
for reward_provider in self.reward_signals.values():
modules.update(reward_provider.get_modules())
return modules
def _evaluate_by_sequence_team(
self,
self_obs: List[torch.Tensor],
obs: List[List[torch.Tensor]],
actions: List[AgentAction],
init_value_mem: torch.Tensor,
init_baseline_mem: torch.Tensor,
) -> Tuple[
Dict[str, torch.Tensor],
Dict[str, torch.Tensor],
AgentBufferField,
AgentBufferField,
torch.Tensor,
torch.Tensor,
]:
"""
Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
intermediate memories for the critic.
:param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
observations for this trajectory.
:param initial_memory: The memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
what is returned as the output of a MemoryModules.
:return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial
memories to be used during value function update, and the final memory at the end of the trajectory.
"""
num_experiences = self_obs[0].shape[0]
all_next_value_mem = AgentBufferField()
all_next_baseline_mem = AgentBufferField()
# In the buffer, the 1st sequence is the one that is padded. So if seq_len = 3 and
# trajectory is of length 10, the 1st sequence is [pad,pad,obs].
# Compute the number of elements in this padded seq.
leftover = num_experiences % self.policy.sequence_length
# Compute values for the potentially truncated initial sequence
first_seq_len = leftover if leftover > 0 else self.policy.sequence_length
self_seq_obs = []
team_seq_obs = []
team_seq_act = []
seq_obs = []
for _self_obs in self_obs:
first_seq_obs = _self_obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
self_seq_obs.append(seq_obs)
for team_obs, team_action in zip(obs, actions):
seq_obs = []
for _obs in team_obs:
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
team_seq_obs.append(seq_obs)
_act = team_action.slice(0, first_seq_len)
team_seq_act.append(_act)
# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(init_baseline_mem.squeeze())
)
all_seq_obs = self_seq_obs + team_seq_obs
init_values, _value_mem = self.critic.critic_pass(
all_seq_obs, init_value_mem, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}
init_baseline, _baseline_mem = self.critic.baseline(
self_seq_obs,
team_seq_obs,
team_seq_act,
init_baseline_mem,
sequence_length=first_seq_len,
)
all_baseline = {
signal_name: [init_baseline[signal_name]]
for signal_name in init_baseline.keys()
}
# Evaluate other trajectories, carrying over _mem after each
# trajectory
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
):
for _ in range(self.policy.sequence_length):
all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
all_next_baseline_mem.append(
ModelUtils.to_numpy(_baseline_mem.squeeze())
)
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
self_seq_obs = []
team_seq_obs = []
team_seq_act = []
seq_obs = []
for _self_obs in self_obs:
seq_obs.append(_self_obs[start:end])
self_seq_obs.append(seq_obs)
for team_obs, team_action in zip(obs, actions):
seq_obs = []
for _obs in team_obs:
first_seq_obs = _obs[start:end]
seq_obs.append(first_seq_obs)
team_seq_obs.append(seq_obs)
_act = team_action.slice(start, end)
team_seq_act.append(_act)
all_seq_obs = self_seq_obs + team_seq_obs
values, _value_mem = self.critic.critic_pass(
all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
)
for signal_name, _val in values.items():
all_values[signal_name].append(_val)
baselines, _baseline_mem = self.critic.baseline(
self_seq_obs,
team_seq_obs,
team_seq_act,
_baseline_mem,
sequence_length=self.policy.sequence_length,
)
for signal_name, _val in baselines.items():
all_baseline[signal_name].append(_val)
# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
for signal_name, value_list in all_values.items()
}
all_baseline_tensors = {
signal_name: torch.cat(baseline_list, dim=0)
for signal_name, baseline_list in all_baseline.items()
}
next_value_mem = _value_mem
next_baseline_mem = _baseline_mem
return (
all_value_tensors,
all_baseline_tensors,
all_next_value_mem,
all_next_baseline_mem,
next_value_mem,
next_baseline_mem,
)
def get_trajectory_and_baseline_value_estimates(
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
next_group_obs: List[List[np.ndarray]],
done: bool,
agent_id: str = "",
) -> Tuple[
Dict[str, np.ndarray],
Dict[str, np.ndarray],
Dict[str, float],
Optional[AgentBufferField],
Optional[AgentBufferField],
]:
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
team_obs = GroupObsUtil.from_buffer(batch, n_obs)
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
team_obs = [
[ModelUtils.list_to_tensor(obs) for obs in _teammate_obs]
for _teammate_obs in team_obs
]
team_actions = AgentAction.group_from_buffer(batch)
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
next_group_obs = [
ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_group_obs
]
# Expand dimensions of next critic obs
next_group_obs = [
[_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_group_obs
]
if agent_id in self.value_memory_dict:
# The agent_id should always be in both since they are added together
_init_value_mem = self.value_memory_dict[agent_id]
_init_baseline_mem = self.baseline_memory_dict[agent_id]
else:
_init_value_mem = (
torch.zeros((1, 1, self.critic.memory_size))
if self.policy.use_recurrent
else None
)
_init_baseline_mem = (
torch.zeros((1, 1, self.critic.memory_size))
if self.policy.use_recurrent
else None
)
all_obs = [current_obs] + team_obs if team_obs is not None else [current_obs]
all_next_value_mem: Optional[AgentBufferField] = None
all_next_baseline_mem: Optional[AgentBufferField] = None
with torch.no_grad():
if self.policy.use_recurrent:
(
value_estimates,
baseline_estimates,
all_next_value_mem,
all_next_baseline_mem,
next_value_mem,
next_baseline_mem,
) = self._evaluate_by_sequence_team(
current_obs,
team_obs,
team_actions,
_init_value_mem,
_init_baseline_mem,
)
else:
value_estimates, next_value_mem = self.critic.critic_pass(
all_obs, _init_value_mem, sequence_length=batch.num_experiences
)
baseline_estimates, next_baseline_mem = self.critic.baseline(
[current_obs],
team_obs,
team_actions,
_init_baseline_mem,
sequence_length=batch.num_experiences,
)
# Store the memory for the next trajectory
self.value_memory_dict[agent_id] = next_value_mem
self.baseline_memory_dict[agent_id] = next_baseline_mem
all_next_obs = (
[next_obs] + next_group_obs if next_group_obs is not None else [next_obs]
)
next_value_estimates, _ = self.critic.critic_pass(
all_next_obs, next_value_mem, sequence_length=1
)
for name, estimate in baseline_estimates.items():
baseline_estimates[name] = ModelUtils.to_numpy(estimate)
for name, estimate in value_estimates.items():
value_estimates[name] = ModelUtils.to_numpy(estimate)
# the baseline and V should not be on the same done flag
for name, estimate in next_value_estimates.items():
next_value_estimates[name] = ModelUtils.to_numpy(estimate)
if done:
for k in next_value_estimates:
if not self.reward_signals[k].ignore_done:
next_value_estimates[k][-1] = 0.0
return (
value_estimates,
baseline_estimates,
next_value_estimates,
all_next_value_mem,
all_next_baseline_mem,
)
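coma_value_loss and ppo_policy_loss reuse PPO-style clipping for the centralized value, the baseline, and the policy. A toy demonstration of the clipped policy objective on made-up tensors (the masked mean is inlined):

import torch

advantages = torch.tensor([1.0, -1.0, 0.5])
log_probs = torch.tensor([-0.1, -0.2, -0.3])
old_log_probs = torch.tensor([-0.3, -0.1, -0.3])
loss_masks = torch.tensor([True, True, False])   # last step is padding
epsilon = 0.2

advantage = advantages.unsqueeze(-1)                          # (batch, 1)
r_theta = torch.exp(log_probs - old_log_probs).unsqueeze(-1)  # probability ratio
p_opt_a = r_theta * advantage
p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
surrogate = torch.min(p_opt_a, p_opt_b).squeeze(-1)
policy_loss = -1 * surrogate[loss_masks].mean()               # masked mean over valid steps
print(policy_loss)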

310 ml-agents/mlagents/trainers/coma/trainer.py


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
from typing import cast, Dict
import numpy as np
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.coma.optimizer_torch import TorchCOMAOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
logger = get_logger(__name__)
class COMATrainer(RLTrainer):
"""The COMATrainer is an implementation of the COMA2 algorithm."""
def __init__(
self,
behavior_name: str,
reward_buff_cap: int,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,
artifact_path: str,
):
"""
Responsible for collecting experiences and training the COMA2 model.
:param behavior_name: The name of the behavior associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with
:param artifact_path: The directory within which to store artifacts from this trainer.
"""
super().__init__(
behavior_name,
trainer_settings,
training,
load,
artifact_path,
reward_buff_cap,
)
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
self.seed = seed
self.policy: Policy = None # type: ignore
self.collected_group_rewards: Dict[str, int] = defaultdict(lambda: 0)
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Takes a trajectory and processes it, putting it into the update buffer.
Processing involves calculating value and advantage targets for model updating step.
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
(
value_estimates,
baseline_estimates,
value_next,
value_memories,
baseline_memories,
) = self.optimizer.get_trajectory_and_baseline_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.next_group_obs,
trajectory.all_group_dones_reached
and trajectory.done_reached
and not trajectory.interrupted,
)
if value_memories is not None and baseline_memories is not None:
agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories)
for name, v in value_estimates.items():
agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(
v
)
agent_buffer_trajectory[
RewardSignalUtil.baseline_estimates_key(name)
].extend(baseline_estimates[name])
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
np.mean(baseline_estimates[name]),
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(value_estimates[name]),
)
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
)
self.collected_group_rewards[agent_id] += np.sum(
agent_buffer_trajectory[BufferKey.GROUP_REWARD]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = (
reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
)
agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
evaluate_result
)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
# Compute lambda returns and advantage
tmp_advantages = []
for name in self.optimizer.reward_signals:
local_rewards = np.array(
agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
dtype=np.float32,
)
baseline_estimate = agent_buffer_trajectory[
RewardSignalUtil.baseline_estimates_key(name)
].get_batch()
v_estimates = agent_buffer_trajectory[
RewardSignalUtil.value_estimates_key(name)
].get_batch()
lambd_returns = lambda_return(
r=local_rewards,
value_estimates=v_estimates,
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
value_next=value_next[name],
)
local_advantage = np.array(lambd_returns) - np.array(baseline_estimate)
agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
lambd_returns
)
agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
local_advantage
)
tmp_advantages.append(local_advantage)
# Get global advantages
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
)
# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
self._update_end_episode_stats(agent_id, self.optimizer)
# Remove dead agents from group reward recording
if not trajectory.all_group_dones_reached:
self.collected_group_rewards.pop(agent_id)
# If the whole team is done, average the remaining group rewards.
if trajectory.all_group_dones_reached:
self.stats_reporter.add_stat(
"Environment/Group Cumulative Reward",
self.collected_group_rewards.get(agent_id, 0),
aggregation=StatsAggregationMethod.HISTOGRAM,
)
self.collected_group_rewards.pop(agent_id)
def _is_ready_update(self):
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
buffer_length = self.update_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = np.array(
self.update_buffer[BufferKey.ADVANTAGES].get_batch(), dtype=np.float32
)
self.update_buffer[BufferKey.ADVANTAGES].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
return True
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
"""
Creates a policy with a PyTorch backend and PPO hyperparameters
:param parsed_behavior_id:
:param behavior_spec: specifications for policy construction
:return policy
"""
policy = TorchPolicy(
self.seed,
behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=True, # Match network architecture with TF
)
return policy
def create_coma_optimizer(self) -> TorchCOMAOptimizer:
return TorchCOMAOptimizer( # type: ignore
cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
self.policy = policy
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = self.create_coma_optimizer()
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
self.model_saver.register(self.policy)
self.model_saver.register(self.optimizer)
self.model_saver.initialize_or_load()
# Needed to resume loads properly
self.step = policy.get_current_step()
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
"""
return self.policy
def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0):
returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next
for t in reversed(range(0, r.size - 1)):
returns[t] = (
gamma * lambd * returns[t + 1]
+ r[t]
+ (1 - lambd) * gamma * value_estimates[t + 1]
)
return returns
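As a sanity check on lambda_return above, a tiny hand-computable example with made-up rewards and value estimates:

import numpy as np

r = np.array([1.0, 1.0, 1.0], dtype=np.float32)
value_estimates = np.array([0.5, 0.5, 0.5], dtype=np.float32)
gamma, lambd, value_next = 0.99, 0.8, 0.0

returns = np.zeros_like(r)
returns[-1] = r[-1] + gamma * value_next  # 1.0
for t in reversed(range(0, r.size - 1)):
    returns[t] = (
        gamma * lambd * returns[t + 1]
        + r[t]
        + (1 - lambd) * gamma * value_estimates[t + 1]
    )
print(returns)  # approx [2.60, 1.89, 1.00]; the advantage is then returns - baseline_estimates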
