
action models

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
44c9879e
8 files changed: 230 insertions, 422 deletions
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (75 lines changed)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (11 lines changed)
  3. ml-agents/mlagents/trainers/tests/simple_test_envs.py (21 lines changed)
  4. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (6 lines changed)
  5. ml-agents/mlagents/trainers/torch/distributions.py (37 lines changed)
  6. ml-agents/mlagents/trainers/torch/model_serialization.py (2 lines changed)
  7. ml-agents/mlagents/trainers/torch/networks.py (381 lines changed)
  8. ml-agents/mlagents/trainers/torch/action_models.py (119 lines changed)

ml-agents/mlagents/trainers/policy/torch_policy.py (75 lines changed)


from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.distributions import DistInstance
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,

        self, decision_requests: DecisionSteps
    ) -> Tuple[SplitObservations, np.ndarray]:
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
        mask = None
        if not self.use_continuous_act:
            mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
            if decision_requests.action_mask is not None:
                mask = torch.as_tensor(
                    1 - np.concatenate(decision_requests.action_mask, axis=1)
                )
        # mask = None
        print(self.discrete_act_size)
        mask = torch.ones([len(decision_requests), np.sum(self.discrete_act_size)])
        if decision_requests.action_mask is not None:
            mask = torch.as_tensor(
                1 - np.concatenate(decision_requests.action_mask, axis=1)
            )
        return vec_vis_obs, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:

if self.use_vec_obs and self.normalize:
self.actor_critic.update_normalization(vector_obs)
    def get_actions_and_stats(dists: List[DistInstance]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        action_list = self.actor_critic.sample_action(dists)
        log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
            action_list, dists
        )
        actions = torch.stack(action_list, dim=-1)
        return (
            actions,
            all_logs if all_log_probs else log_probs,
            entropies,
        )
@timed
def sample_actions(
self,

"""
:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
"""
continuous_dists, discrete_dists, value_heads, memories = self.actor_critic.get_dist_and_value(
actions, log_probs, entropies, value_heads, memories = self.actor_critic.get_action_stats_and_value(
continuous_actions, continuous_entropies, continuous_log_probs = self.get_action_and_stats(continuous_dists)
discrete_actions, discrete_entropies, discrete_log_probs = self.get_action_and_stats(discrete_dists)
continuous_actions = continuous_actions[:, :, 0]
discrete_actions = discrete_actions[:, 0, :]
continuous_actions,
continuous_log_probs,
continuous_entropies,
discrete_actions,
discrete_log_probs,
discrete_entropies,
actions,
log_probs,
entropies,
value_heads,
memories,
)

vec_obs: torch.Tensor,
vis_obs: torch.Tensor,
continuous_actions: torch.Tensor,
discrete_actions: torch.Tensor,
actions: torch.Tensor,
continuous_dists, discrete_dists, value_heads, memories = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
vec_obs, vis_obs, actions, masks, memories, seq_len
continuous_action_list = [actions[..., i] for i in range(actions.shape[-1])]
discrete_action_list = [actions[..., i] for i in range(actions.shape[-1])]
continuous_log_probs, continuous_entropies, _ = ModelUtils.get_probs_and_entropy(continuous_action_list, dists)
discrete_log_probs, discrete_entropies, _ = ModelUtils.get_probs_and_entropy(discrete_action_list, dists)
return continuous_log_probs, continuous_entropies, discrete_log_probs, discrete_entropies, value_heads
return log_probs, entropies, value_heads
@timed
def evaluate(

run_out = {}
with torch.no_grad():
continuous_action, continuous_log_probs, continuous_entropy, discrete_action, discrete_log_probs, discrete_entropy, value_heads, memories = self.sample_actions(
action, log_probs, entropy, value_heads, memories = self.sample_actions(
run_out["continuous_action"] = ModelUtils.to_numpy(continuous_action)
run_out["continuous_log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["continuous_entropy"] = ModelUtils.to_numpy(entropy)
run_out["discrete_action"] = ModelUtils.to_numpy(discrete_action)
run_out["discrete_log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["discrete_entropy"] = ModelUtils.to_numpy(entropy)
run_out["action"] = ModelUtils.to_numpy(action)
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["value_heads"] = {
name: ModelUtils.to_numpy(t) for name, t in value_heads.items()
}

decision_requests, global_agent_ids
) # pylint: disable=assignment-from-no-return
self.save_memories(global_agent_ids, run_out.get("memory_out"))
        action = np.concatenate([run_out.get("continuous_action"), run_out.get("discrete_action")], axis=1)
action=action,
action=run_out.get("action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
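To make the concatenation step in evaluate()/get_action() concrete, here is a minimal standalone NumPy sketch, assuming a batch of 2 agents with 2 continuous dimensions and 2 discrete branches; the shapes and values are illustrative, not taken from the commit:

import numpy as np

# Hypothetical outputs of sample_actions for a batch of 2 agents:
continuous_action = np.array([[0.12, -0.53],   # 2 continuous dimensions per agent
                              [0.98, 0.07]], dtype=np.float32)
discrete_action = np.array([[1, 0],            # chosen index for each of 2 discrete branches
                            [0, 1]], dtype=np.float32)

# evaluate() stores both halves in run_out; the policy then flattens them into
# one (batch, continuous_size + num_branches) array handed to the environment.
action = np.concatenate([continuous_action, discrete_action], axis=1)
print(action.shape)  # (2, 4)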

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (11 lines changed)


vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
continuous_actions = ModelUtils.list_to_tensor(batch["actions"][:self.policy.continuous_act_size]).unsqueeze(-1)
discrete_actions = ModelUtils.list_to_tensor(batch["actions"][self.policy.continuous_act_size:], dtype=torch.long)
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
#discrete_actions = ModelUtils.list_to_tensor(batch["actions"][self.policy.continuous_act_size:], dtype=torch.long)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

else:
vis_obs = []
continuous_log_probs, continuous_entropy, discrete_log_probs, discrete_entropy, values = self.policy.evaluate_actions(
log_probs, entropy, values = self.policy.evaluate_actions(
continuous_actions=continuous_actions,
discrete_actions=discrete_actions,
actions=actions,
log_probs = continuous_log_probs + discrete_log_probs
entropy = continuous_entropy + discrete_entropy
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks
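A hedged sketch of the split-and-sum pattern the optimizer update relies on, using plain torch tensors; continuous_act_size and the branch count are made-up values for illustration:

import torch

continuous_act_size = 2            # assumed sizes, not from the commit
num_discrete_branches = 2
batch_actions = torch.randn(8, continuous_act_size + num_discrete_branches)

# Recover the two halves of the hybrid action stored in the update buffer.
continuous_actions = batch_actions[:, :continuous_act_size]
discrete_actions = batch_actions[:, continuous_act_size:].long()

# Per-part log probs would come from evaluate_actions(); dummies stand in here.
continuous_log_probs = torch.randn(8)
discrete_log_probs = torch.randn(8)

# The joint log prob of the hybrid action is the sum of the two parts, since
# both are sampled independently given the same encoding.
log_probs = continuous_log_probs + discrete_log_probs
entropy_total = torch.randn(8) + torch.randn(8)  # same summing idea for entropy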

ml-agents/mlagents/trainers/tests/simple_test_envs.py (21 lines changed)


self.behavior_spec = HybridBehaviorSpec(
self._make_obs_spec(), action_size, tuple(2 for _ in range(action_size))
)
self.action_size = action_size
self.continuous_action = {}
self.discrete_action = {}

for name in self.names:
cont_done = self.continuous_env._take_action(name)
cont_reward = self.continuous_env._compute_reward(name, cont_done)
self.rewards[name] += cont_reward / 2
self.rewards[name] += disc_reward / 2
if all_done:
reward = (cont_reward + disc_reward) / 2
else:
reward = -TIME_PENALTY
self.rewards[name] += reward
name, all_done, cont_reward + disc_reward
name, all_done, reward
self.reset()
super().reset()
def set_actions(self, behavior_name: BehaviorName, action: HybridAction) -> None:
self.continuous_env.set_actions(action.continuous)
self.discrete_env.set_actions(action.discrete)
def set_actions(self, behavior_name: BehaviorName, action) -> None:
continuous_action = action[:, :self.action_size]
discrete_action = action[:, self.action_size:]
self.continuous_env.set_actions(behavior_name, continuous_action)
self.discrete_env.set_actions(behavior_name, discrete_action)
class MemoryEnvironment(SimpleEnvironment):
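A small sketch of the slicing that set_actions performs on the incoming flat action, with an assumed action_size of 3 (matching the test configuration below); the sub-environments are stand-ins:

import numpy as np

action_size = 3                                   # assumed, matches the test below
action = np.zeros((1, 2 * action_size))           # one agent: 3 continuous + 3 discrete

continuous_action = action[:, :action_size]       # forwarded to the continuous sub-env
discrete_action = action[:, action_size:]         # forwarded to the discrete sub-env
assert continuous_action.shape == discrete_action.shape == (1, action_size)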

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (6 lines changed)


# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# config = attr.evolve(PPO_CONFIG)
# _check_environment_trains(env, {BRAIN_NAME: config})
#
env = HybridEnvironment([BRAIN_NAME])
env = HybridEnvironment([BRAIN_NAME], action_size=3)
_check_environment_trains(env, {BRAIN_NAME: config})
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=2.0)
#
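For orientation, a hedged sketch of how the hybrid smoke test could look once the commented lines are consolidated; BRAIN_NAME, PPO_CONFIG, _check_environment_trains and HybridEnvironment are the existing test helpers referenced in this hunk, and the import paths below are assumptions inferred from the file names in the commit:

import attr
# Assumed locations, inferred from the files touched by this commit.
from mlagents.trainers.tests.simple_test_envs import HybridEnvironment
from mlagents.trainers.tests.torch.test_simple_rl import (
    BRAIN_NAME,
    PPO_CONFIG,
    _check_environment_trains,
)

def test_hybrid_ppo():
    env = HybridEnvironment([BRAIN_NAME], action_size=3)
    config = attr.evolve(PPO_CONFIG)
    # Lower success threshold: the hybrid reward averages the continuous and discrete halves.
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=2.0)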

ml-agents/mlagents/trainers/torch/distributions.py (37 lines changed)


distribution = CategoricalDistInstance(norm_logits)
branch_distributions.append(distribution)
return branch_distributions
class OutputDistributions(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        continuous_act_size: int,
        discrete_act_size: List[int],
        conditional_sigma: bool = False,
        tanh_squash: bool = False,
    ):
        self.encoding_size = hidden_size
        self.continuous_distributions: List[GaussianDistribution] = []
        self.discrete_distributions: List[MultiCategoricalDistribution] = []
        if continuous_act_size > 0:
            self.continuous_distributions.append(
                GaussianDistribution(
                    self.encoding_size,
                    continuous_act_size,
                    conditional_sigma=conditional_sigma,
                    tanh_squash=tanh_squash,
                )
            )
        if len(discrete_act_size) > 0:
            self.discrete_distributions.append(
                MultiCategoricalDistribution(self.encoding_size, discrete_act_size)
            )

    def forward(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[List[DistInstance], List[DiscreteDistInstance]]:
        continuous_distributions: List[DistInstance] = []
        discrete_distributions: List[DiscreteDistInstance] = []
        for continuous_dist in self.continuous_distributions:
            continuous_distributions += continuous_dist(inputs)
        for discrete_dist in self.discrete_distributions:
            discrete_distributions += discrete_dist(inputs, masks)
        return continuous_distributions, discrete_distributions
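To make the split between the two distribution families concrete, here is a self-contained sketch that uses torch.distributions in place of the repo's GaussianDistribution and MultiCategoricalDistribution wrappers; the sizes and linear heads are illustrative only:

import torch
from torch.distributions import Normal, Categorical

hidden = torch.randn(4, 16)                       # encoder output for a batch of 4
continuous_act_size, branch_sizes = 2, [3, 2]     # assumed action spec

mu_layer = torch.nn.Linear(16, continuous_act_size)
branch_layers = torch.nn.ModuleList(torch.nn.Linear(16, b) for b in branch_sizes)

# Continuous part: one Gaussian over all continuous dimensions.
gaussian = Normal(mu_layer(hidden), torch.ones(continuous_act_size))
continuous_action = gaussian.sample()

# Discrete part: one categorical per branch (action masking omitted here).
branch_dists = [Categorical(logits=layer(hidden)) for layer in branch_layers]
discrete_action = torch.stack([d.sample() for d in branch_dists], dim=-1)

# Joint log prob = sum over continuous dims + sum over discrete branches.
log_prob = gaussian.log_prob(continuous_action).sum(dim=-1) + sum(
    d.log_prob(a) for d, a in zip(branch_dists, discrete_action.unbind(dim=-1))
)
print(continuous_action.shape, discrete_action.shape, log_prob.shape)  # (4, 2) (4, 2) (4,)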

ml-agents/mlagents/trainers/torch/model_serialization.py (2 lines changed)


for shape in self.policy.behavior_spec.observation_shapes
if len(shape) == 3
]
dummy_masks = torch.ones(batch_dim + [sum(self.policy.actor_critic.act_size)])
dummy_masks = torch.ones(batch_dim + [sum(self.policy.actor_critic.discrete_act_size)])
dummy_memories = torch.zeros(
batch_dim + seq_len_dim + [self.policy.export_memory_size]
)
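A short sketch of why the dummy mask now uses only the discrete branch sizes: the exported model expects one mask entry per possible discrete choice, while continuous dimensions are never masked. The sizes below are assumptions:

import torch

batch_dim = [1]
discrete_act_size = [3, 2]          # assumed branch sizes
continuous_act_size = 2             # continuous dims do not appear in the mask

# One mask slot per possible discrete choice across all branches: 3 + 2 = 5.
dummy_masks = torch.ones(batch_dim + [sum(discrete_act_size)])
print(dummy_masks.shape)            # torch.Size([1, 5])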

ml-agents/mlagents/trainers/torch/networks.py (381 lines changed)


from mlagents.torch_utils import torch, nn
from mlagents_envs.base_env import ActionType
from mlagents.trainers.torch.distributions import OutputDistributions, DistInstance
from mlagents.trainers.torch.distributions import DistInstance
from mlagents.trainers.torch.action_models import HybridActionModel
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads

pass
@abc.abstractmethod
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
"""
Takes a List of Distribution instances and samples an action from each.
"""
pass
@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
"""
Returns distributions from this Actor, from which actions can be sampled.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.
:return: A Tuple of a List of action distribution instances, and memories.
Memories will be None if not using memory.
"""
pass
@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],

pass
@abc.abstractmethod
def get_dist_and_value(
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],

self.discrete_act_size = discrete_act_size
self.continuous_act_size = continuous_act_size
self.version_number = torch.nn.Parameter(torch.Tensor([2.0]))
self.continuous_act_size_vector = torch.nn.Parameter(
torch.Tensor(continuous_act_size)
self.act_size_vector = torch.nn.Parameter(
torch.Tensor(continuous_act_size + len(discrete_act_size))
self.discrete_act_size_vector = torch.nn.Parameter(
torch.Tensor(discrete_act_size)
self.is_continuous_int = torch.nn.Parameter(
torch.Tensor([int(self.continuous_act_size > 0)])
)
self.network_body = NetworkBody(observation_shapes, network_settings)
if network_settings.memory is not None:

self.output_distributions = OutputDistributions(
self.action_model = HybridActionModel(
self.encoding_size,
continuous_act_size,
discrete_act_size,

def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(vector_obs)
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
actions = []
for action_dist in dists:
action = action_dist.sample()
actions.append(action)
return actions
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
continuous_dists, discrete_dists = self.output_distribution(encoding, masks)
return continuous_dists, discrete_dists, memories
def forward(
self,
vec_inputs: List[torch.Tensor],

"""
Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
"""
encoding, memories_out = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=1
)
continuous_dists, discrete_dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
action_out = torch.cat([dist.exported_model_output() for dist in continuous_dists + discrete_dists], dim=1)
action_out = self.action_model.get_action_out(encoding, masks)
return (
action_out,
self.version_number,

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,
network_settings,

)
return self.value_heads(encoding), memories_out
def get_dist_and_value(
def get_stats_and_value(
actions: torch.Tensor,
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs
# TODO: this is just a rehashing of get_dists code
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
continuous_dists, discrete_dists = self.output_distribution(encoding, masks)
action, log_probs, entropies = self.action_model(encoding, masks)
return continuous_dists, discrete_dists, value_outputs, memories
return action, log_probs, entropies, value_outputs, memories
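A compact sketch of the shared pattern used above: a single network-body encoding feeds both the action model and the value head, so sampling and value estimation come out of one forward pass. Everything here (sizes, the toy modules, the continuous-only head) is illustrative, not the repo's implementation:

import torch
from torch import nn
from torch.distributions import Normal

class ToySharedActorCritic(nn.Module):
    def __init__(self, obs_size=8, hidden=16, act_size=2):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_size, hidden), nn.Tanh())
        self.mu = nn.Linear(hidden, act_size)       # action head (continuous only here)
        self.value = nn.Linear(hidden, 1)           # value head

    def get_action_stats_and_value(self, obs):
        encoding = self.body(obs)                   # shared encoding, computed once
        dist = Normal(self.mu(encoding), 1.0)
        action = dist.sample()
        log_prob = dist.log_prob(action).sum(-1)
        entropy = dist.entropy().sum(-1)
        return action, log_prob, entropy, self.value(encoding).squeeze(-1)

model = ToySharedActorCritic()
action, log_prob, entropy, value = model.get_action_stats_and_value(torch.randn(4, 8))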
class SeparateActorCritic(SimpleActor, ActorCritic):

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,
network_settings,

tanh_squash,
)
self.stream_names = stream_names
self.value_heads = ValueHeads(stream_names, self.encoding_size)
print("CREATED", self.memory_size)
self.critic = ValueNetwork(stream_names, observation_shapes, network_settings)
@property
def memory_size(self) -> int:

memories_out = None
return value_outputs, memories_out
def get_dist_and_value(
def get_stats_and_value(
actions: torch.Tensor,
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)

continuous_dists, discrete_dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
memories=actor_mem,
sequence_length=sequence_length,
masks=masks,
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
else:
critic_mem = None
actor_mem = None
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
return continuous_dists, discrete_dists, value_outputs, mem_out
return action, log_probs, entropies, value_outputs, mem_out
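The memory handling above splits one memory tensor into an actor half and a critic half and stitches the outputs back together; a minimal sketch of that split with an assumed memory size:

import torch

memory_size = 8                                   # assumed; the repo validates a multiple of 4
memories = torch.zeros(1, 1, memory_size)         # (batch, seq, memory)

# Front half drives the actor, back half the critic.
actor_mem, critic_mem = torch.split(memories, memory_size // 2, dim=-1)

# After each forward pass the two updated halves are concatenated again.
actor_mem_out, critic_mem_out = actor_mem + 1, critic_mem - 1   # stand-ins for LSTM outputs
mem_out = torch.cat([actor_mem_out, critic_mem_out], dim=-1)
print(actor_mem.shape, mem_out.shape)             # torch.Size([1, 1, 4]) torch.Size([1, 1, 8])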
################################################################################
######### Continuous xor Discrete cases ##########
################################################################################
# class SimpleActor(nn.Module, Actor):
# def __init__(
# self,
# observation_shapes: List[Tuple[int, ...]],
# network_settings: NetworkSettings,
# act_type: ActionType,
# act_size: List[int],
# conditional_sigma: bool = False,
# tanh_squash: bool = False,
# ):
# super().__init__()
# self.act_type = act_type
# self.act_size = act_size
# self.version_number = torch.nn.Parameter(torch.Tensor([2.0]))
# self.is_continuous_int = torch.nn.Parameter(
# torch.Tensor([int(act_type == ActionType.CONTINUOUS)])
# )
# self.act_size_vector = torch.nn.Parameter(torch.Tensor(act_size))
# self.network_body = NetworkBody(observation_shapes, network_settings)
# if network_settings.memory is not None:
# self.encoding_size = network_settings.memory.memory_size // 2
# else:
# self.encoding_size = network_settings.hidden_units
#
# if self.act_type == ActionType.CONTINUOUS:
# self.distribution = GaussianDistribution(
# self.encoding_size,
# act_size[0],
# conditional_sigma=conditional_sigma,
# tanh_squash=tanh_squash,
# )
# else:
# self.distribution = MultiCategoricalDistribution(
# self.encoding_size, act_size
# )
#
# @property
# def memory_size(self) -> int:
# return self.network_body.memory_size
#
# def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
# self.network_body.update_normalization(vector_obs)
#
# def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
# actions = []
# for action_dist in dists:
# action = action_dist.sample()
# actions.append(action)
# return actions
#
# def get_dists(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# masks: Optional[torch.Tensor] = None,
# memories: Optional[torch.Tensor] = None,
# sequence_length: int = 1,
# ) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
# encoding, memories = self.network_body(
# vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
# )
# if self.act_type == ActionType.CONTINUOUS:
# dists = self.distribution(encoding)
# else:
# dists = self.distribution(encoding, masks)
#
# return dists, memories
#
# def forward(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# masks: Optional[torch.Tensor] = None,
# memories: Optional[torch.Tensor] = None,
# ) -> Tuple[torch.Tensor, int, int, int, int]:
# """
# Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
# """
# dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
# action_list = self.sample_action(dists)
# sampled_actions = torch.stack(action_list, dim=-1)
# if self.act_type == ActionType.CONTINUOUS:
# action_out = sampled_actions
# else:
# action_out = dists[0].all_log_prob()
# return (
# action_out,
# self.version_number,
# torch.Tensor([self.network_body.memory_size]),
# self.is_continuous_int,
# self.act_size_vector,
# )
#
#
# class SharedActorCritic(SimpleActor, ActorCritic):
# def __init__(
# self,
# observation_shapes: List[Tuple[int, ...]],
# network_settings: NetworkSettings,
# act_type: ActionType,
# act_size: List[int],
# stream_names: List[str],
# conditional_sigma: bool = False,
# tanh_squash: bool = False,
# ):
# super().__init__(
# observation_shapes,
# network_settings,
# act_type,
# act_size,
# conditional_sigma,
# tanh_squash,
# )
# self.stream_names = stream_names
# self.value_heads = ValueHeads(stream_names, self.encoding_size)
#
# def critic_pass(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# memories: Optional[torch.Tensor] = None,
# sequence_length: int = 1,
# ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
# encoding, memories_out = self.network_body(
# vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
# )
# return self.value_heads(encoding), memories_out
#
# def get_dist_and_value(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# masks: Optional[torch.Tensor] = None,
# memories: Optional[torch.Tensor] = None,
# sequence_length: int = 1,
# ) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
# encoding, memories = self.network_body(
# vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
# )
# if self.act_type == ActionType.CONTINUOUS:
# dists = self.distribution(encoding)
# else:
# dists = self.distribution(encoding, masks=masks)
#
# value_outputs = self.value_heads(encoding)
# return dists, value_outputs, memories
#
#
# class SeparateActorCritic(SimpleActor, ActorCritic):
# def __init__(
# self,
# observation_shapes: List[Tuple[int, ...]],
# network_settings: NetworkSettings,
# act_type: ActionType,
# act_size: List[int],
# stream_names: List[str],
# conditional_sigma: bool = False,
# tanh_squash: bool = False,
# ):
# # Give the Actor only half the memories. Note we previously validate
# # that memory_size must be a multiple of 4.
# self.use_lstm = network_settings.memory is not None
# super().__init__(
# observation_shapes,
# network_settings,
# act_type,
# act_size,
# conditional_sigma,
# tanh_squash,
# )
# self.stream_names = stream_names
# self.critic = ValueNetwork(stream_names, observation_shapes, network_settings)
#
# @property
# def memory_size(self) -> int:
# return self.network_body.memory_size + self.critic.memory_size
#
# def critic_pass(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# memories: Optional[torch.Tensor] = None,
# sequence_length: int = 1,
# ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
# actor_mem, critic_mem = None, None
# if self.use_lstm:
# # Use only the back half of memories for critic
# actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
# value_outputs, critic_mem_out = self.critic(
# vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
# )
# if actor_mem is not None:
# # Make memories with the actor mem unchanged
# memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
# else:
# memories_out = None
# return value_outputs, memories_out
#
# def get_dist_and_value(
# self,
# vec_inputs: List[torch.Tensor],
# vis_inputs: List[torch.Tensor],
# masks: Optional[torch.Tensor] = None,
# memories: Optional[torch.Tensor] = None,
# sequence_length: int = 1,
# ) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
# if self.use_lstm:
# # Use only the back half of memories for critic and actor
# actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
# else:
# critic_mem = None
# actor_mem = None
# dists, actor_mem_outs = self.get_dists(
# vec_inputs,
# vis_inputs,
# memories=actor_mem,
# sequence_length=sequence_length,
# masks=masks,
# )
# value_outputs, critic_mem_outs = self.critic(
# vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
# )
# if self.use_lstm:
# mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)
# else:
# mem_out = None
# return dists, value_outputs, mem_out
#
# def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
# super().update_normalization(vector_obs)
# self.critic.network_body.update_normalization(vector_obs)
#
#
class GlobalSteps(nn.Module):
def __init__(self):
super().__init__()

ml-agents/mlagents/trainers/torch/action_models.py (119 lines changed)


import abc
from typing import List, Tuple
from mlagents.torch_utils import torch, nn
import numpy as np
import math
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance, GaussianDistribution, MultiCategoricalDistribution
from mlagents.trainers.torch.utils import ModelUtils

EPSILON = 1e-7  # Small value to avoid divide by zero


class ActionModel(nn.Module, abc.ABC):
    # @abc.abstractmethod
    # def entropy(self, action_list: np.ndarray) -> torch.Tensor:
    #     pass

    # @abc.abstractmethod
    # def log_probs(self, action_list: np.ndarray) -> torch.Tensor:
    #     pass

    def _sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
        actions = []
        for action_dist in dists:
            action = action_dist.sample()
            actions.append(action)
        return actions

    @abc.abstractmethod
    def forward(self, inputs: torch.Tensor, masks: torch.Tensor):
        pass


class HybridActionModel(ActionModel):
    def __init__(
        self,
        hidden_size: int,
        continuous_act_size: int,
        discrete_act_size: List[int],
        conditional_sigma: bool = False,
        tanh_squash: bool = False,
    ):
        super().__init__()
        self.encoding_size = hidden_size
        self.continuous_act_size = continuous_act_size
        self.discrete_act_size = discrete_act_size
        self.continuous_distribution = None  # : List[GaussianDistribution] = []
        self.discrete_distribution = None  # : List[MultiCategoricalDistribution] = []
        if continuous_act_size > 0:
            self.continuous_distribution = GaussianDistribution(
                self.encoding_size,
                continuous_act_size,
                conditional_sigma=conditional_sigma,
                tanh_squash=tanh_squash,
            )
        if len(discrete_act_size) > 0:
            self.discrete_distribution = MultiCategoricalDistribution(self.encoding_size, discrete_act_size)

    def evaluate(self, inputs: torch.Tensor, masks: torch.Tensor, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        continuous_dists, discrete_dists = self._get_dists(inputs, masks)
        continuous_actions, discrete_actions = torch.split(actions, self.continuous_act_size, dim=1)
        continuous_action_list = [continuous_actions[..., i] for i in range(continuous_actions.shape[-1])]
        continuous_log_probs, continuous_entropies, _ = ModelUtils.get_probs_and_entropy(continuous_action_list, continuous_dists)
        discrete_action_list = [discrete_actions[..., i] for i in range(discrete_actions.shape[-1])]
        discrete_log_probs, discrete_entropies, _ = ModelUtils.get_probs_and_entropy(discrete_action_list, discrete_dists)
        log_probs = torch.add(continuous_log_probs, discrete_log_probs)
        entropies = torch.add(continuous_entropies, discrete_entropies)
        return log_probs, entropies

    def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
        continuous_dists, discrete_dists = self._get_dists(inputs, masks)
        dists = continuous_dists + discrete_dists
        return torch.cat([dist.exported_model_output() for dist in dists], dim=1)

    def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[List[DistInstance], List[DiscreteDistInstance]]:
        # continuous_distributions: List[DistInstance] = []
        # discrete_distributions: List[DiscreteDistInstance] = []
        continuous_dist_instances = self.continuous_distribution(inputs)  # for continuous_dist in self.continuous_distributions]
        discrete_dist_instances = self.discrete_distribution(inputs, masks)  # for discrete_dist in self.discrete_distributions]
        # for continuous_dist in self.continuous_distributions:
        #     continuous_distributions += continuous_dist(inputs)
        # for discrete_dist in self.discrete_distributions:
        #     discrete_distributions += discrete_dist(inputs, masks)
        return continuous_dist_instances, discrete_dist_instances

    def forward(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        continuous_dists, discrete_dists = self._get_dists(inputs, masks)
        continuous_action_list = self._sample_action(continuous_dists)
        continuous_entropies, continuous_log_probs, continuous_all_probs = ModelUtils.get_probs_and_entropy(
            continuous_action_list, continuous_dists
        )
        continuous_actions = torch.stack(continuous_action_list, dim=-1)
        continuous_actions = continuous_actions[:, :, 0]
        discrete_action_list = self._sample_action(discrete_dists)
        discrete_entropies, discrete_log_probs, discrete_all_probs = ModelUtils.get_probs_and_entropy(
            discrete_action_list, discrete_dists
        )
        discrete_actions = torch.stack(discrete_action_list, dim=-1)
        discrete_actions = discrete_actions[:, 0, :]
        action = torch.cat([continuous_actions, discrete_actions.type(torch.float)], axis=1)
        log_probs = torch.add(continuous_log_probs, discrete_log_probs)
        entropies = torch.add(continuous_entropies, discrete_entropies)
        # print("ac", action)
        # print("clp", continuous_log_probs)
        # print("dlp", discrete_log_probs)
        # print("lp", log_probs)
        # print("en", entropies)
        return (action, log_probs, entropies)
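The indexing in forward() is easiest to follow with concrete shapes; below is a standalone sketch of the same stack/squeeze/concat handling with dummy sampled tensors (the batch size, 2 continuous dimensions, and branch sizes 3 and 2 are assumptions):

import torch

batch = 4
# A Gaussian sample is one (batch, continuous_size) tensor, so stacking the
# single-element list adds a trailing dim that is immediately squeezed off.
continuous_action_list = [torch.randn(batch, 2)]
continuous_actions = torch.stack(continuous_action_list, dim=-1)[:, :, 0]   # (4, 2)

# Each discrete branch sample is a (batch, 1) tensor of indices; stacking the
# branches and dropping the middle dim gives (batch, num_branches).
discrete_action_list = [torch.randint(0, 3, (batch, 1)), torch.randint(0, 2, (batch, 1))]
discrete_actions = torch.stack(discrete_action_list, dim=-1)[:, 0, :]       # (4, 2)

# The hybrid action handed back to the policy is a single float tensor.
action = torch.cat([continuous_actions, discrete_actions.float()], dim=1)
print(action.shape)  # torch.Size([4, 4])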