
move action model to explicit distributions

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
6174c428
4 files changed, with 76 additions and 155 deletions
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (35 changes)
  2. ml-agents/mlagents/trainers/torch/action_model.py (115 changes)
  3. ml-agents/mlagents/trainers/torch/distributions.py (6 changes)
  4. ml-agents/mlagents/trainers/torch/utils.py (75 changes)
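
In summary, this commit replaces the ActionModel's single list of mixed distribution modules with explicitly named continuous and discrete distributions, bundled into a DistInstances NamedTuple when the model is evaluated. The snippet below is a minimal, self-contained sketch of that pattern written against plain torch.distributions; ToyDistInstances, ToyActionModel, and their fields are illustrative stand-ins, not the ML-Agents classes shown in the diff.

from typing import List, NamedTuple, Optional
import torch
from torch import nn
from torch.distributions import Categorical, Normal


class ToyDistInstances(NamedTuple):
    # Mirrors the DistInstances idea: an optional continuous distribution plus
    # one categorical distribution per discrete branch, held in named fields.
    continuous: Optional[Normal]
    discrete: Optional[List[Categorical]]


class ToyActionModel(nn.Module):
    def __init__(self, hidden_size: int, continuous_size: int, discrete_branches: List[int]):
        super().__init__()
        self.continuous_size = continuous_size
        self.discrete_branches = discrete_branches
        # Explicit heads instead of one ModuleList mixing both action kinds.
        self.mu = nn.Linear(hidden_size, continuous_size) if continuous_size else None
        self.log_sigma = nn.Parameter(torch.zeros(1, continuous_size)) if continuous_size else None
        self.branch_heads = nn.ModuleList(nn.Linear(hidden_size, b) for b in discrete_branches)

    def get_dists(self, encoding: torch.Tensor) -> ToyDistInstances:
        continuous = None
        discrete = None
        if self.continuous_size > 0:
            continuous = Normal(self.mu(encoding), self.log_sigma.exp())
        if self.discrete_branches:
            discrete = [Categorical(logits=head(encoding)) for head in self.branch_heads]
        return ToyDistInstances(continuous, discrete)


model = ToyActionModel(hidden_size=16, continuous_size=2, discrete_branches=[3, 5])
dists = model.get_dists(torch.randn(4, 16))
cont_action = dists.continuous.sample()              # shape (4, 2)
disc_actions = [d.sample() for d in dists.discrete]  # two tensors of shape (4,)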

ml-agents/mlagents/trainers/policy/torch_policy.py (35 changes)


         )
         return (actions, log_probs, entropies, value_heads, memories)
-        # if memories is None:
-        #     dists, memories = self.actor_critic.get_dists(
-        #         vec_obs, vis_obs, masks, memories, seq_len
-        #     )
-        # else:
-        #     # If we're using LSTM, we need to execute the values to get the critic memories
-        #     dists, _, memories = self.actor_critic.get_dist_and_value(
-        #         vec_obs, vis_obs, masks, memories, seq_len
-        #     )
-        # action_list = self.actor_critic.sample_action(dists)
-        # log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
-        #     action_list, dists
-        # )
-        # actions = AgentAction.create(action_list, self.behavior_spec.action_spec)
-        # log_probs = ActionLogProbs.create(
-        #     log_probs_list, self.behavior_spec.action_spec, all_logs_list
-        # )
-        # # Use the sum of entropy across actions, not the mean
-        # entropy_sum = torch.sum(entropies, dim=1)
-        # return (actions, log_probs, entropy_sum, memories)
     def evaluate_actions(
         self,
         vec_obs: torch.Tensor,

             vec_obs, vis_obs, actions, masks, memories, seq_len
         )
         return log_probs, entropies, value_heads
-        # dists, value_heads, _ = self.actor_critic.get_dist_and_value(
-        #     vec_obs, vis_obs, masks, memories, seq_len
-        # )
-        # action_list = actions.to_tensor_list()
-        # log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
-        #     action_list, dists
-        # )
-        # log_probs = ActionLogProbs.create(
-        #     log_probs_list, self.behavior_spec.action_spec
-        # )
-        # # Use the sum of entropy across actions, not the mean
-        # entropy_sum = torch.sum(entropies, dim=1)
-        # return log_probs, entropy_sum, value_heads
     @timed
     def evaluate(

ml-agents/mlagents/trainers/torch/action_model.py (115 changes)


-from typing import List, Tuple
+from typing import List, Tuple, NamedTuple, Optional
     DiscreteDistInstance,
-from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
+from mlagents.trainers.torch.utils import AgentAction, ActionLogProbs
+class DistInstances(NamedTuple):
+    continuous: DistInstance
+    discrete: List[DiscreteDistInstance]
 class ActionModel(nn.Module):
     def __init__(
         self,

         super().__init__()
         self.encoding_size = hidden_size
         self.action_spec = action_spec
-        self._distributions = torch.nn.ModuleList()
+        self._continuous_distribution = None
+        self._discrete_distribution = None
-            self._distributions.append(
-                GaussianDistribution(
-                    self.encoding_size,
-                    self.action_spec.continuous_size,
-                    conditional_sigma=conditional_sigma,
-                    tanh_squash=tanh_squash,
-                )
+            self._continuous_distribution = GaussianDistribution(
+                self.encoding_size,
+                self.action_spec.continuous_size,
+                conditional_sigma=conditional_sigma,
+                tanh_squash=tanh_squash,
-            self._distributions.append(
-                MultiCategoricalDistribution(
-                    self.encoding_size, self.action_spec.discrete_branches
-                )
+            self._discrete_distribution = MultiCategoricalDistribution(
+                self.encoding_size, self.action_spec.discrete_branches
-    def _sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
+    def _sample_action(self, dists: DistInstances) -> AgentAction:
-        actions = []
-        for action_dist in dists:
-            action = action_dist.sample()
-            actions.append(action)
-        return actions
+        continuous_action: Optional[torch.Tensor] = None
+        discrete_action: Optional[List[torch.Tensor]] = None
+        if self.action_spec.continuous_size > 0:
+            continuous_action = dists.continuous.sample()
+        if self.action_spec.discrete_size > 0:
+            discrete_action = []
+            for discrete_dist in dists.discrete:
+                discrete_action.append(discrete_dist.sample())
+        return AgentAction(continuous_action, discrete_action)
+    def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances:
+        continuous_dist: Optional[DistInstance] = None
+        discrete_dist: Optional[List[DiscreteDistInstance]] = None
+        if self.action_spec.continuous_size > 0:
+            continuous_dist = self._continuous_distribution(inputs, masks)
+        if self.action_spec.discrete_size > 0:
+            discrete_dist = self._discrete_distribution(inputs, masks)
+        return DistInstances(continuous_dist, discrete_dist)
+    def _get_probs_and_entropy(
+        self, actions: AgentAction, dists: DistInstances
+    ) -> Tuple[ActionLogProbs, torch.Tensor]:
-    def _get_dists(
-        self, inputs: torch.Tensor, masks: torch.Tensor
-    ) -> List[DistInstance]:
-        distribution_instances: List[DistInstance] = []
-        for distribution in self._distributions:
-            dist_instances = distribution(inputs, masks)
-            for dist_instance in dist_instances:
-                distribution_instances.append(dist_instance)
-        return distribution_instances
+        entropies_list: List[torch.Tensor] = []
+        continuous_log_prob: Optional[torch.Tensor] = None
+        discrete_log_probs: Optional[List[torch.Tensor]] = None
+        all_discrete_log_probs: Optional[List[torch.Tensor]] = None
+        if self.action_spec.continuous_size > 0:
+            continuous_log_prob = dists.continuous.log_prob(actions.continuous_tensor)
+            entropies_list.append(dists.continuous.entropy())
+        if self.action_spec.discrete_size > 0:
+            discrete_log_probs = []
+            all_discrete_log_probs = []
+            for discrete_action, discrete_dist in zip(
+                actions.discrete_list, dists.discrete
+            ):
+                discrete_log_prob = discrete_dist.log_prob(discrete_action)
+                entropies_list.append(discrete_dist.entropy())
+                discrete_log_probs.append(discrete_log_prob)
+                all_discrete_log_probs.append(discrete_dist.all_log_prob())
+        action_log_probs = ActionLogProbs(
+            continuous_log_prob, discrete_log_probs, all_discrete_log_probs
+        )
+        entropies = torch.cat(entropies_list, dim=1)
+        return action_log_probs, entropies
-        action_list = actions.to_tensor_list()
-        log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
-            action_list, dists
-        )
-        log_probs = ActionLogProbs.create(log_probs_list, self.action_spec)
+        log_probs, entropies = self._get_probs_and_entropy(actions, dists)
         # Use the sum of entropy across actions, not the mean
         entropy_sum = torch.sum(entropies, dim=1)
         return log_probs, entropy_sum

-        return torch.cat([dist.exported_model_output() for dist in dists], dim=1)
+        out_list: List[torch.Tensor] = []
+        if self.action_spec.continuous_size > 0:
+            out_list.append(dists.continuous.exported_model_output())
+        if self.action_spec.discrete_size > 0:
+            for discrete_dist in dists.discrete:
+                out_list.append(discrete_dist.exported_model_output())
+        return torch.cat(out_list, dim=1)
-        action_list = self._sample_action(dists)
-        log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
-            action_list, dists
-        )
-        actions = AgentAction.create(action_list, self.action_spec)
-        log_probs = ActionLogProbs.create(
-            log_probs_list, self.action_spec, all_logs_list
-        )
+        actions = self._sample_action(dists)
+        log_probs, entropies = self._get_probs_and_entropy(actions, dists)
         # Use the sum of entropy across actions, not the mean
         entropy_sum = torch.sum(entropies, dim=1)
         return (actions, log_probs, entropy_sum)
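
Both the old and the new code keep the comment "Use the sum of entropy across actions, not the mean": per-dimension continuous entropies and per-branch discrete entropies are concatenated along dim=1 and then summed, since the joint entropy of independent action parts is the sum of their marginal entropies. A rough illustration with plain torch.distributions follows; the shapes used here are assumptions for the example, not the exact ML-Agents tensor shapes.

import torch
from torch.distributions import Categorical, Normal

batch = 4
# Two continuous action dimensions and two discrete branches.
cont = Normal(torch.zeros(batch, 2), torch.ones(batch, 2))
branch_a = Categorical(logits=torch.zeros(batch, 3))
branch_b = Categorical(logits=torch.zeros(batch, 5))

# After unsqueezing the per-branch entropies, every term has one column per
# action dimension/branch, so concatenating on dim=1 lines them up per agent.
entropies = torch.cat(
    [cont.entropy(), branch_a.entropy().unsqueeze(1), branch_b.entropy().unsqueeze(1)],
    dim=1,
)
entropy_sum = torch.sum(entropies, dim=1)  # one summed entropy per agent, shape (batch,)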

ml-agents/mlagents/trainers/torch/distributions.py (6 changes)


 import abc
-from typing import List, Tuple
+from typing import List
 from mlagents.torch_utils import torch, nn
 import numpy as np
 import math

         # verified version of Barracuda (1.0.2).
         log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
         if self.tanh_squash:
-            return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
+            return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
-        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
+        return GaussianDistInstance(mu, torch.exp(log_sigma))
 class MultiCategoricalDistribution(nn.Module):
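
The distributions.py change makes GaussianDistribution return a single distribution instance rather than a one-element list, which is what lets the continuous field of DistInstances hold it directly. Below is a hypothetical stand-alone sketch of a head with that calling convention; ContinuousHead is an illustrative name, not ML-Agents code.

import torch
from torch import nn
from torch.distributions import Normal


class ContinuousHead(nn.Module):
    """Toy stand-in for GaussianDistribution: one head, one distribution."""

    def __init__(self, hidden_size: int, act_size: int):
        super().__init__()
        self.mu = nn.Linear(hidden_size, act_size)
        self.log_sigma = nn.Parameter(torch.zeros(1, act_size))

    def forward(self, encoding: torch.Tensor) -> Normal:
        # Returns the distribution instance directly rather than [Normal],
        # mirroring the calling-convention change in this commit.
        return Normal(self.mu(encoding), self.log_sigma.exp())


head = ContinuousHead(hidden_size=16, act_size=2)
dist = head(torch.randn(4, 16))  # no [0] indexing or iteration needed
action = dist.sample()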

ml-agents/mlagents/trainers/torch/utils.py (75 changes)


 from mlagents.trainers.settings import EncoderType, ScheduleType
 from mlagents.trainers.exception import UnityTrainerException
 from mlagents_envs.base_env import ActionSpec
-from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance
 class AgentAction(NamedTuple):

         )
         return array_dict
-    def to_tensor_list(self) -> List[torch.Tensor]:
-        """
-        Returns the tensors in the AgentAction as a flat List of torch Tensors. This will be removed
-        when the ActionModel is merged.
-        """
-        tensor_list: List[torch.Tensor] = []
-        if self.continuous_tensor is not None:
-            tensor_list.append(self.continuous_tensor)
-        if self.discrete_list is not None:
-            tensor_list += (
-                self.discrete_list
-            )  # Note this is different for ActionLogProbs
-        return tensor_list
-    @staticmethod
-    def create(
-        tensor_list: List[torch.Tensor], action_spec: ActionSpec
-    ) -> "AgentAction":
-        """
-        A static method that converts a list of torch Tensors into an AgentAction using the ActionSpec.
-        This will change (and may be removed) in the ActionModel.
-        """
-        continuous: torch.Tensor = None
-        discrete: List[torch.Tensor] = None  # type: ignore
-        _offset = 0
-        if action_spec.continuous_size > 0:
-            continuous = tensor_list[0]
-            _offset = 1
-        if action_spec.discrete_size > 0:
-            discrete = tensor_list[_offset:]
-        return AgentAction(continuous, discrete)
     @staticmethod
     def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
         """

             self.continuous_tensor
         )
         if self.discrete_list is not None:
             array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
         return array_dict

         if self.continuous_tensor is not None:
             tensor_list.append(self.continuous_tensor)
         if self.discrete_list is not None:
-            tensor_list.append(
-                self.discrete_tensor
-            )  # Note this is different for AgentActions
+            tensor_list.append(self.discrete_tensor)
         return tensor_list
     def flatten(self) -> torch.Tensor:

         """
         return torch.cat(self._to_tensor_list(), dim=1)
-    @staticmethod
-    def create(
-        log_prob_list: List[torch.Tensor],
-        action_spec: ActionSpec,
-        all_log_prob_list: List[torch.Tensor] = None,
-    ) -> "ActionLogProbs":
-        """
-        A static method that converts a list of torch Tensors into an ActionLogProbs using the ActionSpec.
-        This will change (and may be removed) in the ActionModel.
-        """
-        continuous: torch.Tensor = None
-        discrete: List[torch.Tensor] = None  # type: ignore
-        _offset = 0
-        if action_spec.continuous_size > 0:
-            continuous = log_prob_list[0]
-            _offset = 1
-        if action_spec.discrete_size > 0:
-            discrete = log_prob_list[_offset:]
-        return ActionLogProbs(continuous, discrete, all_log_prob_list)
     @staticmethod
     def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":

         for i in range(num_partitions):
             res += [data[(partitions == i).nonzero().squeeze(1)]]
         return res
-    @staticmethod
-    def get_probs_and_entropy(
-        action_list: List[torch.Tensor], dists: List[DistInstance]
-    ) -> Tuple[List[torch.Tensor], torch.Tensor, Optional[List[torch.Tensor]]]:
-        log_probs_list = []
-        all_probs_list = []
-        entropies_list = []
-        for action, action_dist in zip(action_list, dists):
-            log_prob = action_dist.log_prob(action)
-            log_probs_list.append(log_prob)
-            entropy = action_dist.entropy()
-            entropies_list.append(entropy)
-            if isinstance(action_dist, DiscreteDistInstance):
-                all_probs_list.append(action_dist.all_log_prob())
-        entropies = torch.cat(entropies_list, dim=1)
-        return log_probs_list, entropies, all_probs_list
     @staticmethod
     def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
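
With ModelUtils.get_probs_and_entropy and the temporary create/to_tensor_list helpers removed, AgentAction and ActionLogProbs remain plain NamedTuples whose flatten-style helpers concatenate their tensors along dim=1. A small, hypothetical sketch of that shape convention follows; ToyLogProbs and its field shapes are illustrative, not the ML-Agents class.

from typing import List, NamedTuple, Optional
import torch


class ToyLogProbs(NamedTuple):
    # Same idea as ActionLogProbs: one (batch, n) tensor for the continuous
    # part, plus one tensor per discrete branch, kept in named fields.
    continuous: Optional[torch.Tensor]
    discrete: Optional[List[torch.Tensor]]

    def flatten(self) -> torch.Tensor:
        tensors: List[torch.Tensor] = []
        if self.continuous is not None:
            tensors.append(self.continuous)
        if self.discrete is not None:
            tensors.extend(self.discrete)
        return torch.cat(tensors, dim=1)


lp = ToyLogProbs(torch.zeros(4, 2), [torch.zeros(4, 1), torch.zeros(4, 1)])
print(lp.flatten().shape)  # torch.Size([4, 4])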
