
1:1 and continuous/discrete train

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
06f1f254
6 files changed, 87 insertions and 171 deletions
  1. ml-agents/mlagents/trainers/tests/simple_test_envs.py (126 changes)
  2. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (32 changes)
  3. ml-agents/mlagents/trainers/torch/action_model.py (80 changes)
  4. ml-agents/mlagents/trainers/torch/distributions.py (15 changes)
  5. ml-agents/mlagents/trainers/torch/networks.py (4 changes)
  6. ml-agents/mlagents/trainers/torch/utils.py (1 change)

ml-agents/mlagents/trainers/tests/simple_test_envs.py (126 changes)


    def __init__(
        self,
        brain_names,
        use_discrete,
        action_size=1,
        continuous_action_size=1,
        discrete_action_size=1,
        self.discrete = use_discrete
        if use_discrete:
            action_spec = ActionSpec.create_discrete(
                tuple(2 for _ in range(action_size))
            )
        discrete_tuple = tuple(2 for _ in range(discrete_action_size))
        if continuous_action_size > 0:
            if discrete_action_size > 0:
                action_spec = ActionSpec(continuous_action_size, discrete_tuple)
            else:
                action_spec = ActionSpec.create_continuous(continuous_action_size)
            action_spec = ActionSpec.create_continuous(action_size)
            action_spec = ActionSpec.create_discrete(discrete_tuple)
        self.total_action_size = (
            continuous_action_size + discrete_action_size
        )  # to set the goals/positions
        self.action_spec = action_spec
        self.action_size = action_size
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}

        return done

    def _generate_mask(self):
        if self.discrete:
        action_mask = None
        if self.action_spec.discrete_size > 0:
            ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
            ndmask = np.array(
                2 * self.action_spec.discrete_size * [False], dtype=np.bool
            )
        else:
            action_mask = None
        return action_mask

    def _compute_reward(self, name: str, done: bool) -> float:

    def _reset_agent(self, name):
        self.goal[name] = self.random.choice([-1, 1])
        self.positions[name] = [0.0 for _ in range(self.action_size)]
        self.positions[name] = [0.0 for _ in range(self.total_action_size)]
        self.step_count[name] = 0
        self.rewards[name] = 0
        self.agent_id[name] = self.agent_id[name] + 1

    def close(self):
        pass

class HybridEnvironment(SimpleEnvironment):
    def __init__(
        self,
        brain_names,
        step_size=STEP_SIZE,
        num_visual=0,
        num_vector=1,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        continuous_action_size=1,
        discrete_action_size=1,
    ):
        super().__init__(brain_names, False)
        self.continuous_env = SimpleEnvironment(
            brain_names,
            False,
            step_size,
            num_visual,
            num_vector,
            vis_obs_size,
            vec_obs_size,
            continuous_action_size,
        )
        self.discrete_env = SimpleEnvironment(
            brain_names,
            True,
            step_size,
            num_visual,
            num_vector,
            vis_obs_size,
            vec_obs_size,
            discrete_action_size,
        )
        super().__init__(
            brain_names,
            True,  # This is needed for env to generate masks correctly
            step_size=step_size,
            num_visual=num_visual,
            num_vector=num_vector,
            action_size=discrete_action_size,  # This is needed for env to generate masks correctly
        )
        # Number of steps to reveal the goal for. Lower is harder. Should be
        # less than 1/step_size to force agent to use memory
        self.behavior_spec = BehaviorSpec(
            self._make_obs_spec(),
            ActionSpec(
                continuous_action_size, tuple(2 for _ in range(discrete_action_size))
            ),
        )
        self.continuous_action_size = continuous_action_size
        self.discrete_action_size = discrete_action_size
        self.continuous_action = {}
        self.discrete_action = {}

    def step(self) -> None:
        assert all(action is not None for action in self.continuous_env.action.values())
        assert all(action is not None for action in self.discrete_env.action.values())
        for name in self.names:
            cont_done = self.continuous_env._take_action(name)
            disc_done = self.discrete_env._take_action(name)
            all_done = cont_done and disc_done
            if all_done:
                reward = 0
                for _pos in (
                    self.continuous_env.positions[name]
                    + self.discrete_env.positions[name]
                ):
                    reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
                        self.continuous_env.positions[name]
                        + self.discrete_env.positions[name]
                    )
            else:
                reward = -TIME_PENALTY
            self.rewards[name] += reward
            self.step_result[name] = self._make_batched_step(name, all_done, reward)

    def reset(self) -> None:  # type: ignore
        super().reset()
        self.continuous_env.reset()
        self.discrete_env.reset()
        self.continuous_env.goal = self.goal
        self.discrete_env.goal = self.goal

    def set_actions(self, behavior_name: BehaviorName, action) -> None:
        # print(action, self.goal[behavior_name])
        continuous_action = action[:, : self.continuous_action_size]
        discrete_action = action[:, self.continuous_action_size :]
        self.continuous_env.set_actions(behavior_name, continuous_action)
        self.discrete_env.set_actions(behavior_name, discrete_action)

class MemoryEnvironment(SimpleEnvironment):
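
Note on the hunk above: SimpleEnvironment now builds its ActionSpec from separate continuous and discrete sizes instead of a single use_discrete flag, which is why the test hunk below constructs SimpleEnvironment with continuous_action_size/discrete_action_size directly. Below is a minimal sketch of the three construction paths, assuming the ActionSpec API used in the new lines (mlagents_envs.base_env.ActionSpec with a continuous size plus a tuple of discrete branch sizes); make_action_spec is a hypothetical helper name, not part of the commit.

from mlagents_envs.base_env import ActionSpec

def make_action_spec(continuous_action_size: int, discrete_action_size: int) -> ActionSpec:
    # Mirrors the branching in the new SimpleEnvironment.__init__ above.
    discrete_tuple = tuple(2 for _ in range(discrete_action_size))
    if continuous_action_size > 0 and discrete_action_size > 0:
        # Hybrid spec: continuous actions plus 2-way discrete branches.
        return ActionSpec(continuous_action_size, discrete_tuple)
    if continuous_action_size > 0:
        return ActionSpec.create_continuous(continuous_action_size)
    return ActionSpec.create_discrete(discrete_tuple)

# The "1:1" case from the commit message: one continuous action and one 2-way discrete branch.
spec = make_action_spec(1, 1)
assert spec.continuous_size == 1 and spec.discrete_branches == (2,)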

ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (32 changes)


from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    HybridEnvironment,
    MemoryEnvironment,
    RecordEnvironment,
)

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)

# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_ppo(use_discrete):
#     env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
#     config = attr.evolve(PPO_TORCH_CONFIG)
#     _check_environment_trains(env, {BRAIN_NAME: config})

    env = HybridEnvironment(
    env = SimpleEnvironment(
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
    )

    env = HybridEnvironment(
    env = SimpleEnvironment(
        [BRAIN_NAME], continuous_action_size=1, discrete_action_size=0, step_size=0.8
    )
    config = attr.evolve(PPO_TORCH_CONFIG)

def test_dischybrid_ppo():
    env = HybridEnvironment(
    env = SimpleEnvironment(
        [BRAIN_NAME], continuous_action_size=0, discrete_action_size=1, step_size=0.8
    )
    config = attr.evolve(PPO_TORCH_CONFIG)

def test_3cdhybrid_ppo():
    env = HybridEnvironment([BRAIN_NAME], continuous_action_size=2, discrete_action_size=1, step_size=0.8)
    env = SimpleEnvironment(
        [BRAIN_NAME], continuous_action_size=2, discrete_action_size=1, step_size=0.8
    )
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
    )
    env = HybridEnvironment(
    env = SimpleEnvironment(
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
    )
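
Note on the hunk above: the hybrid PPO tests swap HybridEnvironment for SimpleEnvironment. A sketch of what the 1 continuous / 1 discrete ("1:1") case presumably looks like after this change; it reuses only names already visible in this file (BRAIN_NAME, PPO_TORCH_CONFIG, and _check_environment_trains come from the test module's existing imports), and the step_size value is illustrative rather than taken from the commit.

def test_hybrid_ppo():
    # One continuous action plus one 2-way discrete branch in a single environment.
    env = SimpleEnvironment(
        [BRAIN_NAME], continuous_action_size=1, discrete_action_size=1, step_size=0.8
    )
    config = attr.evolve(PPO_TORCH_CONFIG)
    _check_environment_trains(env, {BRAIN_NAME: config})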

ml-agents/mlagents/trainers/torch/action_model.py (80 changes)


import numpy as np
import math
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance, GaussianDistribution, MultiCategoricalDistribution
from mlagents.trainers.torch.distributions import (
    DistInstance,
    DiscreteDistInstance,
    GaussianDistribution,
    MultiCategoricalDistribution,
)
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs

class ActionModel(nn.Module):
    def __init__(

    ):
        super().__init__()
        self.encoding_size = hidden_size
        self.continuous_act_size = action_spec.continuous_action_size
        self.discrete_act_branches = action_spec.discrete_action_branches
        self.discrete_act_size = action_spec.discrete_action_size
        self._split_list: List[int] = []
        if self.continuous_act_size > 0:
            self._distributions.append(GaussianDistribution(
        if self.action_spec.continuous_size > 0:
            self._distributions.append(
                GaussianDistribution(
                self.continuous_act_size,
                    self.action_spec.continuous_size,
            self._split_list.append(self.continuous_act_size)
        if self.discrete_act_size > 0:
            self._distributions.append(MultiCategoricalDistribution(self.encoding_size, self.discrete_act_branches))
            self._split_list += [1 for _ in range(self.discrete_act_size)]
        if self.action_spec.discrete_size > 0:
            self._distributions.append(
                MultiCategoricalDistribution(
                    self.encoding_size, self.action_spec.discrete_branches
                )
            )

    def _sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
        """

            actions.append(action)
        return actions

    def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[List[DistInstance], List[DiscreteDistInstance]]:
    def _get_dists(
        self, inputs: torch.Tensor, masks: torch.Tensor
    ) -> Tuple[List[DistInstance], List[DiscreteDistInstance]]:
        distribution_instances: List[DistInstance] = []
        for distribution in self._distributions:
            dist_instances = distribution(inputs, masks)

    def evaluate(self, inputs: torch.Tensor, masks: torch.Tensor, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    def evaluate(
        self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction
    ) -> Tuple[ActionLogProbs, torch.Tensor]:
        split_actions = torch.split(actions, self._split_list, dim=1)
        action_lists: List[torch.Tensor] = []
        for split_action in split_actions:
            action_list = [split_action[..., i] for i in range(split_action.shape[-1])]
            action_lists += action_list
        log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_lists, dists)
        return log_probs, entropies
        action_list = actions.to_tensor_list()
        log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
            action_list, dists
        )
        log_probs = ActionLogProbs.create(log_probs_list, self.action_spec)
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)
        return log_probs, entropy_sum

    def forward(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    def forward(
        self, inputs: torch.Tensor, masks: torch.Tensor
    ) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]:
        action_outs: List[torch.Tensor] = []
        action_lists = self._sample_action(dists)
        for action_list, dist in zip(action_lists, dists):
            action_out = action_list.unsqueeze(-1)
            action_outs.append(dist.structure_action(action_out))
        log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_lists, dists)
        action = torch.cat(action_outs, dim=1)
        return (action, log_probs, entropies)
        action_list = self._sample_action(dists)
        log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
            action_list, dists
        )
        actions = AgentAction.create(action_list, self.action_spec)
        log_probs = ActionLogProbs.create(
            log_probs_list, self.action_spec, all_logs_list
        )
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)
        return (actions, log_probs, entropies)
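
Note on the hunk above: evaluate() now returns the entropy summed over action branches rather than the per-branch mean (see the "Use the sum of entropy across actions, not the mean" comment), and forward() computes the same entropy_sum. A small self-contained illustration of the difference; the tensor values below are made up for the example.

import torch

# One batch element with per-branch entropies, e.g. one Gaussian and two categorical branches.
entropies = torch.tensor([[1.4, 0.7, 0.7]])

entropy_sum = torch.sum(entropies, dim=1)    # tensor([2.8000]) - summed across branches
entropy_mean = torch.mean(entropies, dim=1)  # tensor([0.9333]) - per-branch average, not used here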

ml-agents/mlagents/trainers/torch/distributions.py (15 changes)


"""
        pass

    @abc.abstractmethod
    def structure_action(self, action: torch.Tensor) -> torch.Tensor:
        """
        Return the structured action to be passed to the trainer
        """
        pass

class DiscreteDistInstance(DistInstance):
    @abc.abstractmethod

    def exported_model_output(self):
        return self.sample()

    def structure_action(self, action):
        return action[:, :, 0]

class TanhGaussianDistInstance(GaussianDistInstance):
    def __init__(self, mean, std):

        ).squeeze(-1)

    def log_prob(self, value):
        return torch.log(self.pdf(value)).unsqueeze(-1)
        return torch.log(self.pdf(value))

    def all_log_prob(self):
        return torch.log(self.probs)

    def exported_model_output(self):
        return self.all_log_prob()

    def structure_action(self, action):
        return action[:, 0, :].type(torch.float)

class GaussianDistribution(nn.Module):

ml-agents/mlagents/trainers/torch/networks.py (4 changes)


        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[
        List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor
        AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
    ]:
        """
        Returns distributions, from which actions can be sampled, and value estimates.

        :param masks: If using discrete actions, a Tensor of action masks.
        :param memories: If using memory, a Tensor of initial memories.
        :param sequence_length: If using memory, the sequence length.
        :return: A Tuple of a List of action distribution instances, a Dict of reward signal
        :return: A Tuple of AgentAction, ActionLogProbs, entropies, Dict of reward signal
            name to value estimate, and memories. Memories will be None if not using memory.
        """
        pass

ml-agents/mlagents/trainers/torch/utils.py (1 change)


            entropies_list.append(entropy)
            if isinstance(action_dist, DiscreteDistInstance):
                all_probs_list.append(action_dist.all_log_prob())
        print(entropies_list)
        entropies = torch.stack(entropies_list, dim=-1)
        if not all_probs_list:
            entropies = entropies.squeeze(-1)
