
ppo extended

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
643c8e58
7 files changed, with 68 insertions and 46 deletions
  1. ml-agents-envs/mlagents_envs/base_env.py (2 changes)
  2. ml-agents/mlagents/trainers/policy/policy.py (3 changes)
  3. ml-agents/mlagents/trainers/policy/torch_policy.py (71 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (14 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (1 change)
  6. ml-agents/mlagents/trainers/torch/distributions.py (2 changes)
  7. ml-agents/mlagents/trainers/torch/networks.py (21 changes)

ml-agents-envs/mlagents_envs/base_env.py (2 changes)


CONTINUOUS = 1
class HybridBehaviorSpec(NamedTuple):
class BehaviorSpec(NamedTuple):
observation_shapes: List[Tuple]
continuous_action_shape: int
discrete_action_shape: Tuple[int, ...]
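The hunk above shows only the new spec fields. As a rough sketch of how such a hybrid spec could expose the sizes the policy later queries, assuming the continuous_action_size() / discrete_action_size() helpers exist (their names and return values are assumptions here, not the committed API):

from typing import List, NamedTuple, Tuple

class HybridBehaviorSpecSketch(NamedTuple):
    # Sketch only: field names mirror the diff; the helper methods are assumed.
    observation_shapes: List[Tuple]
    continuous_action_shape: int            # number of continuous action values
    discrete_action_shape: Tuple[int, ...]  # size of each discrete branch

    def continuous_action_size(self) -> int:
        return self.continuous_action_shape

    def discrete_action_size(self) -> int:
        return len(self.discrete_action_shape)

    @property
    def discrete_action_branches(self) -> Tuple[int, ...]:
        return self.discrete_action_shape

A policy could then read spec.continuous_action_size() and spec.discrete_action_size(), which is what the policy.py hunk below does.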

ml-agents/mlagents/trainers/policy/policy.py (3 changes)


self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
# For hybrid
self.continuous_act_size = behavior_spec.continuous_action_size()
self.discrete_act_size = behavior_spec.discrete_action_size()
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()

ml-agents/mlagents/trainers/policy/torch_policy.py (71 changes)


observation_shapes=self.behavior_spec.observation_shapes,
network_settings=trainer_settings.network_settings,
act_type=behavior_spec.action_type,
act_size=self.act_size,
continuous_act_size=self.continuous_act_size,
discrete_act_size=self.discrete_act_size,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,

if self.use_vec_obs and self.normalize:
self.actor_critic.update_normalization(vector_obs)
def get_action_and_stats(
self, dists: List[DistInstance], all_log_probs: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
action_list, dists
)
actions = torch.stack(action_list, dim=-1)
return (
actions,
all_logs if all_log_probs else log_probs,
entropies,
)
@timed
def sample_actions(
self,

"""
:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
"""
dists, value_heads, memories = self.actor_critic.get_dist_and_value(
continuous_dists, discrete_dists, value_heads, memories = self.actor_critic.get_dist_and_value(
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
action_list, dists
)
actions = torch.stack(action_list, dim=-1)
if self.use_continuous_act:
actions = actions[:, :, 0]
else:
actions = actions[:, 0, :]
continuous_actions, continuous_log_probs, continuous_entropies = self.get_action_and_stats(continuous_dists)
discrete_actions, discrete_log_probs, discrete_entropies = self.get_action_and_stats(discrete_dists)
continuous_actions = continuous_actions[:, :, 0]
discrete_actions = discrete_actions[:, 0, :]
actions,
all_logs if all_log_probs else log_probs,
entropies,
continuous_actions,
continuous_log_probs,
continuous_entropies,
discrete_actions,
discrete_log_probs,
discrete_entropies,
value_heads,
memories,
)

vec_obs: torch.Tensor,
vis_obs: torch.Tensor,
actions: torch.Tensor,
continuous_actions: torch.Tensor,
discrete_actions: torch.Tensor,
dists, value_heads, _ = self.actor_critic.get_dist_and_value(
continuous_dists, discrete_dists, value_heads, memories = self.actor_critic.get_dist_and_value(
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
continuous_action_list = [continuous_actions[..., i] for i in range(continuous_actions.shape[-1])]
discrete_action_list = [discrete_actions[..., i] for i in range(discrete_actions.shape[-1])]
continuous_log_probs, continuous_entropies, _ = ModelUtils.get_probs_and_entropy(continuous_action_list, continuous_dists)
discrete_log_probs, discrete_entropies, _ = ModelUtils.get_probs_and_entropy(discrete_action_list, discrete_dists)
return log_probs, entropies, value_heads
return continuous_log_probs, continuous_entropies, discrete_log_probs, discrete_entropies, value_heads
@timed
def evaluate(

run_out = {}
with torch.no_grad():
action, log_probs, entropy, value_heads, memories = self.sample_actions(
continuous_action, continuous_log_probs, continuous_entropy, discrete_action, discrete_log_probs, discrete_entropy, value_heads, memories = self.sample_actions(
run_out["action"] = ModelUtils.to_numpy(action)
# Todo - make pre_action difference
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["continuous_action"] = ModelUtils.to_numpy(continuous_action)
run_out["continuous_log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["continuous_entropy"] = ModelUtils.to_numpy(entropy)
run_out["discrete_action"] = ModelUtils.to_numpy(discrete_action)
run_out["discrete_log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["discrete_entropy"] = ModelUtils.to_numpy(entropy)
run_out["value_heads"] = {
name: ModelUtils.to_numpy(t) for name, t in value_heads.items()
}

decision_requests, global_agent_ids
) # pylint: disable=assignment-from-no-return
self.save_memories(global_agent_ids, run_out.get("memory_out"))
action = np.concatenate([run_out.get("continuous_action"), run_out.get("discrete_action")], axis=1)
action=run_out.get("action"),
action=action,
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
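For context on the np.concatenate call above: get_action() now has to merge the two per-family outputs into the single action array the rest of the pipeline expects. A minimal sketch of that merge, assuming continuous actions are float32 of shape (batch, n_continuous) and discrete branch choices are integers of shape (batch, n_branches):

import numpy as np

def combine_hybrid_actions(continuous_action: np.ndarray, discrete_action: np.ndarray) -> np.ndarray:
    # Cast the discrete branch indices to float so both halves can live in
    # one array; the consumer is assumed to split them again by size.
    return np.concatenate([continuous_action, discrete_action.astype(np.float32)], axis=1)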

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (14 changes)


vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
continuous_actions = ModelUtils.list_to_tensor(batch["actions"])[:, :self.policy.continuous_act_size].unsqueeze(-1)
discrete_actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)[:, self.policy.continuous_act_size:]
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

vis_obs.append(vis_ob)
else:
vis_obs = []
log_probs, entropy, values = self.policy.evaluate_actions(
continuous_log_probs, continuous_entropy, discrete_log_probs, discrete_entropy, values = self.policy.evaluate_actions(
actions=actions,
continuous_actions=continuous_actions,
discrete_actions=discrete_actions,
log_probs = continuous_log_probs + discrete_log_probs
entropy = continuous_entropy + discrete_entropy
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks
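One note on the continuous_log_probs + discrete_log_probs line above: the two tensors only add elementwise if the continuous size and the number of discrete branches happen to match. A hedged alternative is to reduce each family to a joint per-sample log-probability first; this sketch describes one way to do that under those assumed shapes, not the committed behaviour:

import torch

def joint_hybrid_log_prob(continuous_log_probs: torch.Tensor, discrete_log_probs: torch.Tensor) -> torch.Tensor:
    # continuous_log_probs: (batch, n_continuous); discrete_log_probs: (batch, n_branches).
    # Continuous dimensions and discrete branches are conditionally independent
    # given the encoding, so the joint log-probability is the sum over all of them.
    return continuous_log_probs.sum(dim=1) + discrete_log_probs.sum(dim=1)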

ml-agents/mlagents/trainers/ppo/trainer.py (1 change)


behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=behavior_spec.is_action_continuous(),
)
return policy

ml-agents/mlagents/trainers/torch/distributions.py (2 changes)


return branch_distributions
class HybridDistribution(nn.Module):
class OutputDistributions(nn.Module):
def __init__(
self,
hidden_size: int,
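Only the first lines of the renamed OutputDistributions module are visible in this hunk. As a sketch of what a module producing both heads could look like (this uses torch.distributions rather than ml-agents' own GaussianDistribution / MultiCategoricalDistribution classes, and all layer names are assumptions):

import torch
from torch import nn
from typing import List, Tuple

class OutputDistributionsSketch(nn.Module):
    """One Gaussian head for the continuous part, one categorical head per discrete branch."""

    def __init__(self, hidden_size: int, continuous_size: int, branch_sizes: List[int]):
        super().__init__()
        self.mu = nn.Linear(hidden_size, continuous_size)
        self.log_sigma = nn.Parameter(torch.zeros(1, continuous_size))
        self.branch_logits = nn.ModuleList(
            nn.Linear(hidden_size, size) for size in branch_sizes
        )

    def forward(self, encoding: torch.Tensor) -> Tuple[torch.distributions.Distribution, List[torch.distributions.Distribution]]:
        continuous_dist = torch.distributions.Normal(self.mu(encoding), self.log_sigma.exp())
        # Action masking, passed in via `masks` elsewhere in the diff, is not handled here.
        discrete_dists = [
            torch.distributions.Categorical(logits=layer(encoding)) for layer in self.branch_logits
        ]
        return continuous_dist, discrete_dists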

ml-agents/mlagents/trainers/torch/networks.py (21 changes)


from mlagents.torch_utils import torch, nn
from mlagents_envs.base_env import ActionType
from mlagents.trainers.torch.distributions import HybridDistribution, DistInstance
from mlagents.trainers.torch.distributions import OutputDistributions, DistInstance
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
"""
Returns continuous and discrete distributions, from which actions can be sampled, and value estimates.
If memory is enabled, return the memories as well.

self.discrete_act_size = discrete_act_size
self.continuous_act_size = continuous_act_size
self.version_number = torch.nn.Parameter(torch.Tensor([2.0]))
# self.is_continuous_int = torch.nn.Parameter(
# torch.Tensor([int(act_type == ActionType.CONTINUOUS)])
# )
self.continuous_act_size_vector = torch.nn.Parameter(
torch.Tensor(continuous_act_size)
)

else:
self.encoding_size = network_settings.hidden_units
self.distribution = HybridDistribution(
self.output_distributions = OutputDistributions(
self.encoding_size,
continuous_act_size[0],
discrete_act_size,

encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
continuous_dists, discrete_dists = self.distribution(encoding, masks)
continuous_dists, discrete_dists = self.output_distributions(encoding, masks)
return continuous_dists, discrete_dists, memories
def forward(

)
class SharedActorCritic(HybridSimpleActor, ActorCritic):
class SharedActorCritic(SimpleActor, ActorCritic):
def __init__(
self,
observation_shapes: List[Tuple[int, ...]],

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
continuous_dists, discrete_dists = self.distribution(encoding, masks)
continuous_dists, discrete_dists = self.output_distributions(encoding, masks)
class SeparateActorCritic(HybridSimpleActor, ActorCritic):
class SeparateActorCritic(SimpleActor, ActorCritic):
def __init__(
self,
observation_shapes: List[Tuple[int, ...]],

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[List[DistInstance], List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
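Since get_dist_and_value now returns two distribution lists, every caller has to unpack four values. A small sketch of a caller under that assumed signature (the actor_critic object and argument names are taken from the hunks above; nothing here is part of the commit itself):

def sample_hybrid_step(actor_critic, vec_inputs, vis_inputs, masks=None, memories=None):
    continuous_dists, discrete_dists, value_heads, memories = actor_critic.get_dist_and_value(
        vec_inputs, vis_inputs, masks=masks, memories=memories, sequence_length=1
    )
    # Sample each family separately; the policy layer above is responsible
    # for stacking and re-joining them.
    continuous_actions = [dist.sample() for dist in continuous_dists]
    discrete_actions = [dist.sample() for dist in discrete_dists]
    return continuous_actions, discrete_actions, value_heads, memories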
