
Continuous and discrete now train

/develop/add-fire
Arthur Juliani, 4 years ago
Current commit a11a79e4
4 changed files with 54 additions and 20 deletions
  1. ml-agents/mlagents/trainers/distributions_torch.py (21 changes)
  2. ml-agents/mlagents/trainers/models_torch.py (15 changes)
  3. ml-agents/mlagents/trainers/policy/torch_policy.py (33 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changes)

ml-agents/mlagents/trainers/distributions_torch.py (21 changes)


 import torch
 from torch import nn
 from torch import distributions
 import numpy as np
 EPSILON = 1e-6  # Small value to avoid divide by zero

 class MultiCategoricalDistribution(nn.Module):
     def __init__(self, hidden_size, act_sizes):
         super(MultiCategoricalDistribution, self).__init__()
-        self.branches = self.create_policy_branches(hidden_size, act_sizes)
+        self.act_sizes = act_sizes
+        self.branches = self.create_policy_branches(hidden_size)

-    def create_policy_branches(self, hidden_size, act_sizes):
+    def create_policy_branches(self, hidden_size):
-        for size in act_sizes:
+        for size in self.act_sizes:
             branch_output_layer = nn.Linear(hidden_size, size)
             nn.init.xavier_uniform_(branch_output_layer.weight, gain=0.01)
             branches.append(branch_output_layer)

         normalized_logits = torch.log(normalized_probs)
         return normalized_logits

+    def split_masks(self, masks):
+        split_masks = []
+        for idx, _ in enumerate(self.act_sizes):
+            start = int(np.sum(self.act_sizes[:idx]))
+            end = int(np.sum(self.act_sizes[: idx + 1]))
+            split_masks.append(masks[:, start:end])
+        return split_masks

-        for branch in self.branches:
-            norm_logits = self.mask_branch(logits, masks)
+        masks = self.split_masks(masks)
+        for idx, branch in enumerate(self.branches):
+            norm_logits = self.mask_branch(logits, masks[idx])
             distribution = distributions.categorical.Categorical(logits=norm_logits)
             branch_distributions.append(distribution)
         return branch_distributions
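
To illustrate the new masking flow: the distribution now receives one concatenated mask tensor per batch and splits it into per-branch slices before building each Categorical. The sketch below reproduces only that slicing and gating behaviour; the batch size and branch sizes are illustrative assumptions, not taken from the commit.

import numpy as np
import torch
from torch import distributions

act_sizes = [3, 2]                       # two discrete branches (assumed sizes)
masks = torch.ones(4, sum(act_sizes))    # batch of 4, everything allowed
masks[:, 1] = 0                          # forbid action 1 of the first branch

split = []
for idx, _ in enumerate(act_sizes):      # same slicing as split_masks above
    start = int(np.sum(act_sizes[:idx]))
    end = int(np.sum(act_sizes[: idx + 1]))
    split.append(masks[:, start:end])

# Each per-branch mask can then gate that branch's logits before the
# Categorical is built, e.g. by re-normalizing the masked probabilities.
logits = torch.zeros(4, act_sizes[0])
probs = torch.softmax(logits, dim=-1) * split[0]
norm_logits = torch.log(probs / probs.sum(dim=-1, keepdim=True) + 1e-6)
dist = distributions.Categorical(logits=norm_logits)
print(dist.probs[0])                     # action 1 now has ~zero probability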

ml-agents/mlagents/trainers/models_torch.py (15 changes)


     DISCRETE = "discrete"
     CONTINUOUS = "continuous"

+    @staticmethod
+    def from_str(label):
+        if label in "continuous":
+            return ActionType.CONTINUOUS
+        elif label in "discrete":
+            return ActionType.DISCRETE
+        else:
+            raise NotImplementedError

 class LearningRateSchedule(Enum):
     CONSTANT = "constant"
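
For reference, the new from_str helper maps the trainer's action-type string onto the enum. Note that it uses a substring test (label in "continuous") rather than equality, so partial strings also match. The snippet below is self-contained and only restates the hunk above to show that behaviour.

from enum import Enum

class ActionType(Enum):
    DISCRETE = "discrete"
    CONTINUOUS = "continuous"

    @staticmethod
    def from_str(label):
        if label in "continuous":        # substring test, not equality
            return ActionType.CONTINUOUS
        elif label in "discrete":
            return ActionType.DISCRETE
        else:
            raise NotImplementedError

print(ActionType.from_str("continuous"))  # ActionType.CONTINUOUS
print(ActionType.from_str("discrete"))    # ActionType.DISCRETE
print(ActionType.from_str("tin"))         # also ActionType.CONTINUOUS (substring quirk)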

         use_lstm,
     ):
         super(Actor, self).__init__()
-        self.act_type = act_type
+        self.act_type = ActionType.from_str(act_type)
         self.act_size = act_size
         self.network_body = NetworkBody(
             vector_sizes,

             use_lstm,
         )
         if self.act_type == ActionType.CONTINUOUS:
-            self.distribution = GaussianDistribution(h_size, act_size)
+            self.distribution = GaussianDistribution(h_size, act_size[0])
         else:
             self.distribution = MultiCategoricalDistribution(h_size, act_size)
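
The act_size handling differs between the two branches of the conditional above: for a continuous space act_size appears to be a one-element list holding the action dimension (hence act_size[0]), while for a discrete space it lists the per-branch sizes. A rough, self-contained sketch of what the two heads amount to; the layer and distribution choices here are illustrative stand-ins, not the ml-agents classes.

import torch
from torch import nn, distributions

h_size = 16
hidden = torch.zeros(1, h_size)          # stand-in for the network body's output

# Continuous: a single Gaussian over act_size[0] action dimensions.
act_size = [2]
mu = nn.Linear(h_size, act_size[0])(hidden)
gaussian = distributions.Normal(mu, torch.ones_like(mu))

# Discrete: one Categorical per branch listed in act_size.
act_size = [3, 2]
branch_logits = [nn.Linear(h_size, size)(hidden) for size in act_size]
categoricals = [distributions.Categorical(logits=l) for l in branch_logits]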

         self.layers = [nn.Linear(input_size, hidden_size)]
         for _ in range(num_layers - 1):
             self.layers.append(nn.Linear(hidden_size, hidden_size))
-            self.layers.append(nn.Tanh())
+            self.layers.append(nn.ReLU())
         self.layers = nn.ModuleList(self.layers)

     def forward(self, inputs):
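
The hidden layers of the shared body now use ReLU instead of Tanh. A minimal sketch of a body of this shape follows; the class name and the layer arrangement are assumptions for illustration, and nn.ModuleList is what registers the layers as submodules so their weights show up in .parameters().

import torch
from torch import nn

class SimpleBody(nn.Module):             # illustrative name, not the ml-agents class
    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        layers = [nn.Linear(input_size, hidden_size)]
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(hidden_size, hidden_size))
            layers.append(nn.ReLU())
        self.layers = nn.ModuleList(layers)   # register submodules for .parameters()

    def forward(self, inputs):
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x

print(SimpleBody(4, 8, 3)(torch.zeros(1, 4)).shape)   # torch.Size([1, 8])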

ml-agents/mlagents/trainers/policy/torch_policy.py (33 changes)


         self.critic.network_body.update_normalization(vector_obs)
         self.actor.network_body.update_normalization(vector_obs)

-    def execute_model(self, vec_obs, vis_obs, masks=None):
+    def execute_model(self, vec_obs, vis_obs, masks=None, actions=None):
-        actions = []
+        if actions is None:
+            generate_actions = True
+            actions = []
+        else:
+            generate_actions = False

-        for action_dist in action_dists:
-            action = action_dist.sample()
-            actions.append(action)
+        for idx, action_dist in enumerate(action_dists):
+            if generate_actions:
+                action = action_dist.sample()
+                actions.append(action)
+            else:
+                action = actions[idx]

-        actions = torch.stack(actions)
-        log_probs = torch.stack(log_probs).squeeze(0)
-        entropies = torch.stack(entropies).squeeze(0)
+        if generate_actions:
+            actions = torch.stack(actions, dim=-1)
+        log_probs = torch.stack(log_probs, dim=-1)
+        entropies = torch.stack(entropies, dim=-1)
         if self.act_type == "continuous":
+            if generate_actions:
                 actions = actions.squeeze(-1)
             log_probs = log_probs.squeeze(-1)
             entropies = entropies.squeeze(-1)
         value_heads, mean_value = self.critic(vec_obs, vis_obs)
         return actions, log_probs, entropies, value_heads
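
The reworked execute_model supports two modes: when no actions are passed it samples them (acting), and when actions are passed it only evaluates their log-probabilities and entropies under the current policy, which is what the PPO update needs. A self-contained sketch of that pattern with a stand-in distribution; the shapes are assumed.

import torch
from torch import distributions

logits = torch.zeros(4, 3, requires_grad=True)   # batch of 4, one 3-way branch
dist = distributions.Categorical(logits=logits)

# Acting: no actions supplied, so sample fresh ones.
sampled = dist.sample()                          # shape (4,)
log_probs_sampled = dist.log_prob(sampled)

# Updating: re-evaluate actions stored in the buffer under the current policy.
stored = torch.tensor([0, 2, 1, 0])
log_probs_stored = dist.log_prob(stored)         # gradients flow back to logits
entropy = dist.entropy()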

         vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
         vec_obs = [torch.Tensor(vec_obs)]
         vis_obs = [torch.Tensor(vis_ob) for vis_ob in vis_obs]
-        masks = torch.Tensor(masks)
+        if masks is not None:
+            masks = torch.Tensor(masks)
         run_out = {}
         action, log_probs, entropy, value_heads = self.execute_model(
             vec_obs, vis_obs, masks
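
get_action now guards the mask conversion, since decision requests may carry no action mask at all (continuous action spaces have none). A trivial sketch of the guard; the mask shape is an assumption.

import numpy as np
import torch

for masks in (None, np.ones((1, 5), dtype=np.float32)):
    if masks is not None:
        masks = torch.Tensor(masks)
    print(type(masks).__name__)   # NoneType, then Tensor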

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (5 changes)


         vec_obs = np.array(batch["vector_obs"])
         vec_obs = [torch.Tensor(vec_obs)]
         act_masks = torch.Tensor(np.array(batch["action_mask"]))
+        actions = [torch.Tensor(np.array(batch["actions"]))]
-        actions, log_probs, entropy, values = self.policy.execute_model(
-            vec_obs, vis_obs, act_masks
+        _, log_probs, entropy, values = self.policy.execute_model(
+            vec_obs, vis_obs, act_masks, actions
         )
         value_loss = self.ppo_value_loss(values, old_values, returns)
         policy_loss = self.ppo_policy_loss(
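
Passing the stored batch actions back into execute_model matters because PPO's clipped surrogate needs the log-probabilities of those same actions under the updated policy, to form a ratio against the old log-probabilities. A minimal sketch of that loss; the function name and epsilon value are illustrative, not the ml-agents implementation.

import torch

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, epsilon=0.2):
    # r_t = pi_new(a|s) / pi_old(a|s), computed in log space for stability
    ratio = torch.exp(log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -torch.min(unclipped, clipped).mean()

# Example: identical policies give ratio 1, so the loss is just -mean(advantage).
adv = torch.tensor([1.0, -0.5])
lp = torch.log(torch.tensor([0.4, 0.6]))
print(clipped_surrogate_loss(lp, lp, adv))   # tensor(-0.2500)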
