Compare commits

...
This merge request has changes that conflict with the target branch.
/config/ppo/3DBall.yaml
/Project/ProjectSettings/ProjectVersion.txt
/ml-agents/mlagents/trainers/optimizer/optimizer.py
/ml-agents/mlagents/trainers/policy/policy.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/trainer/trainer.py
/ml-agents/mlagents/trainers/trainer/rl_trainer.py
/ml-agents/mlagents/trainers/distributions_torch.py
/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
/ml-agents/mlagents/trainers/policy/torch_policy.py
/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
/ml-agents/mlagents/trainers/models.py
/ml-agents/mlagents/trainers/tests/test_ppo.py
/ml-agents/mlagents/trainers/tests/test_reward_signals.py
/ml-agents/mlagents/trainers/policy/tf_policy.py
/ml-agents/mlagents/trainers/ppo/optimizer_tf.py

2 commits

Author       SHA1       Message                   Commit date
Ervin Teng   7830319f   Move test_env_perf file   4 years ago
Ervin Teng   f214836a   Changes for speed test    5 years ago
21 files changed, with 1,378 additions and 133 deletions
  1. config/ppo/3DBall.yaml (1 changed line)
  2. Project/ProjectSettings/EditorBuildSettings.asset (5 changed lines)
  3. Project/ProjectSettings/ProjectVersion.txt (2 changed lines)
  4. Project/ProjectSettings/UnityConnectSettings.asset (2 changed lines)
  5. ml-agents/mlagents/trainers/models.py (4 changed lines)
  6. ml-agents/mlagents/trainers/trainer/rl_trainer.py (4 changed lines)
  7. ml-agents/mlagents/trainers/trainer/trainer.py (14 changed lines)
  8. ml-agents/mlagents/trainers/tests/test_ppo.py (4 changed lines)
  9. ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changed lines)
  10. ml-agents/mlagents/trainers/optimizer/optimizer.py (7 changed lines)
  11. ml-agents/mlagents/trainers/policy/tf_policy.py (111 changed lines)
  12. ml-agents/mlagents/trainers/policy/policy.py (128 changed lines)
  13. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (4 changed lines)
  14. ml-agents/mlagents/trainers/ppo/trainer.py (67 changed lines)
  15. ml-agents/mlagents/trainers/distributions_torch.py (69 changed lines)
  16. ml-agents/mlagents/trainers/models_torch.py (496 changed lines)
  17. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (115 changed lines)
  18. ml-agents/mlagents/trainers/policy/torch_policy.py (300 changed lines)
  19. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (145 changed lines)
  20. test_env_perf.py (29 changed lines)
  21. /ml-agents/mlagents/trainers/ppo/optimizer_tf.py (0 changed lines, file renamed)

config/ppo/3DBall.yaml (1 changed line)


summary_freq: 12000
use_recurrent: false
vis_encode_type: simple
threaded: false
reward_signals:
extrinsic:
strength: 1.0

Project/ProjectSettings/EditorBuildSettings.asset (5 changed lines)


EditorBuildSettings:
m_ObjectHideFlags: 0
serializedVersion: 2
m_Scenes: []
m_Scenes:
- enabled: 1
path: Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity
guid: b9ac0cbf961bf4dacbfa0aa9c0d60aaa
m_configObjects: {}

Project/ProjectSettings/ProjectVersion.txt (2 changed lines)


m_EditorVersion: 2018.4.17f1
m_EditorVersion: 2018.4.20f1

Project/ProjectSettings/UnityConnectSettings.asset (2 changed lines)


UnityConnectSettings:
m_ObjectHideFlags: 0
serializedVersion: 1
m_Enabled: 1
m_Enabled: 0
m_TestMode: 0
m_EventOldUrl: https://api.uca.cloud.unity3d.com/v1/events
m_EventUrl: https://cdp.cloud.unity3d.com/v1/events

ml-agents/mlagents/trainers/models.py (4 changed lines)


:param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
:param action_size: A list containing the number of possible actions for each branch
:return: The action output dimension [batch_size, num_branches], the concatenated
normalized probs (after softmax)
and the concatenated normalized log probs
normalized log_probs (after softmax)
and the concatenated normalized log log_probs
"""
branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
raw_probs = [

ml-agents/mlagents/trainers/trainer/rl_trainer.py (4 changed lines)


import abc
import time
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.optimizer.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException

for agent_id in rewards:
rewards[agent_id] = 0
def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.stats_reporter.add_stat(

ml-agents/mlagents/trainers/trainer/trainer.py (14 changed lines)


from collections import deque
from mlagents_envs.logging_util import get_logger
from mlagents.model_serialization import export_policy_model, SerializationSettings
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

Exports the model
"""
policy = self.get_policy(name_behavior_id)
settings = SerializationSettings(policy.model_path, policy.brain.brain_name)
export_policy_model(settings, policy.graph, policy.sess)
policy.export_model()
@abc.abstractmethod
def end_episode(self):

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
) -> Policy:
"""
Creates policy
"""

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer.
"""

ml-agents/mlagents/trainers/tests/test_ppo.py (4 changed lines)


import yaml
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.agent_processor import AgentManagerQueue

policy = NNPolicy(
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = TFPPOOptimizer(policy, trainer_parameters)
return optimizer

ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changed lines)


import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"

if trainer_parameters["trainer"] == "sac":
optimizer = SACOptimizer(policy, trainer_parameters)
else:
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = TFPPOOptimizer(policy, trainer_parameters)
return optimizer

ml-agents/mlagents/trainers/optimizer/optimizer.py (7 changed lines)


import abc
class Optimizer(abc.ABC):
class Optimizer(object):
@abc.abstractmethod
def __init__(self):
self.reward_signals = {}
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Update the Policy based on the batch that was passed in.

ml-agents/mlagents/trainers/policy/tf_policy.py (111 changed lines)


import abc
import os
import numpy as np
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents_envs.exception import UnityException
from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps

logger = get_logger(__name__)
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class TFPolicy(Policy):
"""
Contains a learning model, and the necessary

:param brain: The corresponding Brain for this policy.
:param trainer_parameters: The trainer parameters.
"""
super(TFPolicy, self).__init__(
brain=brain, seed=seed, trainer_params=trainer_parameters
)
self._version_number_ = 2
self.m_size = 0

self.inference_dict = {}
self.update_dict = {}
self.sequence_length = 1
self.seed = seed
self.brain = brain
self.use_recurrent = trainer_parameters["use_recurrent"]
self.memory_dict: Dict[str, np.ndarray] = {}
self.num_branches = len(self.brain.vector_action_space_size)
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = trainer_parameters.get("normalize", False)
self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.model_path = trainer_parameters["output_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()

self.saver = None
self.seed = seed
if self.use_recurrent:
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "
"though the trainer uses recurrent.".format(brain.brain_name)
)
elif self.m_size % 2 != 0:
raise UnityPolicyException(
"The memory size for brain {0} is {1} "
"but it must be divisible by 2.".format(
brain.brain_name, self.m_size
)
)
self._initialize_tensorflow_references()
self.load = load

"""
pass
def load_model(self, step=0):
reset_steps = not self.load
self._load_graph(self.model_path, reset_global_steps=reset_steps)
def _initialize_graph(self):
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)

"""
raise UnityPolicyException("The evaluate function was not implemented.")
def export_model(self, step=0):
settings = SerializationSettings(self.model_path, self.brain.brain_name)
export_policy_model(settings, self.graph, self.sess)
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:

feed_dict[self.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
def get_current_step(self):
"""
Gets current model step.

"""
return list(self.update_dict.keys())
def save_model(self, steps):
def save_model(self, step=0):
:param steps: The number of steps the model was trained for
:param step: The number of steps the model was trained for
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
last_checkpoint = os.path.join(self.model_path, f"model-{step}.ckpt")
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False

ml-agents/mlagents/trainers/policy/policy.py (128 changed lines)


from abc import ABC, abstractmethod
from abc import abstractmethod
from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.exception import UnityException
class Policy(ABC):
@abstractmethod
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Policy(object):
def __init__(self, brain, seed, trainer_params):
self.brain = brain
self.seed = seed
self.model_path = None
self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
else:
self.num_branches = len(self.brain.vector_action_space_size)
self.previous_action_dict: Dict[str, np.array] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_params["normalize"]
self.use_recurrent = trainer_params["use_recurrent"]
self.model_path = trainer_params["output_path"]
if self.use_recurrent:
self.m_size = trainer_params["memory_size"]
self.sequence_length = trainer_params["sequence_length"]
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "
"though the trainer uses recurrent.".format(brain.brain_name)
)
elif self.m_size % 2 != 0:
raise UnityPolicyException(
"The memory size for brain {0} is {1} "
"but it must be divisible by 2.".format(
brain.brain_name, self.m_size
)
)
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
raise NotImplementedError
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
pass
@abstractmethod
def export_model(self, step=0):
pass
@abstractmethod
def save_model(self, step=0):
pass
@abstractmethod
def load_model(self, step=0):
pass
@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def get_current_step(self):
pass

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (4 changed lines)


from mlagents.trainers.buffer import AgentBuffer
class PPOOptimizer(TFOptimizer):
class TFPPOOptimizer(TFOptimizer):
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.

name="old_probabilities",
)
# Break old log probs into separate branches
# Break old log log_probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)

ml-agents/mlagents/trainers/ppo/trainer.py (67 changed lines)


# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
import time
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
logger = get_logger(__name__)
logger = get_logger(__name__)
TIMINGS = []
class PPOTrainer(RLTrainer):

self._check_param_keys()
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.framework = "torch"
self.policy: Policy = None # type: ignore
self.update_times = []
def _check_param_keys(self):
super()._check_param_keys()

trajectory.next_obs,
trajectory.done_reached and not trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
self._stats_reporter.add_stat(

local_value_estimates = agent_buffer_trajectory[
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,

buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
t1 = time.perf_counter()
t2 = time.perf_counter()
TIMINGS.append(t2 - t1)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
) -> Policy:
if self.framework == "torch":
return self.create_torch_policy(parsed_behavior_id, brain_parameters)
else:
return self.create_tf_policy(parsed_behavior_id, brain_parameters)
def create_tf_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> NNPolicy:
:param parsed_behavior_id:
:param brain_parameters: specifications for policy construction
:return policy
"""

self.is_training,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TorchPolicy:
"""
Creates a PPO policy to trainers list of policies.
:param parsed_behavior_id:
:param brain_parameters: specifications for policy construction
:return policy
"""
policy = TorchPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO
)
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
if not isinstance(policy, Policy):
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
if self.framework == "torch":
self.optimizer = TorchPPOOptimizer( # type: ignore
self.policy, self.trainer_parameters # type: ignore
) # type: ignore
else:
self.optimizer = TFPPOOptimizer( # type: ignore
self.policy, self.trainer_parameters # type: ignore
) # type: ignore
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
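
The two commits being compared here are mainly speed-test instrumentation of the PPO update loop: the hunk above records time.perf_counter() around each mini-batch update and appends the elapsed time to the module-level TIMINGS list, which test_env_perf.py plots after training. A minimal sketch of that pattern, illustrative only, with a hypothetical work() function standing in for the timed optimizer update:

import time

TIMINGS = []

def work():
    time.sleep(0.001)  # placeholder for the mini-batch update being timed

for _ in range(5):
    t1 = time.perf_counter()
    work()
    t2 = time.perf_counter()
    TIMINGS.append(t2 - t1)

print(f"Mean update time {sum(TIMINGS) / len(TIMINGS):.4f} s")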

ml-agents/mlagents/trainers/distributions_torch.py (69 changed lines)


import torch
from torch import nn
from torch import distributions
import numpy as np
EPSILON = 1e-7 # Small value to avoid divide by zero
class GaussianDistribution(nn.Module):
def __init__(self, hidden_size, num_outputs, conditional_sigma=False, **kwargs):
super(GaussianDistribution, self).__init__(**kwargs)
self.conditional_sigma = conditional_sigma
self.mu = nn.Linear(hidden_size, num_outputs)
nn.init.xavier_uniform_(self.mu.weight, gain=0.01)
if conditional_sigma:
self.log_sigma = nn.Linear(hidden_size, num_outputs)
nn.init.xavier_uniform(self.log_sigma.weight, gain=0.01)
else:
self.log_sigma = nn.Parameter(
torch.zeros(1, num_outputs, requires_grad=True)
)
def forward(self, inputs):
mu = self.mu(inputs)
if self.conditional_sigma:
log_sigma = self.log_sigma(inputs)
else:
log_sigma = self.log_sigma
return [distributions.normal.Normal(loc=mu, scale=torch.exp(log_sigma))]
class MultiCategoricalDistribution(nn.Module):
def __init__(self, hidden_size, act_sizes):
super(MultiCategoricalDistribution, self).__init__()
self.act_sizes = act_sizes
self.branches = self.create_policy_branches(hidden_size)
def create_policy_branches(self, hidden_size):
branches = []
for size in self.act_sizes:
branch_output_layer = nn.Linear(hidden_size, size)
nn.init.xavier_uniform_(branch_output_layer.weight, gain=0.01)
branches.append(branch_output_layer)
return nn.ModuleList(branches)
def mask_branch(self, logits, mask):
raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask
normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1)
normalized_logits = torch.log(normalized_probs + EPSILON)
return normalized_logits
def split_masks(self, masks):
split_masks = []
for idx, _ in enumerate(self.act_sizes):
start = int(np.sum(self.act_sizes[:idx]))
end = int(np.sum(self.act_sizes[: idx + 1]))
split_masks.append(masks[:, start:end])
return split_masks
def forward(self, inputs, masks):
# Todo - Support multiple branches in mask code
branch_distributions = []
masks = self.split_masks(masks)
for idx, branch in enumerate(self.branches):
logits = branch(inputs)
norm_logits = self.mask_branch(logits, masks[idx])
distribution = distributions.categorical.Categorical(logits=norm_logits)
branch_distributions.append(distribution)
return branch_distributions
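
For context on mask_branch above: it zeroes out the softmax probability of unavailable actions, re-normalizes the remainder, and returns log-probabilities that feed one Categorical distribution per branch. A standalone sketch with invented logits and mask, assuming only torch:

import torch

EPSILON = 1e-7  # same small constant as above, avoids log(0)

def mask_branch(logits, mask):
    # Zero masked actions, re-normalize what is left, return log-probabilities.
    raw_probs = torch.nn.functional.softmax(logits, dim=-1) * mask
    normalized_probs = raw_probs / torch.sum(raw_probs, dim=-1).unsqueeze(-1)
    return torch.log(normalized_probs + EPSILON)

logits = torch.tensor([[1.0, 2.0, 0.5]])  # one agent, one branch with 3 actions
mask = torch.tensor([[1.0, 0.0, 1.0]])    # the second action is unavailable
dist = torch.distributions.Categorical(logits=mask_branch(logits, mask))
print(dist.probs)  # roughly [0.62, ~0, 0.38]; the masked action keeps near-zero probability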

ml-agents/mlagents/trainers/models_torch.py (496 changed lines)


from enum import Enum
from typing import Callable, NamedTuple
import torch
from torch import nn
from mlagents.trainers.distributions_torch import (
GaussianDistribution,
MultiCategoricalDistribution,
)
from mlagents.trainers.exception import UnityTrainerException
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
EncoderFunction = Callable[
[torch.Tensor, int, ActivationFunction, int, str, bool], torch.Tensor
]
EPSILON = 1e-7
class EncoderType(Enum):
SIMPLE = "simple"
NATURE_CNN = "nature_cnn"
RESNET = "resnet"
class ActionType(Enum):
DISCRETE = "discrete"
CONTINUOUS = "continuous"
@staticmethod
def from_str(label):
if label in "continuous":
return ActionType.CONTINUOUS
elif label in "discrete":
return ActionType.DISCRETE
else:
raise NotImplementedError
class LearningRateSchedule(Enum):
CONSTANT = "constant"
LINEAR = "linear"
class NormalizerTensors(NamedTuple):
steps: torch.Tensor
running_mean: torch.Tensor
running_variance: torch.Tensor
class NetworkBody(nn.Module):
def __init__(
self,
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
):
super(NetworkBody, self).__init__()
self.normalize = normalize
self.visual_encoders = []
self.vector_encoders = []
self.vector_normalizers = []
self.use_lstm = use_lstm
self.h_size = h_size
self.m_size = m_size
visual_encoder = ModelUtils.get_encoder_for_type(vis_encode_type)
for vector_size in vector_sizes:
if vector_size != 0:
self.vector_normalizers.append(Normalizer(vector_size))
self.vector_encoders.append(
VectorEncoder(vector_size, h_size, num_layers)
)
for visual_size in visual_sizes:
self.visual_encoders.append(
visual_encoder(
visual_size.height,
visual_size.width,
visual_size.num_channels,
h_size,
)
)
self.vector_encoders = nn.ModuleList(self.vector_encoders)
self.visual_encoders = nn.ModuleList(self.visual_encoders)
if use_lstm:
self.lstm = nn.LSTM(h_size, m_size // 2, 1)
def update_normalization(self, vec_inputs):
if self.normalize:
for idx, vec_input in enumerate(vec_inputs):
self.vector_normalizers[idx].update(vec_input)
def forward(self, vec_inputs, vis_inputs, memories=None, sequence_length=1):
vec_embeds = []
for idx, encoder in enumerate(self.vector_encoders):
vec_input = vec_inputs[idx]
if self.normalize:
vec_input = self.vector_normalizers[idx](vec_input)
hidden = encoder(vec_input)
vec_embeds.append(hidden)
vis_embeds = []
for idx, encoder in enumerate(self.visual_encoders):
vis_input = vis_inputs[idx]
vis_input = vis_input.permute([0, 3, 1, 2])
hidden = encoder(vis_input)
vis_embeds.append(hidden)
if len(vec_embeds) > 0:
vec_embeds = torch.cat(vec_embeds)
if len(vis_embeds) > 0:
vis_embeds = torch.cat(vis_embeds)
if len(vec_embeds) > 0 and len(vis_embeds) > 0:
embedding = torch.cat([vec_embeds, vis_embeds])
elif len(vec_embeds) > 0:
embedding = vec_embeds
else:
embedding = vis_embeds
if self.use_lstm:
embedding = embedding.reshape([sequence_length, -1, self.h_size])
memories = torch.split(memories, self.m_size // 2, dim=-1)
embedding, memories = self.lstm(embedding, memories)
embedding = embedding.reshape([-1, self.m_size // 2])
memories = torch.cat(memories, dim=-1)
return embedding, memories
class ActorCritic(nn.Module):
def __init__(
self,
h_size,
vector_sizes,
visual_sizes,
act_size,
normalize,
num_layers,
m_size,
vis_encode_type,
act_type,
use_lstm,
stream_names,
separate_critic,
):
super(ActorCritic, self).__init__()
self.act_type = ActionType.from_str(act_type)
self.act_size = act_size
self.separate_critic = separate_critic
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
)
if use_lstm:
embedding_size = m_size // 2
else:
embedding_size = h_size
if self.act_type == ActionType.CONTINUOUS:
self.distribution = GaussianDistribution(embedding_size, act_size[0])
else:
self.distribution = MultiCategoricalDistribution(embedding_size, act_size)
if separate_critic:
self.critic = Critic(
stream_names,
h_size,
vector_sizes,
visual_sizes,
normalize,
num_layers,
m_size,
vis_encode_type,
)
else:
self.stream_names = stream_names
self.value_heads = ValueHeads(stream_names, embedding_size)
def update_normalization(self, vector_obs):
self.network_body.update_normalization(vector_obs)
if self.separate_critic:
self.critic.network_body.update_normalization(vector_obs)
def critic_pass(self, vec_inputs, vis_inputs, memories=None):
if self.separate_critic:
return self.critic(vec_inputs, vis_inputs)
else:
embedding, _ = self.network_body(vec_inputs, vis_inputs, memories=memories)
return self.value_heads(embedding)
def sample_action(self, dists):
actions = []
for action_dist in dists:
action = action_dist.sample()
actions.append(action)
actions = torch.stack(actions, dim=-1)
return actions
def get_probs_and_entropy(self, actions, dists):
log_probs = []
entropies = []
for idx, action_dist in enumerate(dists):
action = actions[..., idx]
log_probs.append(action_dist.log_prob(action))
entropies.append(action_dist.entropy())
log_probs = torch.stack(log_probs, dim=-1)
entropies = torch.stack(entropies, dim=-1)
if self.act_type == ActionType.CONTINUOUS:
log_probs = log_probs.squeeze(-1)
entropies = entropies.squeeze(-1)
return log_probs, entropies
def get_dist_and_value(
self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1
):
embedding, memories = self.network_body(
vec_inputs, vis_inputs, memories, sequence_length
)
if self.act_type == ActionType.CONTINUOUS:
dists = self.distribution(embedding)
else:
dists = self.distribution(embedding, masks=masks)
if self.separate_critic:
value_outputs = self.critic(vec_inputs, vis_inputs)
else:
value_outputs = self.value_heads(embedding)
return dists, value_outputs, memories
def forward(
self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1
):
dists, value_outputs, memories = self.get_dist_and_value(
vec_inputs, vis_inputs, masks, memories, sequence_length
)
sampled_actions = self.sample_action(dists)
return sampled_actions, memories
class Critic(nn.Module):
def __init__(
self,
stream_names,
h_size,
vector_sizes,
visual_sizes,
normalize,
num_layers,
m_size,
vis_encode_type,
):
super(Critic, self).__init__()
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
False,
)
self.stream_names = stream_names
self.value_heads = ValueHeads(stream_names, h_size)
def forward(self, vec_inputs, vis_inputs):
embedding, _ = self.network_body(vec_inputs, vis_inputs)
return self.value_heads(embedding)
class Normalizer(nn.Module):
def __init__(self, vec_obs_size, **kwargs):
super(Normalizer, self).__init__(**kwargs)
self.normalization_steps = torch.tensor(1)
self.running_mean = torch.zeros(vec_obs_size)
self.running_variance = torch.ones(vec_obs_size)
def forward(self, inputs):
normalized_state = torch.clamp(
(inputs - self.running_mean)
/ torch.sqrt(self.running_variance / self.normalization_steps),
-5,
5,
)
return normalized_state
def update(self, vector_input):
steps_increment = vector_input.size()[0]
total_new_steps = self.normalization_steps + steps_increment
input_to_old_mean = vector_input - self.running_mean
new_mean = self.running_mean + (input_to_old_mean / total_new_steps).sum(0)
input_to_new_mean = vector_input - new_mean
new_variance = self.running_variance + (
input_to_new_mean * input_to_old_mean
).sum(0)
self.running_mean = new_mean
self.running_variance = new_variance
self.normalization_steps = total_new_steps
class ValueHeads(nn.Module):
def __init__(self, stream_names, input_size):
super(ValueHeads, self).__init__()
self.stream_names = stream_names
self.value_heads = {}
for name in stream_names:
value = nn.Linear(input_size, 1)
self.value_heads[name] = value
self.value_heads = nn.ModuleDict(self.value_heads)
def forward(self, hidden):
value_outputs = {}
for stream_name, _ in self.value_heads.items():
value_outputs[stream_name] = self.value_heads[stream_name](hidden).squeeze(
-1
)
return (
value_outputs,
torch.mean(torch.stack(list(value_outputs.values())), dim=0),
)
class VectorEncoder(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, **kwargs):
super(VectorEncoder, self).__init__(**kwargs)
self.layers = [nn.Linear(input_size, hidden_size)]
for _ in range(num_layers - 1):
self.layers.append(nn.Linear(hidden_size, hidden_size))
self.layers.append(nn.ReLU())
self.layers = nn.ModuleList(self.layers)
def forward(self, inputs):
x = inputs
for layer in self.layers:
x = layer(x)
return x
def conv_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):
from math import floor
if type(kernel_size) is not tuple:
kernel_size = (kernel_size, kernel_size)
h = floor(
((h_w[0] + (2 * pad) - (dilation * (kernel_size[0] - 1)) - 1) / stride) + 1
)
w = floor(
((h_w[1] + (2 * pad) - (dilation * (kernel_size[1] - 1)) - 1) / stride) + 1
)
return h, w
class SimpleVisualEncoder(nn.Module):
def __init__(self, height, width, initial_channels, output_size):
super(SimpleVisualEncoder, self).__init__()
self.h_size = output_size
conv_1_hw = conv_output_shape((height, width), 8, 4)
conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)
self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32
self.conv1 = nn.Conv2d(initial_channels, 16, [8, 8], [4, 4])
self.conv2 = nn.Conv2d(16, 32, [4, 4], [2, 2])
self.dense = nn.Linear(self.final_flat, self.h_size)
def forward(self, visual_obs):
conv_1 = torch.relu(self.conv1(visual_obs))
conv_2 = torch.relu(self.conv2(conv_1))
hidden = self.dense(conv_2.reshape([-1, self.final_flat]))
return hidden
class NatureVisualEncoder(nn.Module):
def __init__(self, height, width, initial_channels, output_size):
super(NatureVisualEncoder, self).__init__()
self.h_size = output_size
conv_1_hw = conv_output_shape((height, width), 8, 4)
conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)
conv_3_hw = conv_output_shape(conv_2_hw, 3, 1)
self.final_flat = conv_3_hw[0] * conv_3_hw[1] * 64
self.conv1 = nn.Conv2d(initial_channels, 32, [8, 8], [4, 4])
self.conv2 = nn.Conv2d(32, 64, [4, 4], [2, 2])
self.conv3 = nn.Conv2d(64, 64, [3, 3], [1, 1])
self.dense = nn.Linear(self.final_flat, self.h_size)
def forward(self, visual_obs):
conv_1 = torch.relu(self.conv1(visual_obs))
conv_2 = torch.relu(self.conv2(conv_1))
conv_3 = torch.relu(self.conv3(conv_2))
hidden = self.dense(conv_3.reshape([-1, self.final_flat]))
return hidden
class GlobalSteps(nn.Module):
def __init__(self):
super(GlobalSteps, self).__init__()
self.global_step = torch.Tensor([0])
def increment(self, value):
self.global_step += value
class LearningRate(nn.Module):
def __init__(self, lr):
# Todo: add learning rate decay
super(LearningRate, self).__init__()
self.learning_rate = torch.Tensor([lr])
class ResNetVisualEncoder(nn.Module):
def __init__(self, initial_channels):
super(ResNetVisualEncoder, self).__init__()
n_channels = [16, 32, 32] # channel for each stack
n_blocks = 2 # number of residual blocks
self.layers = []
for _, channel in enumerate(n_channels):
self.layers.append(nn.Conv2d(initial_channels, channel, [3, 3], [1, 1]))
self.layers.append(nn.MaxPool2d([3, 3], [2, 2]))
for _ in range(n_blocks):
self.layers.append(self.make_block(channel))
self.layers.append(nn.ReLU())
@staticmethod
def make_block(channel):
block_layers = [
nn.ReLU(),
nn.Conv2d(channel, channel, [3, 3], [1, 1]),
nn.ReLU(),
nn.Conv2d(channel, channel, [3, 3], [1, 1]),
]
return block_layers
@staticmethod
def forward_block(input_hidden, block_layers):
hidden = input_hidden
for layer in block_layers:
hidden = layer(hidden)
return hidden + input_hidden
def forward(self, visual_obs):
hidden = visual_obs
for layer in self.layers:
if isinstance(layer, nn.Module):
hidden = layer(hidden)
elif isinstance(layer, list):
hidden = self.forward_block(hidden, layer)
return hidden.flatten()
class ModelUtils:
# Minimum supported side for each encoder type. If refactoring an encoder, please
# adjust these also.
MIN_RESOLUTION_FOR_ENCODER = {
EncoderType.SIMPLE: 20,
EncoderType.NATURE_CNN: 36,
EncoderType.RESNET: 15,
}
@staticmethod
def swish(input_activation: torch.Tensor) -> torch.Tensor:
"""Swish activation function. For more info: https://arxiv.org/abs/1710.05941"""
return torch.mul(input_activation, torch.sigmoid(input_activation))
@staticmethod
def get_encoder_for_type(encoder_type: EncoderType) -> nn.Module:
ENCODER_FUNCTION_BY_TYPE = {
EncoderType.SIMPLE: SimpleVisualEncoder,
EncoderType.NATURE_CNN: NatureVisualEncoder,
EncoderType.RESNET: ResNetVisualEncoder,
}
return ENCODER_FUNCTION_BY_TYPE.get(encoder_type)
@staticmethod
def _check_resolution_for_encoder(
vis_in: torch.Tensor, vis_encoder_type: EncoderType
) -> None:
min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
height = vis_in.shape[1]
width = vis_in.shape[2]
if height < min_res or width < min_res:
raise UnityTrainerException(
f"Visual observation resolution ({width}x{height}) is too small for"
f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}"
)
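
One part of models_torch.py worth spelling out is the Normalizer module: update() keeps a running mean and a running sum of squared deviations (stored in running_variance), and forward() divides that sum by normalization_steps to recover the variance before standardizing and clamping. The following NumPy replica on synthetic data is illustrative only:

import numpy as np

# Same initial state as Normalizer.__init__: 1 step, zero mean, unit accumulator.
steps, mean, m2 = 1, np.zeros(3), np.ones(3)
rng = np.random.default_rng(0)
for _ in range(200):
    batch = rng.normal(loc=5.0, scale=2.0, size=(64, 3))   # synthetic observations
    new_steps = steps + batch.shape[0]
    delta_old = batch - mean                                # input_to_old_mean
    mean = mean + (delta_old / new_steps).sum(axis=0)       # new_mean
    m2 = m2 + ((batch - mean) * delta_old).sum(axis=0)      # accumulate squared deviations
    steps = new_steps

print(mean)                 # approaches [5, 5, 5]
print(np.sqrt(m2 / steps))  # approaches [2, 2, 2], the std used in forward()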

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (115 changed lines)


from typing import Dict, Any, Optional, Tuple, List
import torch
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.components.reward_signals.extrinsic.signal import (
ExtrinsicRewardSignal,
)
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.trajectory import SplitObservations
class TorchOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
super(TorchOptimizer, self).__init__()
self.policy = policy
self.trainer_params = trainer_params
self.update_dict: Dict[str, torch.Tensor] = {}
self.value_heads: Dict[str, torch.Tensor] = {}
self.memory_in: torch.Tensor = None
self.memory_out: torch.Tensor = None
self.m_size: int = 0
self.global_step = torch.tensor(0)
self.bc_module: Optional[BCModule] = None
self.create_reward_signals(trainer_params["reward_signals"])
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
extrinsic_signal = ExtrinsicRewardSignal(
self.policy, **reward_signal_configs["extrinsic"]
)
self.reward_signals = {"extrinsic": extrinsic_signal}
# Create reward signals
# for reward_signal, config in reward_signal_configs.items():
# self.reward_signals[reward_signal] = create_reward_signal(
# self.policy, reward_signal, config
# )
# self.update_dict.update(self.reward_signals[reward_signal].update_dict)
def get_value_estimates(
self, decision_requests: DecisionSteps, idx: int, done: bool
) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.
:param decision_requests:
:param idx: Index in BrainInfo of agent.
:param done: Whether or not this is the last element of the episode,
in which case the value estimate will be 0.
:return: The value estimate dictionary with key being the name of the reward signal
and the value the corresponding value estimate.
"""
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
value_estimates, mean_value = self.policy.actor_critic.critic_pass(
np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
if self.reward_signals[k].use_terminal_states:
value_estimates[k] = 0.0
return value_estimates
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [torch.as_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
visual_ob = torch.as_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
memory = torch.zeros([1, len(vector_obs[0]), self.policy.m_size])
next_obs = np.concatenate(next_obs, axis=-1)
next_obs = [torch.as_tensor(next_obs).unsqueeze(0)]
next_memory = torch.zeros([1, 1, self.policy.m_size])
value_estimates, mean_value = self.policy.actor_critic.critic_pass(
vector_obs, visual_obs, memory
)
next_value_estimate, next_value = self.policy.actor_critic.critic_pass(
next_obs, next_obs, next_memory
)
for name, estimate in value_estimates.items():
value_estimates[name] = estimate.detach().numpy()
next_value_estimate[name] = next_value_estimate[name].detach().numpy()
if done:
for k in next_value_estimate:
if self.reward_signals[k].use_terminal_states:
next_value_estimate[k] = 0.0
return value_estimates, next_value_estimate
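
The done handling in both methods above only zeroes the bootstrap value for reward signals whose use_terminal_states flag is set, so returns are never bootstrapped past a true episode end. A toy sketch with hypothetical signal names and values:

# Hypothetical critic outputs and per-signal settings, for illustration only.
value_estimates = {"extrinsic": 0.73, "gail": 0.12}
use_terminal_states = {"extrinsic": True, "gail": False}
done = True

if done:
    for name in value_estimates:
        if use_terminal_states[name]:
            value_estimates[name] = 0.0  # no bootstrapping past a terminal state

print(value_estimates)  # {'extrinsic': 0.0, 'gail': 0.12}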

ml-agents/mlagents/trainers/policy/torch_policy.py (300 changed lines)


from typing import Any, Dict, List
import numpy as np
import torch
import os
from torch import onnx
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents.trainers.policy import Policy
from mlagents_envs.base_env import DecisionSteps
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models_torch import EncoderType, ActorCritic
EPSILON = 1e-7 # Small value to avoid divide by zero
torch.set_num_threads(1)
class TorchPolicy(Policy):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_params: Defined training parameters.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param tanh_squash: Whether to use a tanh function on the continuous output,
or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy
in continuous output.
"""
super(TorchPolicy, self).__init__(brain, seed, trainer_params)
self.grads = None
num_layers = trainer_params["num_layers"]
self.h_size = trainer_params["hidden_units"]
self.seed = seed
self.brain = brain
self.global_step = 0
self.m_size = 0
self.act_size = brain.vector_action_space_size
self.act_type = brain.vector_action_space_type
self.sequence_length = 1
if self.use_recurrent:
self.m_size = trainer_params["memory_size"]
self.sequence_length = trainer_params["sequence_length"]
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "
"though the trainer uses recurrent.".format(brain.brain_name)
)
elif self.m_size % 2 != 0:
raise UnityPolicyException(
"The memory size for brain {0} is {1} "
"but it must be divisible by 2.".format(
brain.brain_name, self.m_size
)
)
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
# TF defaults to 32-bit, so we use the same here.
torch.set_default_tensor_type(torch.FloatTensor)
reward_signal_configs = trainer_params["reward_signals"]
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.actor_critic = ActorCritic(
h_size=int(trainer_params["hidden_units"]),
act_type=self.act_type,
vector_sizes=[brain.vector_observation_space_size],
act_size=brain.vector_action_space_size,
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
stream_names=list(reward_signal_configs.keys()),
separate_critic=self.use_continuous_act,
)
def split_decision_step(self, decision_requests):
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = torch.ones(
[len(decision_requests), np.sum(self.brain.vector_action_space_size)]
)
if decision_requests.action_mask is not None:
mask = torch.as_tensor(
1 - np.concatenate(decision_requests.action_mask, axis=1)
)
return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.
"""
vector_obs = [torch.as_tensor(vector_obs)]
if self.use_vec_obs and self.normalize:
self.actor_critic.update_normalization(vector_obs)
@timed
def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1):
dists, (
value_heads,
mean_value,
), memories = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
actions = self.actor_critic.sample_action(dists)
log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
if self.act_type == "continuous":
actions.squeeze_(-1)
return actions, log_probs, entropies, value_heads, memories
def evaluate_actions(
self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1
):
dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
return log_probs, entropies, value_heads
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param global_agent_ids:
:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
vec_obs = [torch.as_tensor(vec_obs)]
vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vis_obs]
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
run_out = {}
with torch.no_grad():
action, log_probs, entropy, value_heads, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = action.detach().numpy()
run_out["pre_action"] = action.detach().numpy()
# Todo - make pre_action difference
run_out["log_probs"] = log_probs.detach().numpy()
run_out["entropy"] = entropy.detach().numpy()
run_out["value_heads"] = {
name: t.detach().numpy() for name, t in value_heads.items()
}
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
run_out["learning_rate"] = 0.0
if self.use_recurrent:
run_out["memories"] = memories.detach().numpy()
self.actor_critic.update_normalization(vec_obs)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
"""
Decides actions given observations information, and takes them in environment.
:param worker_id:
:param decision_requests: A dictionary of brain names and BrainInfo from environment.
:return: an ActionInfo containing action, memories, values and an object
to be passed to add experiences
"""
if len(decision_requests) == 0:
return ActionInfo.empty()
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))
for agent_id in decision_requests.agent_id
] # For 1-D array, the iterator order is correct.
run_out = self.evaluate(
decision_requests, global_agent_ids
) # pylint: disable=assignment-from-no-return
self.save_memories(global_agent_ids, run_out.get("memory_out"))
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
)
def save_model(self, step=0):
"""
Saves the model
:param step: The number of steps the model was trained for
"""
if not os.path.exists(self.model_path):
os.makedirs(self.model_path)
save_path = self.model_path + "/model-" + str(step) + ".pt"
torch.save(self.actor_critic.state_dict(), save_path)
def load_model(self, step=0):
load_path = self.model_path + "/model-" + str(step) + ".pt"
self.actor_critic.load_state_dict(torch.load(load_path))
def export_model(self, step=0):
fake_vec_obs = [torch.zeros([1] + [self.vec_obs_size])]
fake_vis_obs = [
torch.zeros(
[1] + [camera_res.height, camera_res.width, camera_res.num_channels]
)
for camera_res in self.brain.camera_resolutions
]
if self.use_continuous_act:
fake_masks = None
else:
fake_masks = torch.ones([1] + [int(np.sum(self.act_size))])
fake_memories = torch.zeros([1] + [self.m_size])
export_path = self.model_path + "/model-" + str(step) + ".onnx"
output_names = ["action", "value_estimates", "memories"]
onnx.export(
self.actor_critic,
(fake_vec_obs, fake_vis_obs, fake_masks, fake_memories, 1),
export_path,
verbose=True,
output_names=output_names,
)
@property
def vis_obs_size(self):
return self.brain.number_visual_observations
@property
def vec_obs_size(self):
return self.brain.vector_observation_space_size
@property
def use_vis_obs(self):
return self.vis_obs_size > 0
@property
def use_vec_obs(self):
return self.vec_obs_size > 0
def get_current_step(self):
"""
Gets current model step.
:return: current model step.
"""
step = self.global_step
return step
def increment_step(self, n_steps):
"""
Increments model step.
"""
self.global_step += n_steps
return self.get_current_step()

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (145 changed lines)


from typing import Any, Dict
import torch
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
class TorchPPOOptimizer(TorchOptimizer):
def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.
:param policy: A TFPolicy object that will be updated by this PPO Optimizer.
:param trainer_params: Trainer parameters dictionary that specifies the
properties of the trainer.
"""
# Create the graph here to give more granular control of the TF graph to the Optimizer.
super(TorchPPOOptimizer, self).__init__(policy, trainer_params)
params = list(self.policy.actor_critic.parameters())
self.optimizer = torch.optim.Adam(
params, lr=self.trainer_params["learning_rate"]
)
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.stream_names = list(self.reward_signals.keys())
def ppo_value_loss(self, values, old_values, returns):
"""
Creates training-specific Tensorflow ops for PPO models.
:param returns:
:param old_values:
:param values:
"""
decay_epsilon = self.trainer_params["epsilon"]
value_losses = []
for name, head in values.items():
old_val_tensor = old_values[name]
returns_tensor = returns[name]
clipped_value_estimate = old_val_tensor + torch.clamp(
head - old_val_tensor, -decay_epsilon, decay_epsilon
)
v_opt_a = (returns_tensor - head) ** 2
v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
value_loss = torch.mean(torch.max(v_opt_a, v_opt_b))
value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def ppo_policy_loss(self, advantages, log_probs, old_log_probs, masks):
"""
Creates training-specific Tensorflow ops for PPO models.
:param masks:
:param advantages:
:param log_probs: Current policy probabilities
:param old_log_probs: Past policy probabilities
"""
advantage = advantages.unsqueeze(-1)
decay_epsilon = self.trainer_params["epsilon"]
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
)
policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))
return policy_loss
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Performs update on model.
:param batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
returns = {}
old_values = {}
for name in self.reward_signals:
old_values[name] = torch.as_tensor(batch["{}_value_estimates".format(name)])
returns[name] = torch.as_tensor(batch["{}_returns".format(name)])
vec_obs = [torch.as_tensor(batch["vector_obs"])]
act_masks = torch.as_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = torch.as_tensor(batch["actions"])
memories = [
torch.as_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
vis_ob = torch.as_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
else:
vis_obs = []
log_probs, entropy, values = self.policy.evaluate_actions(
vec_obs,
vis_obs,
masks=act_masks,
actions=actions,
memories=memories,
seq_len=self.policy.sequence_length,
)
value_loss = self.ppo_value_loss(values, old_values, returns)
policy_loss = self.ppo_policy_loss(
torch.as_tensor(batch["advantages"]),
log_probs,
torch.as_tensor(batch["action_probs"]),
torch.as_tensor(batch["masks"], dtype=torch.int32),
)
loss = (
policy_loss
+ 0.5 * value_loss
- self.trainer_params["beta"] * torch.mean(entropy)
)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
update_stats = {
"Losses/Policy Loss": abs(policy_loss.detach().numpy()),
"Losses/Value Loss": value_loss.detach().numpy(),
}
return update_stats
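
To make the loss terms above concrete, here is the clipped policy surrogate from ppo_policy_loss evaluated on invented numbers; epsilon stands in for trainer_params["epsilon"], and ppo_value_loss applies the same clamp-and-max idea to the value heads:

import torch

epsilon = 0.2                                   # toy stand-in for trainer_params["epsilon"]
advantages = torch.tensor([[1.0], [-1.0]])      # made-up advantages for two samples
log_probs = torch.tensor([[0.1], [0.1]])        # current policy log-probs
old_log_probs = torch.tensor([[-0.4], [-0.4]])  # old policy log-probs

r_theta = torch.exp(log_probs - old_log_probs)  # probability ratio, about 1.65 here
p_opt_a = r_theta * advantages
p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages
policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))
print(policy_loss)  # the positive-advantage sample is clipped at 1.2; the negative one is not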

test_env_perf.py (29 changed lines)


from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
PPO_CONFIG,
generate_config,
)
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.ppo.trainer import TIMINGS
import matplotlib.pyplot as plt
import numpy as np
BRAIN_NAME = "1D"
if __name__ == "__main__":
env = SimpleEnvironment([BRAIN_NAME], use_discrete=False)
config = generate_config(
PPO_CONFIG,
override_vals={"batch_size": 256, "max_steps": 20000, "buffer_size": 1024},
)
try:
_check_environment_trains(env, config)
except Exception:
pass
print(f"Mean update time {np.mean(TIMINGS)}")
plt.plot(TIMINGS)
plt.ylim((0, 0.006))
plt.title("PyTorch w/ 3DBall Running, batch size 256, 32 hidden units, 1 layer")
plt.ylabel("Update Time (s)")
plt.ylabel("Update #")
plt.show()

/ml-agents/mlagents/trainers/ppo/optimizer.py → /ml-agents/mlagents/trainers/ppo/optimizer_tf.py
