
Refactoring policy and optimizer

Branch: develop/add-fire
Arthur Juliani, 5 years ago
Current commit: 7c3bd376
11 files changed, with 410 insertions and 641 deletions
  1. ml-agents/mlagents/trainers/distributions_torch.py (4 changed lines)
  2. ml-agents/mlagents/trainers/models_torch.py (140 changed lines)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (24 changed lines)
  4. ml-agents/mlagents/trainers/policy/policy.py (28 changed lines)
  5. ml-agents/mlagents/trainers/policy/tf_policy.py (19 changed lines)
  6. ml-agents/mlagents/trainers/policy/torch_policy.py (460 changed lines)
  7. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (39 changed lines)
  8. ml-agents/mlagents/trainers/ppo/trainer.py (20 changed lines)
  9. ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changed lines)
  10. ml-agents/mlagents/trainers/trainer/trainer.py (23 changed lines)
  11. ml-agents/mlagents/trainers/policy/nn_torch_policy.py (288 changed lines)

ml-agents/mlagents/trainers/distributions_torch.py (4 changed lines)


    def forward(self, inputs):
        mu = self.mu(inputs)
        log_sig = self.log_sigma_sq(inputs)
        return distributions.normal.Normal(loc=mu, scale=torch.sqrt(torch.exp(log_sig)))
        return [
            distributions.normal.Normal(loc=mu, scale=torch.sqrt(torch.exp(log_sig)))
        ]
class MultiCategoricalDistribution(nn.Module):
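For context, here is a minimal sketch of the GaussianDistribution module this hunk modifies, assuming only what the diff shows (the mu and log_sigma_sq linear heads and the new list-returning forward); the constructor arguments are illustrative:

import torch
import torch.distributions as distributions
from torch import nn

class GaussianDistribution(nn.Module):
    def __init__(self, hidden_size, num_outputs):
        super().__init__()
        # Linear heads for the mean and log-variance of the continuous actions.
        self.mu = nn.Linear(hidden_size, num_outputs)
        self.log_sigma_sq = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        mu = self.mu(inputs)
        log_sig = self.log_sigma_sq(inputs)
        # Returned as a one-element list so callers can iterate over action
        # distributions uniformly, whether continuous or multi-discrete.
        return [
            distributions.normal.Normal(
                loc=mu, scale=torch.sqrt(torch.exp(log_sig))
            )
        ]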

ml-agents/mlagents/trainers/models_torch.py (140 changed lines)


import torch
from torch import nn
from mlagents.trainers.distributions_torch import (
GaussianDistribution,
MultiCategoricalDistribution,
)
from mlagents.trainers.exception import UnityTrainerException
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]

steps: torch.Tensor
running_mean: torch.Tensor
running_variance: torch.Tensor
class NetworkBody(nn.Module):
def __init__(
self,
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
):
super(NetworkBody, self).__init__()
self.normalize = normalize
self.visual_encoders = []
self.vector_encoders = []
self.vector_normalizers = []
self.use_lstm = use_lstm
self.h_size = h_size
self.m_size = m_size
visual_encoder = ModelUtils.get_encoder_for_type(vis_encode_type)
for vector_size in vector_sizes:
self.vector_normalizers.append(Normalizer(vector_size))
self.vector_encoders.append(VectorEncoder(vector_size, h_size, num_layers))
for visual_size in visual_sizes:
self.visual_encoders.append(visual_encoder(visual_size))
if use_lstm:
self.lstm = nn.LSTM(h_size, h_size, 1)
def clear_memory(self, batch_size):
self.memory = (
torch.zeros(1, batch_size, self.m_size),
torch.zeros(1, batch_size, self.m_size),
)
def update_normalization(self, inputs):
if self.normalize:
self.normalizer.update(inputs)
def forward(self, vec_inputs, vis_inputs):
vec_embeds = []
for idx, encoder in enumerate(self.vector_encoders):
vec_input = vec_inputs[idx]
if self.normalize:
vec_input = self.normalizers[idx](vec_inputs[idx])
hidden = encoder(vec_input)
vec_embeds.append(hidden)
vis_embeds = []
for idx, encoder in enumerate(self.visual_encoders):
hidden = encoder(vis_inputs[idx])
vis_embeds.append(hidden)
vec_embeds = torch.cat(vec_embeds)
vis_embeds = torch.cat(vis_embeds)
embedding = torch.cat([vec_embeds, vis_embeds])
if self.use_lstm:
embedding, self.memory = self.lstm(embedding, self.memory)
return embedding
class Actor(nn.Module):
def __init__(
self,
h_size,
vector_sizes,
visual_sizes,
act_size,
normalize,
num_layers,
m_size,
vis_encode_type,
act_type,
use_lstm,
):
super(Actor, self).__init__()
self.act_type = act_type
self.act_size = act_size
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
)
if self.act_type == ActionType.CONTINUOUS:
self.distribution = GaussianDistribution(h_size, act_size)
else:
self.distribution = MultiCategoricalDistribution(h_size, act_size)
def forward(self, vec_inputs, vis_inputs, masks=None):
embedding = self.network_body(vec_inputs, vis_inputs)
if self.act_type == ActionType.CONTINUOUS:
dist = self.distribution(embedding)
else:
dist = self.distribution(embedding, masks=masks)
return dist
class Critic(nn.Module):
def __init__(
self,
stream_names,
h_size,
vector_sizes,
visual_sizes,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
):
super(Critic, self).__init__()
self.stream_names = stream_names
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
)
self.value_heads = ValueHeads(stream_names, h_size)
def forward(self, vec_inputs, vis_inputs):
embedding = self.network_body(vec_inputs, vis_inputs)
return self.value_heads(embedding)
class Normalizer(nn.Module):
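The Normalizer used above keeps steps, running_mean and running_variance buffers (see the annotations near the top of this file). A hedged sketch of what such a running normalizer can look like; the class name, update rule and clipping below are assumptions, not necessarily the commit's implementation:

import torch
from torch import nn

class RunningNormalizer(nn.Module):
    def __init__(self, vec_obs_size):
        super().__init__()
        # Buffers travel with the module's state_dict, like the fields above.
        self.register_buffer("steps", torch.zeros(1))
        self.register_buffer("running_mean", torch.zeros(vec_obs_size))
        self.register_buffer("running_variance", torch.ones(vec_obs_size))

    def update(self, inputs):
        # Incremental (Chan et al.) mean / sum-of-squared-deviations update.
        batch_size = inputs.shape[0]
        total = self.steps + batch_size
        batch_mean = inputs.mean(dim=0)
        delta = batch_mean - self.running_mean
        self.running_mean += delta * batch_size / total
        batch_m2 = inputs.var(dim=0, unbiased=False) * batch_size
        self.running_variance += batch_m2 + delta.pow(2) * self.steps * batch_size / total
        self.steps = total

    def forward(self, inputs):
        # Normalize with the running statistics and clip extreme values.
        variance = self.running_variance / torch.clamp(self.steps, min=1.0)
        return torch.clamp(
            (inputs - self.running_mean) / torch.sqrt(variance + 1e-7), -5.0, 5.0
        )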

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (24 changed lines)


from typing import Dict, Any
from typing import Dict, Any, Optional
from mlagents.trainers.policy.nn_torch_policy import NNPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)

    def __init__(self, policy: NNPolicy, trainer_params: Dict[str, Any]):
    def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
        super(TorchOptimizer, self).__init__()
        self.policy = policy
        self.trainer_params = trainer_params

        self.memory_out: torch.Tensor = None
        self.m_size: int = 0
        self.global_step = torch.tensor(0)
        self.bc_module: Optional[BCModule] = None

    def create_reward_signals(self, reward_signal_configs):
        """
        Create reward signals
        :param reward_signal_configs: Reward signal config.
        """
        self.reward_signals = {}
        # Create reward signals
        for reward_signal, config in reward_signal_configs.items():
            self.reward_signals[reward_signal] = create_reward_signal(
                self.policy, reward_signal, config
            )
            self.update_dict.update(self.reward_signals[reward_signal].update_dict)
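As a usage illustration, a hypothetical reward-signal configuration of the shape create_reward_signals expects; the signal names and fields follow the usual ML-Agents trainer config, and the optimizer variable is assumed to be an already-constructed TorchOptimizer:

# optimizer: an already-constructed TorchOptimizer (construction not shown).
# "extrinsic" / "curiosity" and the strength/gamma fields follow the usual
# ML-Agents reward-signal configuration; exact fields depend on the signal type.
reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {"strength": 0.02, "gamma": 0.99, "encoding_size": 256},
}
optimizer.create_reward_signals(reward_signal_configs)
# optimizer.reward_signals now maps each name to a reward signal object whose
# update_dict entries have been merged into optimizer.update_dict.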

ml-agents/mlagents/trainers/policy/policy.py (28 changed lines)


from abc import ABC, abstractmethod
from abc import abstractmethod
from mlagents_envs.exception import UnityException
class Policy(ABC):
@abstractmethod
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Policy(object):
def __init__(self, brain, seed):
self.brain = brain
self.seed = seed
self.model_path = None
raise NotImplementedError
@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def save_model(self, step):
pass
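A minimal sketch of a concrete subclass of the Policy base class in this file, assuming the __init__(brain, seed) signature and the abstract increment_step/save_model methods shown above; the class name and in-memory step counter are illustrative only:

from mlagents.trainers.policy.policy import Policy  # import path assumed from this diff

class CountingPolicy(Policy):
    """Illustrative subclass: tracks steps in memory and skips real serialization."""

    def __init__(self, brain, seed):
        super().__init__(brain, seed)
        self.global_step = 0

    def increment_step(self, n_steps):
        self.global_step += n_steps
        return self.global_step

    def save_model(self, step):
        # A real policy would serialize its network weights for the given step here.
        pass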

ml-agents/mlagents/trainers/policy/tf_policy.py (19 changed lines)


import numpy as np
from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps

logger = get_logger(__name__)
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class TFPolicy(Policy):
"""
Contains a learning model, and the necessary

:param brain: The corresponding Brain for this policy.
:param trainer_parameters: The trainer parameters.
"""
super(TFPolicy, self).__init__(brain, seed)
self._version_number_ = 2
self.m_size = 0

self.inference_dict = {}
self.update_dict = {}
self.sequence_length = 1
self.seed = seed
self.brain = brain
self.act_size = brain.vector_action_space_size
self.vec_obs_size = brain.vector_observation_space_size

"""
return list(self.update_dict.keys())
def save_model(self, steps):
def save_model(self, step):
:param steps: The number of steps the model was trained for
:param step: The number of steps the model was trained for
last_checkpoint = self.model_path + "/model-" + str(steps) + ".ckpt"
last_checkpoint = self.model_path + "/model-" + str(step) + ".ckpt"
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False

ml-agents/mlagents/trainers/policy/torch_policy.py (460 changed lines)


from typing import Any, Dict, List, Optional
from typing import Any, Dict
from mlagents import tf_utils
from mlagents.tf_utils import tf
from mlagents_envs.exception import UnityException
from mlagents_envs.logging_util import get_logger
import torch
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
logger = get_logger(__name__)
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models_torch import ActionType, EncoderType, Actor, Critic
pass
EPSILON = 1e-7 # Small value to avoid divide by zero
"""
Contains a learning model, and the necessary
functions to save/load models and create the input placeholders.
"""
def __init__(self, seed, brain, trainer_parameters, load=False):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
Initialized the policy.
:param seed: Random seed to use for TensorFlow.
:param brain: The corresponding Brain for this policy.
:param trainer_parameters: The trainer parameters.
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_params: Defined training parameters.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param tanh_squash: Whether to use a tanh function on the continuous output,
or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy
in continuous output.
self._version_number_ = 2
self.m_size = 0
# for ghost trainer save/load snapshots
self.assign_phs = []
self.assign_ops = []
self.inference_dict = {}
self.update_dict = {}
self.sequence_length = 1
self.global_step = 0
super(TorchPolicy, self).__init__(brain, seed)
self.grads = None
num_layers = trainer_params["num_layers"]
self.h_size = trainer_params["hidden_units"]
self.normalize = trainer_params["normalize"]
self.vec_obs_size = brain.vector_observation_space_size
self.vis_obs_size = brain.number_visual_observations
self.use_recurrent = trainer_parameters["use_recurrent"]
self.memory_dict: Dict[str, np.ndarray] = {}
self.num_branches = len(self.brain.vector_action_space_size)
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = trainer_parameters.get("normalize", False)
self.use_continuous_act = brain.vector_action_space_type == "continuous"
if self.use_continuous_act:
self.num_branches = self.brain.vector_action_space_size[0]
self.model_path = trainer_parameters["model_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()
self.sess = tf.Session(
config=tf_utils.generate_session_config(), graph=self.graph
)
self.saver = None
self.seed = seed
self.sequence_length = 1
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
self.m_size = trainer_params["memory_size"]
self.sequence_length = trainer_params["sequence_length"]
if self.m_size == 0:
raise UnityPolicyException(
"The memory size for brain {0} is 0 even "

brain.brain_name, self.m_size
)
)
self.load = load
def _initialize_graph(self):
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
init = tf.global_variables_initializer()
self.sess.run(init)
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
def _load_graph(self, model_path: str, reset_global_steps: bool = False) -> None:
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
logger.info(
"Loading model for brain {} from {}.".format(
self.brain.brain_name, model_path
)
)
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt is None:
raise UnityPolicyException(
"The model {0} could not be loaded. Make "
"sure you specified the right "
"--run-id and that the previous run you are loading from had the same "
"behavior names.".format(model_path)
)
try:
self.saver.restore(self.sess, ckpt.model_checkpoint_path)
except tf.errors.NotFoundError:
raise UnityPolicyException(
"The model {0} was found but could not be loaded. Make "
"sure the model is from the same version of ML-Agents, has the same behavior parameters, "
"and is using the same trainer configuration as the current run.".format(
model_path
)
)
if reset_global_steps:
logger.info(
"Starting training from step 0 and saving to {}.".format(
self.model_path
)
)
else:
logger.info(
"Resuming training from step {}.".format(self.get_current_step())
)
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
# TF defaults to 32-bit, so we use the same here.
torch.set_default_tensor_type(torch.DoubleTensor)
def initialize_or_load(self):
# If there is an initialize path, load from that. Else, load from the set model path.
# If load is set to True, don't reset steps to 0. Else, do. This allows a user to,
# e.g., resume from an initialize path.
reset_steps = not self.load
if self.initialize_path is not None:
self._load_graph(self.initialize_path, reset_global_steps=reset_steps)
elif self.load:
self._load_graph(self.model_path, reset_global_steps=reset_steps)
else:
self._initialize_graph()
reward_signal_configs = trainer_params["reward_signals"]
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
def get_weights(self):
with self.graph.as_default():
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
values = [v.eval(session=self.sess) for v in _vars]
return values
self.actor = Actor(
h_size=int(trainer_params["hidden_units"]),
act_type=ActionType.CONTINUOUS,
vector_sizes=[brain.vector_observation_space_size],
act_size=sum(brain.vector_action_space_size),
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
def init_load_weights(self):
with self.graph.as_default():
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
values = [v.eval(session=self.sess) for v in _vars]
for var, value in zip(_vars, values):
assign_ph = tf.placeholder(var.dtype, shape=value.shape)
self.assign_phs.append(assign_ph)
self.assign_ops.append(tf.assign(var, assign_ph))
self.critic = Critic(
h_size=int(trainer_params["hidden_units"]),
vector_sizes=[brain.vector_observation_space_size],
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
def load_weights(self, values):
if len(self.assign_ops) == 0:
logger.warning(
"Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
def split_decision_step(self, decision_requests):
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = np.ones(
(len(decision_requests), np.sum(self.brain.vector_action_space_size)),
dtype=np.float32,
with self.graph.as_default():
feed_dict = {}
for assign_ph, value in zip(self.assign_phs, values):
feed_dict[assign_ph] = value
self.sess.run(self.assign_ops, feed_dict=feed_dict)
if decision_requests.action_mask is not None:
mask = 1 - np.concatenate(decision_requests.action_mask, axis=1)
return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.
"""
if self.use_vec_obs and self.normalize:
self.critic.network_body.normalize(vector_obs)
self.actor.network_body.normalize(vector_obs)
def execute_model(self, vec_obs, vis_obs, masks=None):
action_dists = self.actor(vec_obs, vis_obs, masks)
actions = []
log_probs = []
entropies = []
for action_dist in action_dists:
action = action_dist.sample()
actions.append(action)
log_probs.append(action_dist.log_prob(action))
entropies.append(action_dist.entropy())
actions = torch.stack(actions)
log_probs = torch.stack(log_probs)
entropies = torch.stack(entropies)
value_heads = self.critic(vec_obs, vis_obs)
return actions, log_probs, entropies, value_heads
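execute_model samples one action per distribution and stacks the per-branch log-probabilities and entropies. The same pattern, shown self-contained with plain torch.distributions (the two Normal branches here are illustrative, not the commit's API):

import torch
import torch.distributions as distributions

# Two independent action branches, e.g. a 2-D continuous action.
dists = [
    distributions.Normal(loc=torch.zeros(1), scale=torch.ones(1)),
    distributions.Normal(loc=torch.zeros(1), scale=torch.ones(1)),
]
actions = [d.sample() for d in dists]
log_probs = torch.stack([d.log_prob(a) for d, a in zip(dists, actions)])
entropies = torch.stack([d.entropy() for d in dists])
# One row per branch, matching the torch.stack calls in execute_model above.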
@timed
:param decision_requests: DecisionSteps input to network.
:return: Output from policy based on self.inference_dict.
:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
raise UnityPolicyException("The evaluate function was not implemented.")
vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
run_out = {}
action, log_probs, entropy, value_heads = self.execute_model(
vec_obs, vis_obs, masks
)
run_out["action"] = np.array(action.detach())
run_out["log_probs"] = np.array(log_probs.detach())
run_out["entropy"] = np.array(entropy.detach())
run_out["value_heads"] = {
name: np.array(t.detach()) for name, t in value_heads.items()
}
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
run_out["learning_rate"] = 0.0
self.actor.network_body.update_normalization(vec_obs)
self.critic.network_body.update_normalization(vec_obs)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0

:param decision_requests: A dictionary of brain names and DecisionSteps from environment.
:param worker_id: In parallel environment training, the unique id of the environment worker that
the DecisionSteps came from. Used to construct a globally unique id for each agent.
:param worker_id:
:param decision_requests: A dictionary of brain names and BrainInfo from environment.
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))
for agent_id in decision_requests.agent_id
] # For 1-D array, the iterator order is correct.
run_out = self.evaluate( # pylint: disable=assignment-from-no-return
run_out = self.evaluate(
)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
) # pylint: disable=assignment-from-no-return
agent_ids=decision_requests.agent_id,
agent_ids=list(decision_requests.agent_id),
def update(self, mini_batch, num_sequences):
"""
Performs update of the policy.
:param num_sequences: Number of experience trajectories in batch.
:param mini_batch: Batch of experiences.
:return: Results of update.
"""
raise UnityPolicyException("The update function was not implemented.")
def _execute_model(self, feed_dict, out_dict):
"""
Executes model.
:param feed_dict: Input dictionary mapping nodes to input data.
:param out_dict: Output dictionary mapping names to nodes.
:return: Dictionary mapping names to input data.
"""
network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
run_out = dict(zip(list(out_dict.keys()), network_out))
return run_out
def fill_eval_dict(self, batched_step_result):
vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
mask = None
if not self.use_continuous_act:
mask = np.ones(
(len(batched_step_result), np.sum(self.brain.vector_action_space_size)),
dtype=np.float32,
)
if batched_step_result.action_mask is not None:
mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
@property
def vis_obs_size(self):
return self.brain.number_visual_observations
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
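A short illustration of the memory bookkeeping above; the agent ids are made up and the policy is assumed to have m_size == 4:

import numpy as np

# policy: an instantiated policy with m_size == 4 (assumed for the example).
memory_out = np.arange(8, dtype=np.float32).reshape(2, 4)
policy.save_memories(["agent-0", "agent-1"], memory_out)

# Retrieving for a partially overlapping set of agents: unknown ids come back as zeros.
mems = policy.retrieve_memories(["agent-1", "agent-2"])
# mems[0] equals memory_out[1]; mems[1] is all zeros.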
@property
def vec_obs_size(self):
return self.brain.vector_observation_space_size
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
@property
def use_vis_obs(self):
return self.vis_obs_size > 0
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
@property
def use_vec_obs(self):
return self.vec_obs_size > 0
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
@property
def use_recurrent(self):
return False
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
@property
def use_continuous_act(self):
return True
def get_current_step(self):
"""

return self.global_step
def _set_step(self, step: int) -> int:
"""
Sets current model step to step without creating additional ops.
:param step: Step to set the current model step to.
:return: The step the model was set to.
"""
current_step = self.get_current_step()
# Increment a positive or negative number of steps.
return self.increment_step(step - current_step)
step = self.global_step.detach().numpy()
return step
self.global_step += n_steps
return self.global_step
self.global_step = self.global_step + n_steps
return self.get_current_step()
def get_inference_vars(self):
"""
:return:list of inference var names
"""
return list(self.inference_dict.keys())
def save_model(self, step):
pass
def get_update_vars(self):
"""
:return:list of update var names
"""
return list(self.update_dict.keys())
def save_model(self, steps):
"""
Saves the model
:param steps: The number of steps the model was trained for
:return:
"""
with self.graph.as_default():
last_checkpoint = self.model_path + "/model-" + str(steps) + ".ckpt"
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False
)
def update_normalization(self, vector_obs: np.ndarray) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.
"""
return None
@property
def use_vis_obs(self):
return self.vis_obs_size > 0
@property
def use_vec_obs(self):
return self.vec_obs_size > 0
def export_model(self):
pass

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (39 changed lines)


from typing import Any, Dict
from typing import Any, Dict, List, Tuple
import numpy as np
import torch

from mlagents_envs.timers import timed
from mlagents.trainers.policy.nn_torch_policy import NNPolicy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
def __init__(self, policy: NNPolicy, trainer_params: Dict[str, Any]):
def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.

self.stream_names = list(self.reward_signals.keys())
self.create_reward_signals(reward_signal_configs)
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self.policy, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
def ppo_value_loss(self, values, old_values, returns):
"""
Creates training-specific Tensorflow ops for PPO models.

returns[name] = batch["{}_returns".format(name)]
old_values[name] = batch["{}_value_estimates".format(name)]
obs = np.array(batch["vector_obs"])
values = self.policy.critic(obs)
dist = self.policy.actor(obs)
probs = dist.log_prob(torch.from_numpy(np.array(batch["actions"])))
entropy = dist.entropy()
vec_obs = np.array(batch["vector_obs"])
vis_obs = np.array(batch["visual_obs"])
actions, log_probs, entropy, values = self.policy.execute_model(
vec_obs, vis_obs
)
probs,
log_probs,
np.array(batch["action_probs"]),
np.array(batch["masks"], dtype=np.uint32),
)

value_estimates[k] = 0.0
return value_estimates
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
return ({"": np.zeros(0)}), {"": 0.0}

ml-agents/mlagents/trainers/ppo/trainer.py (20 changed lines)


from collections import defaultdict
import numpy as np
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

self._check_param_keys()
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: TorchPolicy = None # type: ignore
def _check_param_keys(self):
super()._check_param_keys()

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
) -> TorchPolicy:
:param parsed_behavior_id:
policy = NNPolicy(
policy = TorchPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,

create_tf_graph=False, # We will create the TF graph in the Optimizer
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: TorchPolicy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
if not isinstance(policy, Policy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)

self.step = policy.get_current_step()
self.next_summary_step = self._get_next_summary_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> TorchPolicy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changed lines)


from collections import defaultdict
import abc
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException

        for agent_id in rewards:
            rewards[agent_id] = 0

    def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
    def _update_end_episode_stats(
        self, agent_id: str, optimizer: TorchOptimizer
    ) -> None:
        for name, rewards in self.collected_rewards.items():
            if name == "environment":
                self.stats_reporter.add_stat(

ml-agents/mlagents/trainers/trainer/trainer.py (23 changed lines)


from collections import deque
from mlagents_envs.logging_util import get_logger
from mlagents.model_serialization import export_policy_model, SerializationSettings
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

self._stats_reporter = StatsReporter(self.summary_path)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[Policy]] = []
self.policy_queues: List[AgentManagerQueue[TorchPolicy]] = []
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.step: int = 0
self.summary_freq = self.trainer_parameters["summary_freq"]

"""
Exports the model
"""
policy = self.get_policy(name_behavior_id)
settings = SerializationSettings(policy.model_path, policy.brain.brain_name)
export_policy_model(settings, policy.graph, policy.sess)
print("Export")
# policy = self.get_policy(name_behavior_id)
# settings = SerializationSettings(policy.model_path, policy.brain.brain_name)
# export_policy_model(settings, policy.graph, policy.sess)
@abc.abstractmethod
def end_episode(self):

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
) -> TorchPolicy:
"""
Creates policy
"""

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: TorchPolicy
) -> None:
"""
Adds policy to trainer.

@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> TorchPolicy:
"""
Gets policy from trainer.
"""

"""
pass
def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
def publish_policy_queue(
self, policy_queue: AgentManagerQueue[TorchPolicy]
) -> None:
"""
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update

ml-agents/mlagents/trainers/policy/nn_torch_policy.py (288 changed lines)


from typing import Any, Dict
import numpy as np
import torch
from mlagents_envs.base_env import DecisionSteps
from torch import nn
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models_torch import (
ActionType,
VectorEncoder,
ValueHeads,
Normalizer,
ModelUtils,
)
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.distributions_torch import (
GaussianDistribution,
MultiCategoricalDistribution,
)
EPSILON = 1e-7 # Small value to avoid divide by zero
class NetworkBody(nn.Module):
def __init__(
self,
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
):
super(NetworkBody, self).__init__()
self.normalize = normalize
self.visual_encoders = []
self.vector_encoders = []
self.vector_normalizers = []
self.use_lstm = use_lstm
self.h_size = h_size
self.m_size = m_size
visual_encoder = ModelUtils.get_encoder_for_type(vis_encode_type)
for vector_size in vector_sizes:
self.vector_normalizers.append(Normalizer(vector_size))
self.vector_encoders.append(VectorEncoder(vector_size, h_size, num_layers))
for visual_size in visual_sizes:
self.visual_encoders.append(visual_encoder(visual_size))
if use_lstm:
self.lstm = nn.LSTM(h_size, h_size, 1)
def clear_memory(self, batch_size):
self.memory = (
torch.zeros(1, batch_size, self.m_size),
torch.zeros(1, batch_size, self.m_size),
)
def update_normalization(self, inputs):
if self.normalize:
self.normalizer.update(inputs)
def forward(self, vec_inputs, vis_inputs):
vec_embeds = []
for idx, encoder in enumerate(self.vector_encoders):
vec_input = vec_inputs[idx]
if self.normalize:
vec_input = self.normalizers[idx](vec_inputs[idx])
hidden = encoder(vec_input)
vec_embeds.append(hidden)
vis_embeds = []
for idx, encoder in enumerate(self.visual_encoders):
hidden = encoder(vis_inputs[idx])
vis_embeds.append(hidden)
vec_embeds = torch.cat(vec_embeds)
vis_embeds = torch.cat(vis_embeds)
embedding = torch.cat([vec_embeds, vis_embeds])
if self.use_lstm:
embedding, self.memory = self.lstm(embedding, self.memory)
return embedding
class Actor(nn.Module):
def __init__(
self,
h_size,
vector_sizes,
visual_sizes,
act_size,
normalize,
num_layers,
m_size,
vis_encode_type,
act_type,
use_lstm,
):
super(Actor, self).__init__()
self.act_type = act_type
self.act_size = act_size
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
)
if self.act_type == ActionType.CONTINUOUS:
self.distribution = GaussianDistribution(h_size, act_size)
else:
self.distribution = MultiCategoricalDistribution(h_size, act_size)
def forward(self, vec_inputs, vis_inputs, masks=None):
embedding = self.network_body(vec_inputs, vis_inputs)
if self.act_type == ActionType.CONTINUOUS:
dist = self.distribution(embedding)
else:
dist = self.distribution(embedding, masks=masks)
return dist
class Critic(nn.Module):
def __init__(
self,
stream_names,
h_size,
vector_sizes,
visual_sizes,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
):
super(Critic, self).__init__()
self.stream_names = stream_names
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,
h_size,
normalize,
num_layers,
m_size,
vis_encode_type,
use_lstm,
)
self.value_heads = ValueHeads(stream_names, h_size)
def forward(self, vec_inputs, vis_inputs):
embedding = self.network_body(vec_inputs, vis_inputs)
return self.value_heads(embedding)
class NNPolicy(TorchPolicy):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_params: Defined training parameters.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param tanh_squash: Whether to use a tanh function on the continuous output,
or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy
in continuous output.
"""
super().__init__(seed, brain, trainer_params, load)
self.grads = None
num_layers = trainer_params["num_layers"]
self.h_size = trainer_params["hidden_units"]
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = EncoderType(
trainer_params.get("vis_encode_type", "simple")
)
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
# TF defaults to 32-bit, so we use the same here.
torch.set_default_tensor_type(torch.DoubleTensor)
reward_signal_configs = trainer_params["reward_signals"]
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.actor = Actor(
h_size=int(trainer_params["hidden_units"]),
act_type=ActionType.CONTINUOUS,
vector_sizes=[brain.vector_observation_space_size],
act_size=sum(brain.vector_action_space_size),
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
self.critic = Critic(
h_size=int(trainer_params["hidden_units"]),
vector_sizes=[brain.vector_observation_space_size],
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
def split_decision_step(self, decision_requests):
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = np.ones(
(len(decision_requests), np.sum(self.brain.vector_action_space_size)),
dtype=np.float32,
)
if decision_requests.action_mask is not None:
mask = 1 - np.concatenate(decision_requests.action_mask, axis=1)
return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
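To make the masking concrete: action_mask holds one boolean array per discrete branch with True marking a disallowed action, so the concatenation and the 1 - ... flip above produce a single matrix in which 1 means the action is available. A small worked example with made-up branch sizes:

import numpy as np

# Two discrete branches of size 3 and 2, one agent; True marks a forbidden action.
action_mask = [
    np.array([[False, True, False]]),   # branch 0: action 1 is blocked
    np.array([[False, False]]),         # branch 1: nothing blocked
]
mask = 1 - np.concatenate(action_mask, axis=1)
# mask == [[1, 0, 1, 1, 1]]: a 1 means the action is available to the distribution.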
def execute_model(self, vec_obs, vis_obs, masks):
action_dist = self.actor(vec_obs, vis_obs, masks)
action = action_dist.sample()
log_probs = action_dist.log_prob(action)
entropy = action_dist.entropy()
value_heads = self.critic(vec_obs, vis_obs)
return action, log_probs, entropy, value_heads
@timed
def evaluate(self, decision_requests: DecisionSteps) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
run_out = {}
action, log_probs, entropy, value_heads = self.execute_model(
vec_obs, vis_obs, masks
)
run_out["action"] = np.array(action.detach())
run_out["log_probs"] = np.array(log_probs.detach())
run_out["entropy"] = np.array(entropy.detach())
run_out["value_heads"] = {
name: np.array(t.detach()) for name, t in value_heads.items()
}
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
run_out["learning_rate"] = 0.0
self.actor.network_body.update_normalization(vec_obs)
self.critic.network_body.update_normalization(vec_obs)
return run_out