浏览代码

[feature] Add experimental PyTorch support (#4335)

* Begin porting work

* Add ResNet and distributions

* Dynamically construct actor and critic

* Initial optimizer port

* Refactoring policy and optimizer

* Resolving a few bugs

* Share more code between tf and torch policies

* Slightly closer to running model

* Training runs, but doesn’t actually work

* Fix a couple additional bugs

* Add conditional sigma for distribution

* Fix normalization

* Support discrete actions as well

* Continuous and discrete now train

* Mulkti-discrete now working

* Visual observations now train as well

* GRU in-progress and dynamic cnns

* Fix for memories

* Remove unused arg

* Combine actor and critic classes. Initial export.

* Support tf and pytorch alongside one another

* Prepare model for onnx export

* Use LSTM and fix a few merge errors

* Fix bug in probs calculation

* Optimize np -> tensor operations

* Time action sample funct...
/MLA-1734-demo-provider
GitHub 4 年前
当前提交
1955af9e
共有 52 个文件被更改,包括 5374 次插入156 次删除
  1. 4
      com.unity.ml-agents/CHANGELOG.md
  2. 2
      ml-agents/mlagents/trainers/buffer.py
  3. 7
      ml-agents/mlagents/trainers/cli_utils.py
  4. 12
      ml-agents/mlagents/trainers/ghost/trainer.py
  5. 2
      ml-agents/mlagents/trainers/policy/tf_policy.py
  6. 76
      ml-agents/mlagents/trainers/ppo/trainer.py
  7. 120
      ml-agents/mlagents/trainers/sac/trainer.py
  8. 14
      ml-agents/mlagents/trainers/settings.py
  9. 7
      ml-agents/mlagents/trainers/tests/test_ghost.py
  10. 5
      ml-agents/mlagents/trainers/tests/test_rl_trainer.py
  11. 3
      ml-agents/mlagents/trainers/tests/test_sac.py
  12. 2
      ml-agents/mlagents/trainers/tests/test_simple_rl.py
  13. 19
      ml-agents/mlagents/trainers/tests/torch/test_layers.py
  14. 17
      ml-agents/mlagents/trainers/tests/torch/test_networks.py
  15. 6
      ml-agents/mlagents/trainers/tests/torch/test_utils.py
  16. 20
      ml-agents/mlagents/trainers/tf/model_serialization.py
  17. 17
      ml-agents/mlagents/trainers/torch/encoders.py
  18. 67
      ml-agents/mlagents/trainers/torch/layers.py
  19. 115
      ml-agents/mlagents/trainers/torch/networks.py
  20. 4
      ml-agents/mlagents/trainers/torch/utils.py
  21. 99
      ml-agents/mlagents/trainers/trainer/rl_trainer.py
  22. 5
      ml-agents/mlagents/trainers/trainer/trainer.py
  23. 7
      ml-agents/mlagents/trainers/trainer_controller.py
  24. 94
      ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
  25. 281
      ml-agents/mlagents/trainers/policy/torch_policy.py
  26. 203
      ml-agents/mlagents/trainers/ppo/optimizer_torch.py
  27. 561
      ml-agents/mlagents/trainers/sac/optimizer_torch.py
  28. 118
      ml-agents/mlagents/trainers/saver/torch_saver.py
  29. 1001
      ml-agents/mlagents/trainers/tests/torch/test.demo
  30. 144
      ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py
  31. 177
      ml-agents/mlagents/trainers/tests/torch/test_ghost.py
  32. 150
      ml-agents/mlagents/trainers/tests/torch/test_policy.py
  33. 505
      ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
  34. 446
      ml-agents/mlagents/trainers/tests/torch/testdcvis.demo
  35. 74
      ml-agents/mlagents/trainers/torch/model_serialization.py
  36. 111
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py
  37. 56
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
  38. 138
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py
  39. 32
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
  40. 0
      ml-agents/mlagents/trainers/torch/components/__init__.py
  41. 0
      ml-agents/mlagents/trainers/torch/components/bc/__init__.py
  42. 183
      ml-agents/mlagents/trainers/torch/components/bc/module.py
  43. 15
      ml-agents/mlagents/trainers/torch/components/reward_providers/__init__.py
  44. 72
      ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py
  45. 15
      ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
  46. 43
      ml-agents/mlagents/trainers/torch/components/reward_providers/reward_provider_factory.py
  47. 225
      ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py
  48. 256
      ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py

4
com.unity.ml-agents/CHANGELOG.md


- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into to stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
- Experimental PyTorch support has been added. Use `--torch` when running `mlagents-learn`, or add
`framework: pytorch` to your trainer configuration (under the behavior name) to enable it.
Note that PyTorch 1.6.0 or greater should be installed to use this feature; see
[the PyTorch website](https://pytorch.org/) for installation instructions. (#4335)
### Minor Changes
#### com.unity.ml-agents (C#)

2
ml-agents/mlagents/trainers/buffer.py


Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data))
self += list(np.array(data, dtype=np.float32))
def set(self, data):
"""

7
ml-agents/mlagents/trainers/cli_utils.py


action=DetectDefaultStoreTrue,
help="Forces training using CPU only",
)
argparser.add_argument(
"--torch",
default=False,
action=DetectDefaultStoreTrue,
help="(Experimental) Use the PyTorch framework instead of TensorFlow. Install PyTorch "
"before using this option",
)
eng_conf = argparser.add_argument_group(title="Engine Configuration")
eng_conf.add_argument(

12
ml-agents/mlagents/trainers/ghost/trainer.py


self.trainer.save_model()
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> Policy:
"""
Creates policy with the wrapped trainer's create_policy function

wrapped trainer to be trained.
"""
policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec)
policy.create_tf_graph()
policy = self.trainer.create_policy(
parsed_behavior_id, behavior_spec, create_graph=True
)
policy.init_load_weights()
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)

parsed_behavior_id, behavior_spec
)
self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy)
internal_trainer_policy.init_load_weights()
self.current_policy_snapshot[
parsed_behavior_id.brain_name
] = internal_trainer_policy.get_weights()

2
ml-agents/mlagents/trainers/policy/tf_policy.py


# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self.initialize()
# Create assignment ops for Ghost Trainer
self.init_load_weights()
def _create_encoder(
self,

76
ml-agents/mlagents/trainers/ppo/trainer.py


from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
from mlagents.trainers.components.reward_signals import RewardSignal
try:
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
except ModuleNotFoundError:
TorchPolicy = None # type: ignore
TorchPPOOptimizer = None # type: ignore
logger = get_logger(__name__)

trajectory.next_obs,
trajectory.done_reached and not trajectory.interrupted,
)
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
if isinstance(self.optimizer.reward_signals[name], RewardSignal):
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
else:
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(v),
)
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(

evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
if isinstance(reward_signal, RewardSignal):
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
else:
evaluate_result = (
reward_signal.evaluate(agent_buffer_trajectory)
* reward_signal.strength
)
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

local_value_estimates = agent_buffer_trajectory[
f"{name}_value_estimates"
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,

self._clear_update_buffer()
return True
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
def create_tf_policy(
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
Creates a PPO policy to trainers list of policies.
Creates a policy with a Tensorflow backend and PPO hyperparameters
:param parsed_behavior_id:
:param create_graph: whether to create the Tensorflow graph on construction
:return policy
"""
policy = TFPolicy(

condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
create_tf_graph=create_graph,
return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
"""
Creates a policy with a PyTorch backend and PPO hyperparameters
:param parsed_behavior_id:
:param behavior_spec: specifications for policy construction
:return policy
"""
policy = TorchPolicy(
self.seed,
behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=False, # Faster training for PPO
separate_critic=behavior_spec.is_action_continuous(),
)
return PPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
if self.framework == FrameworkType.PYTORCH:
return TorchPPOOptimizer( # type: ignore
cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore
else:
return PPOOptimizer( # type: ignore
cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy

)
self.policy = policy
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = self.create_ppo_optimizer()
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

120
ml-agents/mlagents/trainers/sac/trainer.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, SACSettings
from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
from mlagents.trainers.components.reward_signals import RewardSignal
try:
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
except ModuleNotFoundError:
TorchPolicy = None # type: ignore
TorchSACOptimizer = None # type: ignore
logger = get_logger(__name__)

agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
if isinstance(reward_signal, RewardSignal):
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
else:
evaluate_result = (
reward_signal.evaluate(agent_buffer_trajectory)
* reward_signal.strength
)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

)
for name, v in value_estimates.items():
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
if isinstance(self.optimizer.reward_signals[name], RewardSignal):
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
else:
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
np.mean(v),
)
# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.

self._update_reward_signals()
return policy_was_updated
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
policy = TFPolicy(
self.seed,
behavior_spec,
self.trainer_settings,
tanh_squash=True,
reparameterize=True,
create_tf_graph=False,
)
def maybe_load_replay_buffer(self):
# Load the replay buffer if load
if self.load and self.checkpoint_replay_buffer:
try:

)
)
def create_tf_policy(
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> TFPolicy:
"""
Creates a policy with a Tensorflow backend and SAC hyperparameters
:param parsed_behavior_id:
:param behavior_spec: specifications for policy construction
:param create_graph: whether to create the Tensorflow graph on construction
:return policy
"""
policy = TFPolicy(
self.seed,
behavior_spec,
self.trainer_settings,
tanh_squash=True,
reparameterize=True,
create_tf_graph=create_graph,
)
self.maybe_load_replay_buffer()
return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
"""
Creates a policy with a PyTorch backend and SAC hyperparameters
:param parsed_behavior_id:
:param behavior_spec: specifications for policy construction
:return policy
"""
policy = TorchPolicy(
self.seed,
behavior_spec,
self.trainer_settings,
condition_sigma_on_obs=True,
tanh_squash=True,
separate_critic=True,
)
self.maybe_load_replay_buffer()
return policy
def _update_sac_policy(self) -> bool:

)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
sampled_minibatch
).scaled_reward
if isinstance(signal, RewardSignal):
sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
sampled_minibatch
).scaled_reward
else:
sampled_minibatch[f"{name}_rewards"] = (
signal.evaluate(sampled_minibatch) * signal.strength
)
update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
for stat_name, value in update_stats.items():

reward_signal_minibatches = {}
for name, signal in self.optimizer.reward_signals.items():
logger.debug(f"Updating {name} at step {self.step}")
# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
if isinstance(signal, RewardSignal):
# Some signals don't need a minibatch to be sampled - so we don't!
if signal.update_dict:
reward_signal_minibatches[name] = buffer.sample_mini_batch(
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
)
update_stats = self.optimizer.update_reward_signals(
reward_signal_minibatches, n_sequences
)

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def create_sac_optimizer(self) -> SACOptimizer:
return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
if self.framework == FrameworkType.PYTORCH:
return TorchSACOptimizer( # type: ignore
cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore
else:
return SACOptimizer( # type: ignore
cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy

14
ml-agents/mlagents/trainers/settings.py


return _mapping[self]
class FrameworkType(Enum):
TENSORFLOW: str = "tensorflow"
PYTORCH: str = "pytorch"
@attr.s(auto_attribs=True)
class TrainerSettings(ExportableSettings):
trainer_type: TrainerType = TrainerType.PPO

threaded: bool = True
self_play: Optional[SelfPlaySettings] = None
behavioral_cloning: Optional[BehavioralCloningSettings] = None
framework: FrameworkType = FrameworkType.TENSORFLOW
cattr.register_structure_hook(
Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure

configured_dict["engine_settings"][key] = val
else: # Base options
configured_dict[key] = val
return RunOptions.from_dict(configured_dict)
# Apply --torch retroactively
final_runoptions = RunOptions.from_dict(configured_dict)
if "torch" in DetectDefault.non_default_args:
for trainer_set in final_runoptions.behaviors.values():
trainer_set.framework = FrameworkType.PYTORCH
return final_runoptions
@staticmethod
def from_dict(options_dict: Dict[str, Any]) -> "RunOptions":

7
ml-agents/mlagents/trainers/tests/test_ghost.py


trainer_params = dummy_config
trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
trainer.seed = 1
policy = trainer.create_policy("test", mock_specs)
policy.create_tf_graph()
policy = trainer.create_policy("test", mock_specs, create_graph=True)
to_load_policy = trainer.create_policy("test", mock_specs)
to_load_policy.create_tf_graph()
to_load_policy.init_load_weights()
to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True)
weights = policy.get_weights()
load_weights = to_load_policy.get_weights()

5
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


mock_saver.save_checkpoint.side_effect = checkpoint_path
self.saver = mock_saver
def create_policy(self):
def create_tf_policy(self, parsed_behavior_id, behavior_spec):
return mock.Mock()
def create_torch_policy(self, parsed_behavior_id, behavior_spec):
return mock.Mock()
def _process_trajectory(self, trajectory):

3
ml-agents/mlagents/trainers/tests/test_sac.py


0, mock_brain, trainer_settings, "test", False, create_tf_graph=False
)
optimizer = SACOptimizer(policy, trainer_settings)
policy.initialize()
optimizer.policy.initialize()
return optimizer

trainer.add_policy(behavior_id, policy)
trainer.saver.initialize_or_load(policy)
trainer.optimizer.update = mock.Mock()
trainer.saver.initialize_or_load(policy)
trainer.optimizer.update_reward_signals = mock.Mock()
trainer.optimizer.update_reward_signals.return_value = {}
trainer.optimizer.update.return_value = {}

2
ml-agents/mlagents/trainers/tests/test_simple_rl.py


RewardSignalType,
EncoderType,
ScheduleType,
FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (

summary_freq=500,
max_steps=3000,
threaded=False,
framework=FrameworkType.TENSORFLOW,
)
SAC_CONFIG = TrainerSettings(

19
ml-agents/mlagents/trainers/tests/torch/test_layers.py


linear_layer,
lstm_layer,
Initialization,
LSTM,
)

assert torch.all(
torch.eq(param.data[4:8], torch.ones_like(param.data[4:8]))
)
def test_lstm_class():
torch.manual_seed(0)
input_size = 12
memory_size = 64
batch_size = 8
seq_len = 16
lstm = LSTM(input_size, memory_size)
assert lstm.memory_size == memory_size
sample_input = torch.ones((batch_size, seq_len, input_size))
sample_memories = torch.ones((1, batch_size, memory_size))
out, mem = lstm(sample_input, sample_memories)
# Hidden size should be half of memory_size
assert out.shape == (batch_size, seq_len, memory_size // 2)
assert mem.shape == (1, batch_size, memory_size)

17
ml-agents/mlagents/trainers/tests/torch/test_networks.py


assert act.shape == (1, 1)
# Test forward
actions, probs, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
# This is different from above for ONNX export
assert act.shape == (
act_size[0],
1,
) # This is different from above for ONNX export
assert act.shape == (act_size[0], 1)
assert act.shape == (1, 1)
assert act.shape == tuple(act_size)
# TODO: Once export works properly. fix the shapes here.
assert mem_size == 0
assert is_cont == int(action_type == ActionType.CONTINUOUS)
assert act_size_vec == torch.tensor(act_size)

if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
memories = torch.ones(
(
1,
network_settings.memory.sequence_length,
network_settings.memory.memory_size,
)
(1, network_settings.memory.sequence_length, actor.memory_size)
)
else:
sample_obs = torch.ones((1, obs_size))

6
ml-agents/mlagents/trainers/tests/torch/test_utils.py


masks = torch.tensor([False, False, False, False, False])
mean = ModelUtils.masked_mean(test_input, masks=masks)
assert mean == 0.0
# Make sure it works with 2d arrays of shape (mask_length, N)
test_input = torch.tensor([1, 2, 3, 4, 5]).repeat(2, 1).T
masks = torch.tensor([False, False, True, True, True])
mean = ModelUtils.masked_mean(test_input, masks=masks)
assert mean == 4.0

20
ml-agents/mlagents/trainers/tf/model_serialization.py


def export_policy_model(
model_path: str,
output_filepath: str,
brain_name: str,
behavior_name: str,
graph: tf.Graph,
sess: tf.Session,
) -> None:

:param output_filepath: file path to output the model (without file suffix)
:param brain_name: brain name of the trained model
:param behavior_name: behavior name of the trained model
frozen_graph_def = _make_frozen_graph(brain_name, graph, sess)
frozen_graph_def = _make_frozen_graph(behavior_name, graph, sess)
if not os.path.exists(output_filepath):
os.makedirs(output_filepath)
# Save frozen graph

if ONNX_EXPORT_ENABLED:
if SerializationSettings.convert_to_onnx:
try:
onnx_graph = convert_frozen_to_onnx(brain_name, frozen_graph_def)
onnx_graph = convert_frozen_to_onnx(behavior_name, frozen_graph_def)
onnx_output_path = f"{output_filepath}.onnx"
with open(onnx_output_path, "wb") as f:
f.write(onnx_graph.SerializeToString())

def _make_frozen_graph(
brain_name: str, graph: tf.Graph, sess: tf.Session
behavior_name: str, graph: tf.Graph, sess: tf.Session
target_nodes = ",".join(_process_graph(brain_name, graph))
target_nodes = ",".join(_process_graph(behavior_name, graph))
graph_def = graph.as_graph_def()
output_graph_def = graph_util.convert_variables_to_constants(
sess, graph_def, target_nodes.replace(" ", "").split(",")

def convert_frozen_to_onnx(brain_name: str, frozen_graph_def: tf.GraphDef) -> Any:
def convert_frozen_to_onnx(behavior_name: str, frozen_graph_def: tf.GraphDef) -> Any:
# This is basically https://github.com/onnx/tensorflow-onnx/blob/master/tf2onnx/convert.py
inputs = _get_input_node_names(frozen_graph_def)

)
onnx_graph = optimizer.optimize_graph(g)
model_proto = onnx_graph.make_model(brain_name)
model_proto = onnx_graph.make_model(behavior_name)
return model_proto

return names
def _process_graph(brain_name: str, graph: tf.Graph) -> List[str]:
def _process_graph(behavior_name: str, graph: tf.Graph) -> List[str]:
"""
Gets the list of the output nodes present in the graph for inference
:return: list of node names

logger.info("List of nodes to export for brain :" + brain_name)
logger.info("List of nodes to export for behavior :" + behavior_name)
for n in nodes:
logger.info("\t" + n)
return nodes

17
ml-agents/mlagents/trainers/torch/encoders.py


super().__init__()
n_channels = [16, 32, 32] # channel for each stack
n_blocks = 2 # number of residual blocks
self.layers = []
layers = []
self.layers.append(
nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1)
)
self.layers.append(nn.MaxPool2d([3, 3], [2, 2]))
layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1))
layers.append(nn.MaxPool2d([3, 3], [2, 2]))
self.layers.append(ResNetBlock(channel))
layers.append(ResNetBlock(channel))
self.layers.append(Swish())
layers.append(Swish())
self.dense = linear_layer(
n_channels[-1] * height * width,
final_hidden,

self.sequential = nn.Sequential(*layers)
hidden = visual_obs
for layer in self.layers:
hidden = layer(hidden)
hidden = self.sequential(visual_obs)
before_out = hidden.view(batch_size, -1)
return torch.relu(self.dense(before_out))

67
ml-agents/mlagents/trainers/torch/layers.py


import torch
import abc
from typing import Tuple
from enum import Enum

forget_bias
)
return lstm
class MemoryModule(torch.nn.Module):
@abc.abstractproperty
def memory_size(self) -> int:
"""
Size of memory that is required at the start of a sequence.
"""
pass
@abc.abstractmethod
def forward(
self, input_tensor: torch.Tensor, memories: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Pass a sequence to the memory module.
:input_tensor: Tensor of shape (batch_size, seq_length, size) that represents the input.
:memories: Tensor of initial memories.
:return: Tuple of output, final memories.
"""
pass
class LSTM(MemoryModule):
"""
Memory module that implements LSTM.
"""
def __init__(
self,
input_size: int,
memory_size: int,
num_layers: int = 1,
forget_bias: float = 1.0,
kernel_init: Initialization = Initialization.XavierGlorotUniform,
bias_init: Initialization = Initialization.Zero,
):
super().__init__()
# We set hidden size to half of memory_size since the initial memory
# will be divided between the hidden state and initial cell state.
self.hidden_size = memory_size // 2
self.lstm = lstm_layer(
input_size,
self.hidden_size,
num_layers,
True,
forget_bias,
kernel_init,
bias_init,
)
@property
def memory_size(self) -> int:
return 2 * self.hidden_size
def forward(
self, input_tensor: torch.Tensor, memories: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
# We don't use torch.split here since it is not supported by Barracuda
h0 = memories[:, :, : self.hidden_size]
c0 = memories[:, :, self.hidden_size :]
hidden = (h0, c0)
lstm_out, hidden_out = self.lstm(input_tensor, hidden)
output_mem = torch.cat(hidden_out, dim=-1)
return lstm_out, output_mem

115
ml-agents/mlagents/trainers/torch/networks.py


from typing import Callable, List, Dict, Tuple, Optional
import attr
import abc
import torch

from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.layers import lstm_layer
from mlagents.trainers.torch.layers import LSTM
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
EncoderFunction = Callable[

)
if self.use_lstm:
self.lstm = lstm_layer(self.h_size, self.m_size // 2, batch_first=True)
self.lstm = LSTM(self.h_size, self.m_size)
self.lstm = None
self.lstm = None # type: ignore
def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None:
for vec_input, vec_enc in zip(vec_inputs, self.vector_encoders):

for n1, n2 in zip(self.vector_encoders, other_network.vector_encoders):
n1.copy_normalization(n2)
@property
def memory_size(self) -> int:
return self.lstm.memory_size if self.use_lstm else 0
def forward(
self,
vec_inputs: List[torch.Tensor],

sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
vec_encodes = []
encodes = []
for idx, encoder in enumerate(self.vector_encoders):
vec_input = vec_inputs[idx]
if actions is not None:

vec_encodes.append(hidden)
encodes.append(hidden)
vis_encodes = []
vis_input = vis_input.permute([0, 3, 1, 2])
if not torch.onnx.is_in_onnx_export():
vis_input = vis_input.permute([0, 3, 1, 2])
vis_encodes.append(hidden)
encodes.append(hidden)
if len(vec_encodes) > 0 and len(vis_encodes) > 0:
vec_encodes_tensor = torch.stack(vec_encodes, dim=-1).sum(dim=-1)
vis_encodes_tensor = torch.stack(vis_encodes, dim=-1).sum(dim=-1)
encoding = torch.stack(
[vec_encodes_tensor, vis_encodes_tensor], dim=-1
).sum(dim=-1)
elif len(vec_encodes) > 0:
encoding = torch.stack(vec_encodes, dim=-1).sum(dim=-1)
elif len(vis_encodes) > 0:
encoding = torch.stack(vis_encodes, dim=-1).sum(dim=-1)
else:
if len(encodes) == 0:
# Constants don't work in Barracuda
encoding = encodes[0]
if len(encodes) > 1:
for _enc in encodes[1:]:
encoding += _enc
memories = torch.split(memories, self.m_size // 2, dim=-1)
memories = torch.cat(memories, dim=-1)
return encoding, memories

encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream)
@property
def memory_size(self) -> int:
return self.network_body.memory_size
def forward(
self,
vec_inputs: List[torch.Tensor],

vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, int, int, int, int]:
) -> Tuple[torch.Tensor, int, int, int, int]:
"""
Forward pass of the Actor for inference. This is required for export to ONNX, and
the inputs and outputs of this method should not be changed without a respective change

"""
pass
@abc.abstractproperty
def memory_size(self):
"""
Returns the size of the memory (same size used as input and output in the other
methods) used by this Actor.
"""
pass
class SimpleActor(nn.Module, Actor):
def __init__(

self.act_type = act_type
self.act_size = act_size
self.version_number = torch.nn.Parameter(torch.Tensor([2.0]))
self.memory_size = torch.nn.Parameter(torch.Tensor([0]))
self.is_continuous_int = torch.nn.Parameter(
torch.Tensor([int(act_type == ActionType.CONTINUOUS)])
)

self.encoding_size = network_settings.memory.memory_size // 2
else:
self.encoding_size = network_settings.hidden_units
if self.act_type == ActionType.CONTINUOUS:
self.distribution = GaussianDistribution(
self.encoding_size,

self.encoding_size, act_size
)
@property
def memory_size(self) -> int:
return self.network_body.memory_size
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(vector_obs)

vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, int, int, int, int]:
) -> Tuple[torch.Tensor, int, int, int, int]:
dists, _ = self.get_dists(
vec_inputs, vis_inputs, masks, memories, sequence_length
)
dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
if self.act_type == ActionType.CONTINUOUS:
action_out = sampled_actions
else:
action_out = dists[0].all_log_prob()
sampled_actions,
dists[0].pdf(sampled_actions),
action_out,
self.memory_size,
torch.Tensor([self.network_body.memory_size]),
self.is_continuous_int,
self.act_size_vector,
)

# Give the Actor only half the memories. Note we previously validate
# that memory_size must be a multiple of 4.
self.use_lstm = network_settings.memory is not None
if network_settings.memory is not None:
self.half_mem_size = network_settings.memory.memory_size // 2
new_memory_settings = attr.evolve(
network_settings.memory, memory_size=self.half_mem_size
)
use_network_settings = attr.evolve(
network_settings, memory=new_memory_settings
)
else:
use_network_settings = network_settings
self.half_mem_size = 0
use_network_settings,
network_settings,
act_type,
act_size,
conditional_sigma,

self.critic = ValueNetwork(
stream_names, observation_shapes, use_network_settings
)
self.critic = ValueNetwork(stream_names, observation_shapes, network_settings)
@property
def memory_size(self) -> int:
return self.network_body.memory_size + self.critic.memory_size
def critic_pass(
self,

actor_mem, critic_mem = None, None
if self.use_lstm:
# Use only the back half of memories for critic
actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1)
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
value_outputs, critic_mem_out = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)

) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.half_mem_size, dim=-1)
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
else:
critic_mem = None
actor_mem = None

class GlobalSteps(nn.Module):
def __init__(self):
super().__init__()
self.global_step = torch.Tensor([0])
self.__global_step = nn.Parameter(torch.Tensor([0]), requires_grad=False)
@property
def current_step(self):
return int(self.__global_step.item())
@current_step.setter
def current_step(self, value):
self.__global_step[:] = value
self.global_step += value
self.__global_step += value
class LearningRate(nn.Module):

4
ml-agents/mlagents/trainers/torch/utils.py


:param tensor: Tensor which needs mean computation.
:param masks: Boolean tensor of masks with same dimension as tensor.
"""
return (tensor * masks).sum() / torch.clamp(masks.float().sum(), min=1.0)
return (tensor.T * masks).sum() / torch.clamp(
(torch.ones_like(tensor.T) * masks).float().sum(), min=1.0
)

99
ml-agents/mlagents/trainers/trainer/rl_trainer.py


from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.components.reward_signals import RewardSignalResult
from mlagents.trainers.components.reward_signals import RewardSignalResult, RewardSignal
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.settings import TrainerSettings, FrameworkType
from mlagents.trainers.exception import UnityTrainerException
try:
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.saver.torch_saver import TorchSaver
except ModuleNotFoundError:
TorchPolicy = None # type: ignore
RewardSignalResults = Dict[str, RewardSignalResult]

self._stats_reporter.add_property(
StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
)
self.framework = self.trainer_settings.framework
logger.debug(f"Using framework {self.framework.value}")
self.trainer_settings, self.artifact_path, self.load
self.framework, self.trainer_settings, self.artifact_path, self.load
)
def end_episode(self) -> None:

for agent_id in rewards:
rewards[agent_id] = 0
@staticmethod
def create_saver(
trainer_settings: TrainerSettings, model_path: str, load: bool
) -> BaseSaver:
saver = TFSaver(trainer_settings, model_path, load)
return saver
def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
for name, rewards in self.collected_rewards.items():
if name == "environment":

self.reward_buffer.appendleft(rewards.get(agent_id, 0))
rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
)
if isinstance(optimizer.reward_signals[name], RewardSignal):
self.stats_reporter.add_stat(
optimizer.reward_signals[name].stat_name,
rewards.get(agent_id, 0),
)
else:
self.stats_reporter.add_stat(
f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward",
rewards.get(agent_id, 0),
)
rewards[agent_id] = 0
def _clear_update_buffer(self) -> None:

"""
return False
def create_policy(
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> Policy:
if self.framework == FrameworkType.PYTORCH and TorchPolicy is None:
raise UnityTrainerException(
"To use the experimental PyTorch backend, install the PyTorch Python package first."
)
elif self.framework == FrameworkType.PYTORCH:
return self.create_torch_policy(parsed_behavior_id, behavior_spec)
else:
return self.create_tf_policy(
parsed_behavior_id, behavior_spec, create_graph=create_graph
)
@abc.abstractmethod
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
"""
Create a Policy object that uses the PyTorch backend.
"""
pass
@abc.abstractmethod
def create_tf_policy(
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> TFPolicy:
"""
Create a Policy object that uses the TensorFlow backend.
"""
pass
@staticmethod
def create_saver(
framework: str, trainer_settings: TrainerSettings, model_path: str, load: bool
) -> BaseSaver:
if framework == FrameworkType.PYTORCH:
saver = TorchSaver( # type: ignore
trainer_settings, model_path, load
)
else:
saver = TFSaver( # type: ignore
trainer_settings, model_path, load
)
return saver
def _policy_mean_reward(self) -> Optional[float]:
""" Returns the mean episode reward for the current policy. """
rewards = self.cumulative_returns_since_policy_update

logger.warning(
"Trainer has multiple policies, but default behavior only saves the first."
)
elif n_policies == 0:
logger.warning("Trainer has no policies, not saving anything.")
return
# Copy the checkpointed model files to the final output location
final_checkpoint = attr.evolve(
model_checkpoint, file_path=f"{self.saver.model_path}.nn"
)

5
ml-agents/mlagents/trainers/trainer/trainer.py


@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
self,
parsed_behavior_id: BehaviorIdentifiers,
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> Policy:
"""
Creates policy

7
ml-agents/mlagents/trainers/trainer_controller.py


from mlagents.trainers.agent_processor import AgentManager
from mlagents.tf_utils.globals import get_rank
try:
import torch
except ModuleNotFoundError:
torch = None # type: ignore
class TrainerController:
def __init__(

self.kill_trainers = False
np.random.seed(training_seed)
tf.set_random_seed(training_seed)
if torch is not None:
torch.manual_seed(training_seed)
self.rank = get_rank()
@timed

94
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


from typing import Dict, Optional, Tuple, List
import torch
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.torch.utils import ModelUtils
class TorchOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
super().__init__()
self.policy = policy
self.trainer_settings = trainer_settings
self.update_dict: Dict[str, torch.Tensor] = {}
self.value_heads: Dict[str, torch.Tensor] = {}
self.memory_in: torch.Tensor = None
self.memory_out: torch.Tensor = None
self.m_size: int = 0
self.global_step = torch.tensor(0)
self.bc_module: Optional[BCModule] = None
self.create_reward_signals(trainer_settings.reward_signals)
if trainer_settings.behavioral_cloning is not None:
self.bc_module = BCModule(
self.policy,
trainer_settings.behavioral_cloning,
policy_learning_rate=trainer_settings.hyperparameters.learning_rate,
default_batch_size=trainer_settings.hyperparameters.batch_size,
default_num_epoch=3,
)
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later
self.reward_signals[reward_signal.value] = create_reward_provider(
reward_signal, self.policy.behavior_spec, settings
)
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
memory = torch.zeros([1, 1, self.policy.m_size])
vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
)
next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():
value_estimates[name] = estimate.detach().cpu().numpy()
next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy()
if done:
for k in next_value_estimate:
if not self.reward_signals[k].ignore_done:
next_value_estimate[k] = 0.0
return value_estimates, next_value_estimate

281
ml-agents/mlagents/trainers/policy/torch_policy.py


from typing import Any, Dict, List, Tuple, Optional
import numpy as np
import torch
import copy
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents.trainers.policy import Policy
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.utils import ModelUtils
EPSILON = 1e-7 # Small value to avoid divide by zero
class TorchPolicy(Policy):
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
tanh_squash: bool = False,
reparameterize: bool = False,
separate_critic: bool = True,
condition_sigma_on_obs: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_settings: Defined training parameters.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param tanh_squash: Whether to use a tanh function on the continuous output,
or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy
in continuous output.
"""
super().__init__(
seed,
behavior_spec,
trainer_settings,
tanh_squash,
reparameterize,
condition_sigma_on_obs,
)
self.global_step = (
GlobalSteps()
) # could be much simpler if TorchPolicy is nn.Module
self.grads = None
torch.set_default_tensor_type(torch.FloatTensor)
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
if separate_critic:
ac_class = SeparateActorCritic
else:
ac_class = SharedActorCritic
self.actor_critic = ac_class(
observation_shapes=self.behavior_spec.observation_shapes,
network_settings=trainer_settings.network_settings,
act_type=behavior_spec.action_type,
act_size=self.act_size,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
# Save the m_size needed for export
self._export_m_size = self.m_size
# m_size needed for training is determined by network, not trainer settings
self.m_size = self.actor_critic.memory_size
self.actor_critic.to("cpu")
@property
def export_memory_size(self) -> int:
"""
Returns the memory size of the exported ONNX policy. This only includes the memory
of the Actor and not any auxillary networks.
"""
return self._export_m_size
def _split_decision_step(
self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(
1 - np.concatenate(decision_requests.action_mask, axis=1)
)
return vec_vis_obs, mask