Merge branch 'master' into develop-torch-omp

/develop/torch-omp-no-thread
Ruo-Ping Dong, 4 years ago
Current commit
fd1dc3a6
52 changed files, with 419 additions and 423 deletions
  1. 219  .circleci/config.yml
  2. 1  DevProject/.gitignore
  3. 4  com.unity.ml-agents.extensions/.gitignore
  4. 2  com.unity.ml-agents/.gitignore
  5. 9  com.unity.ml-agents/CHANGELOG.md
  6. 2  docs/Background-Machine-Learning.md
  7. 2  ml-agents/mlagents/trainers/ghost/trainer.py
  8. 4  ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
  9. 16  ml-agents/mlagents/trainers/policy/torch_policy.py
  10. 9  ml-agents/mlagents/trainers/ppo/optimizer_torch.py
  11. 19  ml-agents/mlagents/trainers/ppo/trainer.py
  12. 2  ml-agents/mlagents/trainers/ppo/optimizer_tf.py
  13. 18  ml-agents/mlagents/trainers/sac/optimizer_torch.py
  14. 19  ml-agents/mlagents/trainers/sac/trainer.py
  15. 10  ml-agents/mlagents/trainers/tests/test_ppo.py
  16. 4  ml-agents/mlagents/trainers/tests/test_reward_signals.py
  17. 15  ml-agents/mlagents/trainers/tests/test_rl_trainer.py
  18. 9  ml-agents/mlagents/trainers/tests/test_sac.py
  19. 50  ml-agents/mlagents/trainers/tests/test_saver.py
  20. 21  ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py
  21. 9  ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py
  22. 2  ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
  23. 4  ml-agents/mlagents/trainers/tf/models.py
  24. 2  ml-agents/mlagents/trainers/torch/components/bc/module.py
  25. 9  ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py
  26. 9  ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py
  27. 32  ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py
  28. 1  ml-agents/mlagents/trainers/torch/model_serialization.py
  29. 8  ml-agents/mlagents/trainers/torch/utils.py
  30. 24  ml-agents/mlagents/trainers/trainer/rl_trainer.py
  31. 16  ml-agents/mlagents/trainers/trainer_controller.py
  32. 16  ml-agents/mlagents/trainers/model_saver/torch_model_saver.py
  33. 16  ml-agents/mlagents/trainers/model_saver/tf_model_saver.py
  34. 22  ml-agents/mlagents/trainers/model_saver/model_saver.py
  35. 3  ml-agents/setup.py
  36. 2  test_constraints_max_tf2_version.txt
  37. 2  test_constraints_min_version.txt
  38. 3  test_requirements.txt
  39. 38  ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py
  40. 19  .github/workflows/nightly.yml
  41. 41  .github/workflows/pre-commit.yml
  42. 60  .github/workflows/pytest.yml
  43. 69  ml-agents/mlagents/trainers/tests/torch/saver/test_saver_reward_providers.py
  44. 0  /ml-agents/mlagents/trainers/ppo/optimizer_tf.py
  45. 0  /ml-agents/mlagents/trainers/sac/optimizer_tf.py
  46. 0  /ml-agents/mlagents/trainers/model_saver
  47. 0  /ml-agents/mlagents/trainers/model_saver/torch_model_saver.py
  48. 0  /ml-agents/mlagents/trainers/model_saver/tf_model_saver.py
  49. 0  /ml-agents/mlagents/trainers/model_saver/model_saver.py
  50. 0  /ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py

219
.circleci/config.yml


- image: circleci/python:3.8.2
jobs:
build_python:
parameters:
executor:
type: executor
pyversion:
type: string
description: python version to being used (currently only affects caching).
pip_constraints:
type: string
description: Constraints file that is passed to "pip install". We constraint older versions of libraries for older python runtime, in order to help ensure compatibility.
enforce_onnx_conversion:
type: integer
default: 0
description: Whether to raise an exception if ONNX models couldn't be saved.
executor: << parameters.executor >>
working_directory: ~/repo
# Run additional numpy checks on unit tests
environment:
TEST_ENFORCE_NUMPY_FLOAT32: 1
TEST_ENFORCE_ONNX_CONVERSION: << parameters.enforce_onnx_conversion >>
steps:
- checkout
- run:
# Combine all the python dependencies into one file so that we can use that for the cache checksum
name: Combine pip dependencies for caching
command: cat ml-agents/setup.py ml-agents-envs/setup.py gym-unity/setup.py test_requirements.txt << parameters.pip_constraints >> > python_deps.txt
- restore_cache:
keys:
# Parameterize the cache so that different python versions can get different versions of the packages
- v1-dependencies-py<< parameters.pyversion >>-{{ checksum "python_deps.txt" }}
- run:
name: Install Dependencies
command: |
python3 -m venv venv
. venv/bin/activate
pip install --upgrade pip
pip install --upgrade setuptools
pip install --progress-bar=off -e ./ml-agents-envs -c << parameters.pip_constraints >>
pip install --progress-bar=off -e ./ml-agents -c << parameters.pip_constraints >>
pip install --progress-bar=off -r test_requirements.txt -c << parameters.pip_constraints >>
pip install --progress-bar=off -e ./gym-unity -c << parameters.pip_constraints >>
- save_cache:
paths:
- ./venv
key: v1-dependencies-py<< parameters.pyversion >>-{{ checksum "python_deps.txt" }}
- run:
name: Run Tests for ml-agents and gym_unity
# This also dumps the installed pip packages to a file, so we can see what versions are actually being used.
command: |
. venv/bin/activate
mkdir test-reports
pip freeze > test-reports/pip_versions.txt
pytest --cov=ml-agents --cov=ml-agents-envs --cov=gym-unity --cov-report html --junitxml=test-reports/junit.xml -p no:warnings
- run:
name: Verify there are no hidden/missing metafiles.
# Renaming files or deleting files can leave metafiles behind that makes Unity very unhappy.
command: |
. venv/bin/activate
python utils/validate_meta_files.py
- store_test_results:
path: test-reports
- store_artifacts:
path: test-reports
destination: test-reports
- store_artifacts:
path: htmlcov
destination: htmlcov
pre-commit:
docker:
- image: circleci/python:3.7.3
working_directory: ~/repo/
steps:
- checkout
- run:
name: Combine precommit config and python versions for caching
command: |
cat .pre-commit-config.yaml > pre-commit-deps.txt
python -VV >> pre-commit-deps.txt
- restore_cache:
keys:
- v1-precommit-deps-{{ checksum "pre-commit-deps.txt" }}
- run:
name: Install Dependencies
command: |
# Need ruby for search-and-replace
sudo apt-get update
sudo apt-get install ruby-full
# install dotnet and the formatter - see https://docs.microsoft.com/en-us/dotnet/core/install/linux-debian#debian-9-
pushd ~
wget -O - https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.asc.gpg
sudo mv microsoft.asc.gpg /etc/apt/trusted.gpg.d/
wget https://packages.microsoft.com/config/debian/9/prod.list
sudo mv prod.list /etc/apt/sources.list.d/microsoft-prod.list
sudo chown root:root /etc/apt/trusted.gpg.d/microsoft.asc.gpg
sudo chown root:root /etc/apt/sources.list.d/microsoft-prod.list
popd
sudo apt-get install -y apt-transport-https && \
sudo apt-get update && \
sudo apt-get install -y dotnet-sdk-3.1 && \
dotnet tool install -g dotnet-format --version 4.1.131201
echo "Setting up venv"
python3 -m venv venv
. venv/bin/activate
pip install --upgrade pip
pip install --upgrade setuptools
pip install pre-commit
# Install the hooks now so that they'll be cached
pre-commit install-hooks
- save_cache:
paths:
- ~/.cache/pre-commit
- ./venv
key: v1-precommit-deps-{{ checksum "pre-commit-deps.txt" }}
- run:
name: Check Code Style using pre-commit
command: |
. venv/bin/activate
export PATH="$PATH:~/.dotnet/tools"
pre-commit run --show-diff-on-failure --all-files
markdown_link_check:
parameters:
precommit_command:
type: string
description: precommit hook to run
default: markdown-link-check
docker:
- image: circleci/node:12.6.0
working_directory: ~/repo
steps:
- checkout
- restore_cache:
keys:
- v1-node-dependencies-{{ checksum ".pre-commit-config.yaml" }}
# fallback to using the latest cache if no exact match is found
- v1-node-dependencies-
- run:
name: Install Dependencies
command: |
sudo apt-get install python3-venv
python3 -m venv venv
. venv/bin/activate
pip install pre-commit
- run: sudo npm install -g markdown-link-check
- save_cache:
paths:
- ./venv
key: v1-node-dependencies-{{ checksum ".pre-commit-config.yaml" }}
- run:
name: Run markdown-link-check via precommit
command: |
. venv/bin/activate
pre-commit run --hook-stage manual << parameters.precommit_command >> --all-files
deploy:
parameters:
directory:

version: 2
workflow:
jobs:
- build_python:
name: python_3.6.1
executor: python361
pyversion: 3.6.1
# Test python 3.6 with the oldest supported versions
pip_constraints: test_constraints_min_version.txt
- build_python:
name: python_3.7.3
executor: python373
pyversion: 3.7.3
# Test python 3.7 with the newest supported versions
pip_constraints: test_constraints_max_tf1_version.txt
# Make sure ONNX conversion passes here (recent version of tensorflow 1.x)
enforce_onnx_conversion: 1
- build_python:
name: python_3.7.3+tf2
executor: python373
pyversion: 3.7.3
# Test python 3.7 with the newest supported versions
pip_constraints: test_constraints_max_tf2_version.txt
- build_python:
name: python_3.8.2+tf2.2
executor: python382
pyversion: 3.8.2
# Test python 3.8 with the newest edge versions
pip_constraints: test_constraints_max_tf2_version.txt
- markdown_link_check
- pre-commit
# The first deploy jobs are the "real" ones that upload to pypi
- deploy:
name: deploy ml-agents-envs

only: /^release_[0-9]+_test[0-9]+$/
branches:
ignore: /.*/
nightly:
triggers:
- schedule:
cron: "0 0 * * *"
filters:
branches:
only:
- develop
jobs:
- markdown_link_check:
name: markdown-link-check full
precommit_command: markdown-link-check-full

1
DevProject/.gitignore


/Assets/Demonstrations*
/Assets/ML-Agents/Timers*
/csharp_timers.json
/CodeCoverage/
# Environemnt logfile
*Project.log

4
com.unity.ml-agents.extensions/.gitignore


build.sh.meta
build.bat.meta
.idea/
# API Scraper output
*.api
*.api.meta

2
com.unity.ml-agents/.gitignore


# Visual Studio 2015 cache directory
/.vs/
*.api
*.api.meta

9
com.unity.ml-agents/CHANGELOG.md


- Update Barracuda to 1.0.2.
- Enabled C# formatting using `dotnet-format`.
#### ml-agents / ml-agents-envs / gym-unity (Python)
- Experimental PyTorch support has been added. Use `--torch` when running `mlagents-learn`, or add
`framework: pytorch` to your trainer configuration (under the behavior name) to enable it.
Note that PyTorch 1.6.0 or greater should be installed to use this feature; see
[the PyTorch website](https://pytorch.org/) for installation instructions. (#4335)
- The minimum supported version of TensorFlow was increased to 1.14.0. (#4411)
### Bug Fixes
#### com.unity.ml-agents (C#)

- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into two stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
- Experimental PyTorch support has been added. Use `--torch` when running `mlagents-learn`, or add
`framework: pytorch` to your trainer configuration (under the behavior name) to enable it.
Note that PyTorch 1.6.0 or greater should be installed to use this feature; see
[the PyTorch website](https://pytorch.org/) for installation instructions. (#4335)
### Minor Changes
#### com.unity.ml-agents (C#)

2
docs/Background-Machine-Learning.md


[computer Go](https://en.wikipedia.org/wiki/Computer_Go) program, that leverages
deep learning, that was able to beat Lee Sedol (a Go world champion).
A key characteristic of deep learning algorithms is their ability learn very
A key characteristic of deep learning algorithms is their ability to learn very
complex functions from large amounts of training data. This makes them a natural
choice for reinforcement learning tasks when a large amount of data can be
generated, say through the use of a simulator or engine such as Unity. By

2
ml-agents/mlagents/trainers/ghost/trainer.py


policy = self.trainer.create_policy(
parsed_behavior_id, behavior_spec, create_graph=True
)
self.trainer.saver.initialize_or_load(policy)
self.trainer.model_saver.initialize_or_load(policy)
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)

4
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py


)
for name, estimate in value_estimates.items():
value_estimates[name] = estimate.detach().cpu().numpy()
next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy()
value_estimates[name] = ModelUtils.to_numpy(estimate)
next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])
if done:
for k in next_value_estimate:

16
ml-agents/mlagents/trainers/policy/torch_policy.py


also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param behavior_spec: Assigned BehaviorSpec object.
:param trainer_settings: Defined training parameters.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param tanh_squash: Whether to use a tanh function on the continuous output,

action, log_probs, entropy, value_heads, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = action.detach().cpu().numpy()
run_out["pre_action"] = action.detach().cpu().numpy()
run_out["action"] = ModelUtils.to_numpy(action)
run_out["pre_action"] = ModelUtils.to_numpy(action)
run_out["log_probs"] = log_probs.detach().cpu().numpy()
run_out["entropy"] = entropy.detach().cpu().numpy()
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["entropy"] = ModelUtils.to_numpy(entropy)
name: t.detach().cpu().numpy() for name, t in value_heads.items()
name: ModelUtils.to_numpy(t) for name, t in value_heads.items()
run_out["memory_out"] = memories.detach().cpu().numpy().squeeze(0)
run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
return run_out
def get_action(

Decides actions given observations information, and takes them in environment.
:param worker_id:
:param decision_requests: A dictionary of brain names and BrainInfo from environment.
:param decision_requests: A dictionary of behavior names and DecisionSteps from environment.
:return: an ActionInfo containing action, memories, values and an object
to be passed to add experiences
"""

9
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


self.optimizer.step()
update_stats = {
"Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
"Losses/Value Loss": value_loss.detach().cpu().numpy(),
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

return update_stats
def get_modules(self):
return {"Optimizer": self.optimizer}
modules = {"Optimizer": self.optimizer}
for reward_provider in self.reward_signals.values():
modules.update(reward_provider.get_modules())
return modules

19
ml-agents/mlagents/trainers/ppo/trainer.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType

def __init__(
self,
brain_name: str,
behavior_name: str,
reward_buff_cap: int,
trainer_settings: TrainerSettings,
training: bool,

):
"""
Responsible for collecting experiences and training PPO model.
:param brain_name: The name of the brain associated with trainer config
:param behavior_name: The name of the behavior associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.

"""
super().__init__(
brain_name, trainer_settings, training, load, artifact_path, reward_buff_cap
behavior_name,
trainer_settings,
training,
load,
artifact_path,
reward_buff_cap,
)
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters

for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
self.saver.register(self.policy)
self.saver.register(self.optimizer)
self.saver.initialize_or_load()
self.model_saver.register(self.policy)
self.model_saver.register(self.optimizer)
self.model_saver.initialize_or_load()
# Needed to resume loads properly
self.step = policy.get_current_step()

2
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


name="old_probabilities",
)
# Break old log probs into separate branches
# Break old log log_probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)

18
ml-agents/mlagents/trainers/sac/optimizer_torch.py


# Update target network
self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau)
update_stats = {
"Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
"Losses/Value Loss": value_loss.detach().cpu().numpy(),
"Losses/Q1 Loss": q1_loss.detach().cpu().numpy(),
"Losses/Q2 Loss": q2_loss.detach().cpu().numpy(),
"Policy/Entropy Coeff": torch.exp(self._log_ent_coef)
.detach()
.cpu()
.numpy(),
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),
"Losses/Q1 Loss": q1_loss.item(),
"Losses/Q2 Loss": q2_loss.item(),
"Policy/Entropy Coeff": torch.exp(self._log_ent_coef).item(),
"Policy/Learning Rate": decay_lr,
}

return {}
def get_modules(self):
return {
modules = {
"Optimizer:value_network": self.value_network,
"Optimizer:target_network": self.target_network,
"Optimizer:policy_optimizer": self.policy_optimizer,

for reward_provider in self.reward_signals.values():
modules.update(reward_provider.get_modules())
return modules
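
The SAC hunk above moves the per-stat conversions to `.item()` and keeps the `soft_update(self.policy.actor_critic.critic, self.target_network, self.tau)` call that tracks the target critic. The helper itself is not shown in this diff, so the following is only a sketch of the usual Polyak-averaging form such a method takes; the real `soft_update` in the trainer may differ.

```python
import torch


def soft_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target.

    Sketch only; argument order follows the call in the SAC hunk above.
    """
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.mul_(1.0 - tau)
            tgt_param.data.add_(tau * src_param.data)


# Tiny usage check with throwaway networks:
critic = torch.nn.Linear(4, 1)
target = torch.nn.Linear(4, 1)
soft_update(critic, target, tau=0.005)
```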

19
ml-agents/mlagents/trainers/sac/trainer.py


from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

def __init__(
self,
brain_name: str,
behavior_name: str,
reward_buff_cap: int,
trainer_settings: TrainerSettings,
training: bool,

):
"""
Responsible for collecting experiences and training SAC model.
:param brain_name: The name of the brain associated with trainer config
:param behavior_name: The name of the behavior associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.

"""
super().__init__(
brain_name, trainer_settings, training, load, artifact_path, reward_buff_cap
behavior_name,
trainer_settings,
training,
load,
artifact_path,
reward_buff_cap,
)
self.seed = seed

for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
self.saver.register(self.policy)
self.saver.register(self.optimizer)
self.saver.initialize_or_load()
self.model_saver.register(self.policy)
self.model_saver.register(self.optimizer)
self.model_saver.initialize_or_load()
# Needed to resume loads properly
self.step = policy.get_current_step()

10
ml-agents/mlagents/trainers/tests/test_ppo.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb

)
@mock.patch.object(RLTrainer, "create_saver")
@mock.patch.object(RLTrainer, "create_model_saver")
def test_trainer_increment_step(ppo_optimizer, mock_create_saver):
def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
trainer_params = PPO_CONFIG
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}

assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
@mock.patch.object(RLTrainer, "create_saver")
@mock.patch.object(RLTrainer, "create_model_saver")
def test_add_get_policy(ppo_optimizer, mock_create_saver, dummy_config):
def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer

4
ml-agents/mlagents/trainers/tests/test_reward_signals.py


import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
from mlagents.trainers.settings import (
GAILSettings,

15
ml-agents/mlagents/trainers/tests/test_rl_trainer.py


def add_policy(self, mock_behavior_id, mock_policy):
def checkpoint_path(brain_name, step):
return os.path.join(self.saver.model_path, f"{brain_name}-{step}")
return os.path.join(self.model_saver.model_path, f"{brain_name}-{step}")
mock_saver = mock.Mock()
mock_saver.model_path = self.artifact_path
mock_saver.save_checkpoint.side_effect = checkpoint_path
self.saver = mock_saver
mock_model_saver = mock.Mock()
mock_model_saver.model_path = self.artifact_path
mock_model_saver.save_checkpoint.side_effect = checkpoint_path
self.model_saver = mock_model_saver
def create_tf_policy(self, parsed_behavior_id, behavior_spec):
return mock.Mock()

checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
)
calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
trainer.saver.save_checkpoint.assert_has_calls(calls, any_order=True)
trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
export_ext = "nn" if trainer.framework == FrameworkType.TENSORFLOW else "onnx"
add_checkpoint_calls = [

step,
f"{trainer.saver.model_path}/{trainer.brain_name}-{step}.{export_ext}",
f"{trainer.model_saver.model_path}/{trainer.brain_name}-{step}.{export_ext}",
None,
mock.ANY,
),

9
ml-agents/mlagents/trainers/tests/test_sac.py


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb

assert trainer2.update_buffer.num_experiences == buffer_len
@mock.patch.object(RLTrainer, "create_saver")
@mock.patch.object(RLTrainer, "create_model_saver")
def test_add_get_policy(sac_optimizer, mock_create_saver, dummy_config):
def test_add_get_policy(sac_optimizer, mock_create_model_saver, dummy_config):
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer

policy = trainer.create_policy(behavior_id, specs)
policy.get_current_step = lambda: 200
trainer.add_policy(behavior_id, policy)
trainer.saver.initialize_or_load(policy)
trainer.saver.initialize_or_load(policy)
trainer.model_saver.initialize_or_load(policy)
trainer.optimizer.update_reward_signals = mock.Mock()
trainer.optimizer.update_reward_signals.return_value = {}
trainer.optimizer.update.return_value = {}

50
ml-agents/mlagents/trainers/tests/test_saver.py


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.saver.tf_saver import TFSaver
from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
saver = TFSaver(trainer_params, tmp_path)
model_saver = TFModelSaver(trainer_params, tmp_path)
saver.register(opt)
assert saver.policy is None
model_saver.register(opt)
assert model_saver.policy is None
saver.register(policy)
assert saver.policy is not None
model_saver.register(policy)
assert model_saver.policy is not None
class ModelVersionTest(unittest.TestCase):

trainer_params = TrainerSettings()
mock_path = tempfile.mkdtemp()
policy = create_policy_mock(trainer_params)
saver = TFSaver(trainer_params, mock_path)
saver.register(policy)
model_saver = TFModelSaver(trainer_params, mock_path)
model_saver.register(policy)
saver._check_model_version(
model_saver._check_model_version(
saver._check_model_version(__version__) # This should be the right version
model_saver._check_model_version(
__version__
) # This should be the right version
# Assert that no additional warnings have been thrown wth correct ver
assert len(cm.output) == 1

path2 = os.path.join(tmp_path, "runid2")
trainer_params = TrainerSettings()
policy = create_policy_mock(trainer_params)
saver = TFSaver(trainer_params, path1)
saver.register(policy)
saver.initialize_or_load(policy)
model_saver = TFModelSaver(trainer_params, path1)
model_saver.register(policy)
model_saver.initialize_or_load(policy)
saver.save_checkpoint(mock_brain_name, 2000)
model_saver.save_checkpoint(mock_brain_name, 2000)
saver = TFSaver(trainer_params, path1, load=True)
model_saver = TFModelSaver(trainer_params, path1, load=True)
saver.register(policy2)
saver.initialize_or_load(policy2)
model_saver.register(policy2)
model_saver.initialize_or_load(policy2)
saver = TFSaver(trainer_params, path2)
model_saver = TFModelSaver(trainer_params, path2)
saver.register(policy3)
saver.initialize_or_load(policy3)
model_saver.register(policy3)
model_saver.initialize_or_load(policy3)
_compare_two_policies(policy2, policy3)
# Assert that the steps are 0.

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
trainer_params = TrainerSettings()
saver = TFSaver(trainer_params, model_path)
saver.register(policy)
saver.save_checkpoint("Mock_Brain", 100)
model_saver = TFModelSaver(trainer_params, model_path)
model_saver.register(policy)
model_saver.save_checkpoint("Mock_Brain", 100)
assert os.path.isfile(model_path + "/Mock_Brain-100.nn")

21
ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py


import pytest
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
import os
from mlagents.trainers.policy.torch_policy import TorchPolicy

default_num_epoch=3,
)
return bc_module
def assert_stats_are_float(stats):
for _, item in stats.items():
assert isinstance(item, float)
# Test default values

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
assert_stats_are_float(stats)
# Test with constant pretraining learning rate

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
assert_stats_are_float(stats)
old_learning_rate = bc_module.current_lr
_ = bc_module.update()

)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
assert_stats_are_float(stats)
# Test with discrete control and visual observations

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
assert_stats_are_float(stats)
# Test with discrete control, visual observations and RNN

)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
assert_stats_are_float(stats)
if __name__ == "__main__":

9
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py


from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,
)
from mlagents.trainers.torch.utils import ModelUtils
SEED = [42]

buffer = create_agent_buffer(behavior_spec, 5)
for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0].detach()
target = buffer["actions"][0]
error = float(torch.mean((prediction - target) ** 2))
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["actions"][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_next_state(buffer)[0]
target = curiosity_rp._network.get_next_state(buffer)[0]
error = float(torch.mean((prediction - target) ** 2).detach())
error = float(ModelUtils.to_numpy(torch.mean((prediction - target) ** 2)))
assert error < 0.001

2
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
buffer["done"].append(np.zeros(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)
return buffer

4
ml-agents/mlagents/trainers/tf/models.py


:param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
:param action_size: A list containing the number of possible actions for each branch
:return: The action output dimension [batch_size, num_branches], the concatenated
normalized probs (after softmax)
and the concatenated normalized log probs
normalized log_probs (after softmax)
and the concatenated normalized log log_probs
"""
branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
raw_probs = [

2
ml-agents/mlagents/trainers/torch/components/bc/module.py


bc_loss.backward()
self.optimizer.step()
run_out = {"loss": bc_loss.detach().cpu().numpy()}
run_out = {"loss": bc_loss.item()}
return run_out

9
ml-agents/mlagents/trainers/torch/components/reward_providers/base_reward_provider.py


import numpy as np
import torch
from abc import ABC, abstractmethod
from typing import Dict

raise NotImplementedError(
"The reward provider's update method has not been implemented "
)
def get_modules(self) -> Dict[str, torch.nn.Module]:
"""
Returns a dictionary of string identifiers to the torch.nn.Modules used by
the reward providers. This method is used for loading and saving the weights
of the reward providers.
"""
return {}
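
The docstring above states the contract: `get_modules()` maps string identifiers to the `torch.nn.Module`s a reward provider owns, so the model saver can checkpoint them. As a minimal, hypothetical illustration (the class and attribute names below are placeholders, not part of this diff), a provider with a learned network would follow the same `Module:<name>` convention that the curiosity and GAIL providers use:

```python
from typing import Dict

import torch


class ExampleRewardProvider:  # placeholder; stands in for a BaseRewardProvider subclass
    def __init__(self) -> None:
        self.name = "example"
        # Placeholder network standing in for a real learned reward model.
        self._network = torch.nn.Linear(8, 1)

    def get_modules(self) -> Dict[str, torch.nn.Module]:
        # Same "Module:<name>" convention as the curiosity and GAIL providers
        # in this diff, so the ModelSaver can save and load the weights.
        return {f"Module:{self.name}": self._network}


modules = ExampleRewardProvider().get_modules()
assert set(modules) == {"Module:example"}
```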

9
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
with torch.no_grad():
rewards = self._network.compute_reward(mini_batch).detach().cpu().numpy()
rewards = ModelUtils.to_numpy(self._network.compute_reward(mini_batch))
rewards = np.minimum(rewards, 1.0 / self.strength)
return rewards * self._has_updated_once

loss.backward()
self.optimizer.step()
return {
"Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(),
"Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy(),
"Losses/Curiosity Forward Loss": forward_loss.item(),
"Losses/Curiosity Inverse Loss": inverse_loss.item(),
def get_modules(self):
return {f"Module:{self.name}": self._network}
class CuriosityNetwork(torch.nn.Module):

32
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


estimates, _ = self._discriminator_network.compute_estimate(
mini_batch, use_vail_noise=False
)
return (
return ModelUtils.to_numpy(
.detach()
.cpu()
.numpy()
)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:

loss.backward()
self.optimizer.step()
return stats_dict
def get_modules(self):
return {f"Module:{self.name}": self._discriminator_network}
class DiscriminatorNetwork(torch.nn.Module):

encoder_input = self.get_state_encoding(mini_batch)
if self._settings.use_actions:
actions = self.get_action_input(mini_batch)
dones = torch.as_tensor(mini_batch["done"], dtype=torch.float)
dones = torch.as_tensor(mini_batch["done"], dtype=torch.float).unsqueeze(1)
encoder_input = torch.cat([encoder_input, actions, dones], dim=1)
hidden = self.encoder(encoder_input)
z_mu: Optional[torch.Tensor] = None

expert_estimate, expert_mu = self.compute_estimate(
expert_batch, use_vail_noise=True
)
stats_dict["Policy/GAIL Policy Estimate"] = (
policy_estimate.mean().detach().cpu().numpy()
)
stats_dict["Policy/GAIL Expert Estimate"] = (
expert_estimate.mean().detach().cpu().numpy()
)
stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item()
stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item()
stats_dict["Losses/GAIL Loss"] = discriminator_loss.detach().cpu().numpy()
stats_dict["Losses/GAIL Loss"] = discriminator_loss.item()
total_loss += discriminator_loss
if self._settings.use_vail:
# KL divergence loss (encourage latent representation to be normal)

torch.tensor(0.0),
)
total_loss += vail_loss
stats_dict["Policy/GAIL Beta"] = self._beta.detach().cpu().numpy()
stats_dict["Losses/GAIL KL Loss"] = kl_loss.detach().cpu().numpy()
stats_dict["Policy/GAIL Beta"] = self._beta.item()
stats_dict["Losses/GAIL KL Loss"] = kl_loss.item()
if self.gradient_penalty_weight > 0.0:
total_loss += (
self.gradient_penalty_weight

policy_action = self.get_action_input(policy_batch)
expert_action = self.get_action_input(policy_batch)
action_epsilon = torch.rand(policy_action.shape)
policy_dones = torch.as_tensor(policy_batch["done"], dtype=torch.float)
expert_dones = torch.as_tensor(expert_batch["done"], dtype=torch.float)
policy_dones = torch.as_tensor(
policy_batch["done"], dtype=torch.float
).unsqueeze(1)
expert_dones = torch.as_tensor(
expert_batch["done"], dtype=torch.float
).unsqueeze(1)
dones_epsilon = torch.rand(policy_dones.shape)
encoder_input = torch.cat(
[
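
The GAIL hunk above adds `.unsqueeze(1)` to the `done` tensors before concatenation. A tiny illustration with made-up shapes shows why: `torch.cat(..., dim=1)` needs every operand to be 2-D `[batch, features]`, while the raw done flags arrive as a 1-D batch vector.

```python
import torch

batch_size = 4
encoder_input = torch.randn(batch_size, 16)  # stand-in state encoding
actions = torch.randn(batch_size, 2)         # stand-in action input
dones = torch.as_tensor([0.0, 0.0, 1.0, 0.0], dtype=torch.float)  # shape [4]

dones = dones.unsqueeze(1)                   # shape [4, 1], now concatenable on dim=1
joined = torch.cat([encoder_input, actions, dones], dim=1)
assert joined.shape == (batch_size, 16 + 2 + 1)
```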

1
ml-agents/mlagents/trainers/torch/model_serialization.py


Exports a Torch model for a Policy to .onnx format for Unity embedding.
:param output_filepath: file path to output the model (without file suffix)
:param brain_name: Brain name of brain to be trained
"""
if not os.path.exists(output_filepath):
os.makedirs(output_filepath)

8
ml-agents/mlagents/trainers/torch/utils.py


return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
@staticmethod
def to_numpy(tensor: torch.Tensor) -> np.ndarray:
"""
Converts a Torch Tensor to a numpy array. If the Tensor is on the GPU, it will
be brought to the CPU.
"""
return tensor.detach().cpu().numpy()
@staticmethod
def break_into_branches(
concatenated_logits: torch.Tensor, action_size: List[int]
) -> List[torch.Tensor]:
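
This `ModelUtils.to_numpy` helper is what most other hunks in this commit swap in for repeated `.detach().cpu().numpy()` chains, while scalar losses move to `.item()`. A small self-contained sketch of the two conversion paths, with made-up tensors, looks like this:

```python
import numpy as np
import torch


def to_numpy(tensor: torch.Tensor) -> np.ndarray:
    """Same body as ModelUtils.to_numpy above: detach, move to CPU, convert."""
    return tensor.detach().cpu().numpy()


# The two patterns this changeset standardizes on (values here are made up):
loss = torch.tensor(0.25, requires_grad=True) * 2.0  # scalar with grad history
log_probs = torch.randn(4, 3, requires_grad=True)     # batched tensor

stats = {"Losses/Policy Loss": loss.item()}    # scalars -> plain Python float
run_out = {"log_probs": to_numpy(log_probs)}   # arrays  -> numpy via the helper

assert isinstance(stats["Losses/Policy Loss"], float)
assert isinstance(run_out["log_probs"], np.ndarray)
```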

24
ml-agents/mlagents/trainers/trainer/rl_trainer.py


from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.settings import TrainerSettings, FrameworkType
from mlagents.trainers.stats import StatsPropertyType
from mlagents.trainers.saver.saver import BaseSaver
from mlagents.trainers.saver.tf_saver import TFSaver
from mlagents.trainers.model_saver.model_saver import BaseModelSaver
from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
from mlagents.trainers.saver.torch_saver import TorchSaver
from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver
except ModuleNotFoundError:
TorchPolicy = None # type: ignore

self._next_save_step = 0
self._next_summary_step = 0
self.saver = self.create_saver(
self.model_saver = self.create_model_saver(
self.framework, self.trainer_settings, self.artifact_path, self.load
)

pass
@staticmethod
def create_saver(
def create_model_saver(
) -> BaseSaver:
) -> BaseModelSaver:
saver = TorchSaver( # type: ignore
model_saver = TorchModelSaver( # type: ignore
saver = TFSaver( # type: ignore
model_saver = TFModelSaver( # type: ignore
return saver
return model_saver
def _policy_mean_reward(self) -> Optional[float]:
""" Returns the mean episode reward for the current policy. """

logger.warning(
"Trainer has multiple policies, but default behavior only saves the first."
)
checkpoint_path = self.saver.save_checkpoint(self.brain_name, self.step)
checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self.step)
export_ext = "nn" if self.framework == FrameworkType.TENSORFLOW else "onnx"
new_checkpoint = NNCheckpoint(
int(self.step),

return
model_checkpoint = self._checkpoint()
self.saver.copy_final_model(model_checkpoint.file_path)
self.model_saver.copy_final_model(model_checkpoint.file_path)
model_checkpoint, file_path=f"{self.saver.model_path}.{export_ext}"
model_checkpoint, file_path=f"{self.model_saver.model_path}.{export_ext}"
)
NNCheckpointManager.track_final_checkpoint(self.brain_name, final_checkpoint)
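
The `create_saver` → `create_model_saver` rename above also renames the objects the factory returns. Since the fragment only shows alternating old/new lines, the following reconstruction of the dispatch is a sketch: it assumes a `FrameworkType.PYTORCH` member alongside the `FrameworkType.TENSORFLOW` one that appears in the hunks, and a `(trainer_settings, model_path, load)` constructor, neither of which is spelled out in this diff.

```python
from mlagents.trainers.settings import FrameworkType, TrainerSettings
from mlagents.trainers.model_saver.model_saver import BaseModelSaver
from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver


def create_model_saver(
    framework: FrameworkType,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool,
) -> BaseModelSaver:
    # Pick the backend-specific ModelSaver; mirrors the alternating old/new
    # lines in the rl_trainer.py hunk, with assumed constructor arguments.
    if framework == FrameworkType.PYTORCH:
        model_saver: BaseModelSaver = TorchModelSaver(trainer_settings, model_path, load=load)
    else:
        model_saver = TFModelSaver(trainer_settings, model_path, load=load)
    return model_saver
```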

16
ml-agents/mlagents/trainers/trainer_controller.py


self.trainers[brain_name].save_model()
self.logger.info("Saved Model")
def _save_model_when_interrupted(self):
self.logger.info(
"Learning was interrupted. Please wait while the graph is generated."
)
self._save_models()
def _export_graph(self):
"""
Saves models for all trainers.
"""
if self.rank is not None and self.rank != 0:
return
for brain_name in self.trainers.keys():
self.trainers[brain_name].save_model()
@staticmethod
def _create_output_path(output_path):
try:

16
ml-agents/mlagents/trainers/model_saver/torch_model_saver.py


from typing import Dict, Union, Optional, cast
from mlagents_envs.exception import UnityPolicyException
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.saver.saver import BaseSaver
from mlagents.trainers.model_saver.model_saver import BaseModelSaver
from mlagents.trainers.settings import TrainerSettings, SerializationSettings
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer

logger = get_logger(__name__)
class TorchSaver(BaseSaver):
class TorchModelSaver(BaseModelSaver):
Saver class for PyTorch
ModelSaver class for PyTorch
"""
def __init__(

self.modules.update(module.get_modules()) # type: ignore
else:
raise UnityPolicyException(
"Registering Object of unsupported type {} to Saver ".format(
"Registering Object of unsupported type {} to ModelSaver ".format(
type(module)
)
)

def save_checkpoint(self, brain_name: str, step: int) -> str:
def save_checkpoint(self, behavior_name: str, step: int) -> str:
checkpoint_path = os.path.join(self.model_path, f"{brain_name}-{step}")
checkpoint_path = os.path.join(self.model_path, f"{behavior_name}-{step}")
self.export(checkpoint_path, brain_name)
self.export(checkpoint_path, behavior_name)
def export(self, output_filepath: str, brain_name: str) -> None:
def export(self, output_filepath: str, behavior_name: str) -> None:
if self.exporter is not None:
self.exporter.export_policy_model(output_filepath)

16
ml-agents/mlagents/trainers/model_saver/tf_model_saver.py


from mlagents_envs.exception import UnityPolicyException
from mlagents_envs.logging_util import get_logger
from mlagents.tf_utils import tf
from mlagents.trainers.saver.saver import BaseSaver
from mlagents.trainers.model_saver.model_saver import BaseModelSaver
from mlagents.trainers.tf.model_serialization import export_policy_model
from mlagents.trainers.settings import TrainerSettings, SerializationSettings
from mlagents.trainers.policy.tf_policy import TFPolicy

logger = get_logger(__name__)
class TFSaver(BaseSaver):
class TFModelSaver(BaseModelSaver):
Saver class for TensorFlow
ModelSaver class for TensorFlow
"""
def __init__(

with self.policy.graph.as_default():
self.tf_saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
def save_checkpoint(self, brain_name: str, step: int) -> str:
checkpoint_path = os.path.join(self.model_path, f"{brain_name}-{step}")
def save_checkpoint(self, behavior_name: str, step: int) -> str:
checkpoint_path = os.path.join(self.model_path, f"{behavior_name}-{step}")
# Save the TF checkpoint and graph definition
if self.graph:
with self.graph.as_default():

self.graph, self.model_path, "raw_graph_def.pb", as_text=False
)
# also save the policy so we have optimized model files for each checkpoint
self.export(checkpoint_path, brain_name)
self.export(checkpoint_path, behavior_name)
def export(self, output_filepath: str, brain_name: str) -> None:
def export(self, output_filepath: str, behavior_name: str) -> None:
self.model_path, output_filepath, brain_name, self.graph, self.sess
self.model_path, output_filepath, behavior_name, self.graph, self.sess
)
def initialize_or_load(self, policy: Optional[TFPolicy] = None) -> None:

22
ml-agents/mlagents/trainers/model_saver/model_saver.py


from typing import Any
class BaseSaver(abc.ABC):
"""This class is the base class for the Saver"""
class BaseModelSaver(abc.ABC):
"""This class is the base class for the ModelSaver"""
def __init__(self):
pass

"""
Register the modules to the Saver.
The Saver will store the module and include it in the saved files
Register the modules to the ModelSaver.
The ModelSaver will store the module and include it in the saved files
when saving checkpoint/exporting graph.
:param module: the module to be registered
"""

"""
Helper function for registering policy to the Saver.
Helper function for registering policy to the ModelSaver.
:param policy: the policy to be registered
"""
pass

Helper function for registering optimizer to the Saver.
Helper function for registering optimizer to the ModelSaver.
def save_checkpoint(self, brain_name: str, step: int) -> str:
def save_checkpoint(self, behavior_name: str, step: int) -> str:
:param brain_name: Brain name of brain to be trained
:param behavior_name: Behavior name of behavior to be trained
def export(self, output_filepath: str, brain_name: str) -> None:
def export(self, output_filepath: str, behavior_name: str) -> None:
Saves the serialized model, given a path and brain name.
Saves the serialized model, given a path and behavior name.
:param brain_name: Brain name of brain to be trained.
:param behavior_name: Behavior name of behavior to be trained.
"""
pass
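
Taken together with the PPO and SAC trainer hunks earlier in this diff, the `BaseModelSaver` API is used in a register / initialize-or-load / save-checkpoint sequence. The sketch below wires that sequence with placeholder `policy` and `optimizer` arguments and an assumed `(trainer_settings, model_path, load)` constructor; it is illustrative, not the trainers' literal code.

```python
from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver
from mlagents.trainers.settings import TrainerSettings


def wire_model_saver(
    policy,            # placeholder for a TorchPolicy built elsewhere
    optimizer,         # placeholder for a TorchOptimizer built elsewhere
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool,
    behavior_name: str,
    step: int,
) -> str:
    """Register / initialize-or-load / save-checkpoint sequence, as in the
    PPO and SAC trainer hunks above (constructor signature assumed)."""
    model_saver = TorchModelSaver(trainer_settings, model_path, load=load)
    # Register everything whose get_modules() output should be checkpointed;
    # the optimizer's get_modules() also pulls in its reward-provider networks.
    model_saver.register(policy)
    model_saver.register(optimizer)
    model_saver.initialize_or_load()
    # At each checkpoint interval the trainer then calls:
    return model_saver.save_checkpoint(behavior_name, step)
```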

3
ml-agents/setup.py


"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",
"tensorflow>=1.7,<3.0",
"tensorflow>=1.14,<3.0",
"cattrs>=1.0.0",
"attrs>=19.3.0",
'pypiwin32==223;platform_system=="Windows"',

]
},
cmdclass={"verify": VerifyVersionCommand},
extras_require={"torch": ["torch>=1.5.0"]},
)

2
test_constraints_max_tf2_version.txt


# For projects with upper bounds, we should periodically update this list to the latest release version
grpcio>=1.23.0
numpy>=1.17.2
tensorflow==2.2.0rc3
tensorflow==2.3.0
h5py>=2.10.0

2
test_constraints_min_version.txt


numpy==1.14.1
Pillow==4.2.1
protobuf==3.6
tensorflow==1.7.0
tensorflow==1.14.0
h5py==2.9.0

3
test_requirements.txt


# PyTorch tests are here for the time being, before they are used in the codebase.
torch>=1.5.0
# onnx doesn't currently have a wheel for 3.8
tf2onnx>=1.5.5;python_version<'3.8'
tf2onnx>=1.5.5

38
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py