
Refactoring of the tests folder for the trainers (#4510)

* Refactoring of the tests folder for the trainers

* Fixing issues

* Fixing issues

* Fixing issues
/MLA-1734-demo-provider
GitHub · 4 years ago
Current commit: e471bd8b
29 files changed, with 1090 additions and 1043 deletions
  1. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (10 changes)
  2. ml-agents/mlagents/trainers/tests/test_trainer_util.py (6 changes)
  3. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (12 changes)
  4. ml-agents/mlagents/trainers/tests/torch/test_sac.py (10 changes)
  5. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (251 changes)
  6. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (21 changes)
  7. ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (10 changes)
  8. ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (2 changes)
  9. ml-agents/mlagents/trainers/tests/check_env_trains.py (94 changes)
  10. ml-agents/mlagents/trainers/tests/dummy_config.py (72 changes)
  11. ml-agents/mlagents/trainers/tests/torch/__init__.py (0 changes)
  12. ml-agents/mlagents/trainers/tests/tensorflow/__init__.py (0 changes)
  13. ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py (118 changes)
  14. ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py (176 changes)
  15. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (520 changes)
  16. ml-agents/mlagents/trainers/tests/test_bcmodule.py (129 changes)
  17. ml-agents/mlagents/trainers/tests/test_reward_signals.py (197 changes)
  18. ml-agents/mlagents/trainers/tests/test_simple_rl.py (505 changes)
  19. /ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb (0 changes)
  20. /ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py (0 changes)
  21. /ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (0 changes)
  22. /ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (0 changes)
  23. /ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (0 changes)
  24. /ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py (0 changes)
  25. /ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py (0 changes)
  26. /ml-agents/mlagents/trainers/tests/tensorflow/test_models.py (0 changes)
  27. /ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py (0 changes)
  28. /ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (0 changes)

ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (10 changes)


from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests.test_simple_rl import (
_check_environment_trains,
PPO_CONFIG,
from mlagents.trainers.tests.check_env_trains import (
check_environment_trains,
from mlagents.trainers.tests.dummy_config import ppo_dummy_config
def mock_env_factory(worker_id):

simple_env_factory, EngineConfig.default_config(), num_envs
)
# Run PPO using env_manager
_check_environment_trains(
check_environment_trains(
{"1D": PPO_CONFIG},
{"1D": ppo_dummy_config()},
env_manager=env_manager,
success_threshold=None,
)
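
For context, a minimal sketch of how the refactored test drives PPO through check_environment_trains with a SubprocessEnvManager. Only the new imports and the keyword arguments are taken from the hunk above; the factory body, the EngineConfig import path, and the test name are assumptions, not code from the PR.

from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig  # import path assumed
from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents.trainers.tests.check_env_trains import check_environment_trains
from mlagents.trainers.tests.dummy_config import ppo_dummy_config
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment


def simple_env_factory(worker_id, config):
    # Stand-in factory: each subprocess worker gets its own toy 1D environment.
    return SimpleEnvironment(["1D"], use_discrete=True)


def test_subprocess_env_manager_trains(num_envs=2):
    env_manager = SubprocessEnvManager(
        simple_env_factory, EngineConfig.default_config(), num_envs
    )
    # Run PPO using env_manager; success_threshold=None only checks that
    # training runs end to end, not that it reaches a target reward.
    check_environment_trains(
        simple_env_factory(0, []),
        {"1D": ppo_dummy_config()},
        env_manager=env_manager,
        success_threshold=None,
    )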

ml-agents/mlagents/trainers/tests/test_trainer_util.py (6 changes)


from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.exception import TrainerConfigError, UnityTrainerException
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
from mlagents.trainers.tests.dummy_config import ppo_dummy_config
return RunOptions(behaviors={"testbrain": PPO_CONFIG})
return RunOptions(behaviors={"testbrain": ppo_dummy_config()})
@patch("mlagents_envs.base_env.BehaviorSpec")

expected_reward_buff_cap = 1
base_config = dummy_config.behaviors
expected_config = PPO_CONFIG
expected_config = ppo_dummy_config()
def mock_constructor(
self,

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (12 changes)


import numpy as np
from mlagents.tf_utils import tf
import copy
import attr
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer

from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
ppo_dummy_config,
curiosity_dummy_config,
gail_dummy_config,
)

def dummy_config():
return copy.deepcopy(PPO_CONFIG)
return attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
VECTOR_ACTION_SPACE = 2

def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
dummy_config.reward_signals = gail_dummy_config
config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
config, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(

ml-agents/mlagents/trainers/tests/torch/test_sac.py (10 changes)


import pytest
import copy
import attr
from mlagents.trainers.tests.torch.test_simple_rl import SAC_CONFIG
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
sac_dummy_config,
curiosity_dummy_config,
)

return copy.deepcopy(SAC_CONFIG)
return attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
VECTOR_ACTION_SPACE = 2

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (251 changes)


import math
import tempfile
import numpy as np
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,

from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
TrainerSettings,
PPOSettings,
SACSettings,
TrainerType,
ScheduleType,
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)

BRAIN_NAME = "1D"
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
framework=FrameworkType.PYTORCH,
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import (
check_environment_trains,
default_reward_processor,
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
framework=FrameworkType.PYTORCH,
)
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print(f"Last {last_n_rewards} rewards:", rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
def get_last_rewards(self):
return self._last_reward_summary
BRAIN_NAME = "1D"
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self._last_reward_summary[category] = stats_summary.mean
def _check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
env_parameter_manager=None,
success_threshold=0.9,
env_manager=None,
):
if env_parameter_manager is None:
env_parameter_manager = EnvironmentParameterManager()
# Create controller and begin training.
with tempfile.TemporaryDirectory() as dir:
run_id = "id"
seed = 1337
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=dir,
train_model=True,
load_model=False,
seed=seed,
param_manager=env_parameter_manager,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=dir,
run_id=run_id,
param_manager=env_parameter_manager,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
new_hyperparams = attr.evolve(
PPO_CONFIG.hyperparameters, batch_size=64, buffer_size=640
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config})
config = attr.evolve(
PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
PPO_CONFIG,
PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=700,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])

PPO_CONFIG.network_settings,
PPO_TORCH_CONFIG.network_settings,
PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
PPO_TORCH_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=128,
PPO_CONFIG,
PPO_TORCH_CONFIG,
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
config = attr.evolve(SAC_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
config = attr.evolve(SAC_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

)
new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])

step_size=0.2,
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])

vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
SAC_CONFIG.hyperparameters,
SAC_TORCH_CONFIG.hyperparameters,
SAC_CONFIG,
SAC_TORCH_CONFIG,
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])

[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings,
SAC_TORCH_CONFIG.network_settings,
SAC_CONFIG.hyperparameters,
SAC_TORCH_CONFIG.hyperparameters,
batch_size=128,
learning_rate=1e-3,
buffer_init_steps=1000,

SAC_CONFIG,
SAC_TORCH_CONFIG,
_check_environment_trains(env, {BRAIN_NAME: config})
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config})
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])

self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]

swap_steps=10000,
team_change=400,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])

swap_steps=5000,
team_change=2000,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=3000)
_check_environment_trains(
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [

@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
@pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)

behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4)
hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=3e-4)
PPO_CONFIG,
PPO_TORCH_CONFIG,
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])

RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
SAC_CONFIG,
SAC_TORCH_CONFIG,
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (21 changes)


import numpy as np
from mlagents.tf_utils import tf
import copy
import attr
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
ppo_dummy_config,
return copy.deepcopy(PPO_CONFIG)
return attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
VECTOR_ACTION_SPACE = 2

vector_obs_space=VECTOR_OBS_SPACE,
)
trainer_settings = attr.evolve(dummy_config)
trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn

tf.reset_default_graph()
dummy_config.reward_signals = gail_dummy_config
optimizer = _create_ppo_optimizer_ops_mock(
PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
use_rnn=False,
use_discrete=False,
use_visual=False,
)
# Test update
update_buffer = mb.simulate_rollout(

@mock.patch.object(RLTrainer, "create_model_saver")
@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
trainer_params = PPO_CONFIG
trainer_params = attr.evolve(
attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
framework=FrameworkType.TENSORFLOW,
)
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer

ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (10 changes)


import pytest
from unittest import mock
import copy
import attr
from mlagents.tf_utils import tf
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.tests.test_simple_rl import SAC_CONFIG
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
sac_dummy_config,
return copy.deepcopy(SAC_CONFIG)
return attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)
VECTOR_ACTION_SPACE = 2

ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (2 changes)


from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_nn_policy import create_policy_mock
from mlagents.trainers.tests.tensorflow.test_nn_policy import create_policy_mock
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer

ml-agents/mlagents/trainers/tests/check_env_trains.py (94 changes)


import math
import tempfile
import numpy as np
from typing import Dict
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
def get_last_rewards(self):
return self._last_reward_summary
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self._last_reward_summary[category] = stats_summary.mean
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print(f"Last {last_n_rewards} rewards:", rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
def check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
env_parameter_manager=None,
success_threshold=0.9,
env_manager=None,
):
if env_parameter_manager is None:
env_parameter_manager = EnvironmentParameterManager()
# Create controller and begin training.
with tempfile.TemporaryDirectory() as dir:
run_id = "id"
seed = 1337
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=dir,
train_model=True,
load_model=False,
seed=seed,
param_manager=env_parameter_manager,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=dir,
run_id=run_id,
param_manager=env_parameter_manager,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
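
The comment block above spells out the reward-processor contract of this new helper. Here is a short, hypothetical sketch of a test that supplies its own processor instead of the default mean; the median-based processor and the test name are illustrative, not part of the PR.

import numpy as np

from mlagents.trainers.tests.check_env_trains import check_environment_trains
from mlagents.trainers.tests.dummy_config import ppo_dummy_config
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment


def median_reward_processor(rewards, last_n_rewards=5):
    # Custom processor, built inside the test module and handed to
    # check_environment_trains in place of default_reward_processor.
    return float(np.median(np.array(rewards[-last_n_rewards:], dtype=np.float32)))


def test_simple_ppo_median_rewards():
    env = SimpleEnvironment(["1D"], use_discrete=False)
    check_environment_trains(
        env,
        {"1D": ppo_dummy_config()},
        reward_processor=median_reward_processor,
        success_threshold=0.9,
    )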

ml-agents/mlagents/trainers/tests/dummy_config.py (72 changes)


import pytest
import copy
import os
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
SACSettings,
GAILSettings,
CuriositySettings,
RewardSignalSettings,
NetworkSettings,
TrainerType,
RewardSignalType,
ScheduleType,
)
CONTINUOUS_DEMO_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_DEMO_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
_PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
_SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
def ppo_dummy_config():
return copy.deepcopy(_PPO_CONFIG)
def sac_dummy_config():
return copy.deepcopy(_SAC_CONFIG)
@pytest.fixture
def gail_dummy_config():
return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_DEMO_PATH)}
@pytest.fixture
def curiosity_dummy_config():
return {RewardSignalType.CURIOSITY: CuriositySettings()}
@pytest.fixture
def extrinsic_dummy_config():
return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
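
A short sketch of how the test modules in this PR consume these factories; it restates the pattern already visible in the torch and TensorFlow test_simple_rl.py hunks rather than introducing new API.

import attr

from mlagents.trainers.settings import FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config

# Each call returns a fresh deep copy, so a test can mutate or attr.evolve the
# settings without leaking changes into other tests.
PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)

# The hyperparameters are attrs classes too, so individual fields can be overridden:
new_hyperparams = attr.evolve(
    PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)
config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)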

ml-agents/mlagents/trainers/tests/torch/__init__.py (0 changes)

ml-agents/mlagents/trainers/tests/tensorflow/__init__.py (0 changes)

ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py (118 changes)


import pytest
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
TrainerSettings,
BehavioralCloningSettings,
NetworkSettings,
)
from mlagents.trainers.tests.dummy_config import (
DISCRETE_DEMO_PATH,
CONTINUOUS_DEMO_PATH,
)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
# model_path = env.external_brain_names[0]
trainer_config = TrainerSettings()
trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = TFPolicy(
0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
)
with policy.graph.as_default():
bc_module = BCModule(
policy,
policy_learning_rate=trainer_config.hyperparameters.learning_rate,
default_batch_size=trainer_config.hyperparameters.batch_size,
default_num_epoch=3,
settings=bc_settings,
)
policy.initialize() # Normally the optimizer calls this after the BCModule is created
return bc_module
# Test default values
def test_bcmodule_defaults():
# See if default values match
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 3
assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
# Assign strange values and see if it overrides properly
bc_settings = BehavioralCloningSettings(
demo_path=CONTINUOUS_DEMO_PATH, num_epoch=100, batch_size=10000
)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 100
assert bc_module.batch_size == 10000
# Test with continuous control env and vector actions
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with constant pretraining learning rate
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH, steps=0)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
old_learning_rate = bc_module.current_lr
_ = bc_module.update()
assert old_learning_rate == bc_module.current_lr
# Test with RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with discrete control and visual observations
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
mock_specs = mb.create_mock_banana_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with discrete control, visual observations and RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
mock_specs = mb.create_mock_banana_behavior_specs()
bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
if __name__ == "__main__":
pytest.main()

ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py (176 changes)


import pytest
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
ppo_dummy_config,
sac_dummy_config,
gail_dummy_config,
curiosity_dummy_config,
extrinsic_dummy_config,
DISCRETE_DEMO_PATH,
CONTINUOUS_DEMO_PATH,
)
from mlagents.trainers.settings import (
GAILSettings,
BehavioralCloningSettings,
NetworkSettings,
TrainerType,
RewardSignalType,
)
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
BATCH_SIZE = 12
NUM_AGENTS = 12
def create_optimizer_mock(
trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
mock_specs = mb.setup_test_behavior_specs(
use_discrete,
use_visual,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
)
trainer_settings = trainer_config
trainer_settings.reward_signals = reward_signal_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)
else:
optimizer = PPOOptimizer(policy, trainer_settings)
optimizer.policy.initialize()
return optimizer
def reward_signal_eval(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
# Test evaluate
rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)
def reward_signal_update(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
optimizer.policy, buffer.make_mini_batch(0, 10), 2
)
out = optimizer.policy._execute_model(
feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
)
assert type(out) is dict
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config): # noqa: F811
trainer_config.behavioral_cloning = BehavioralCloningSettings(
demo_path=CONTINUOUS_DEMO_PATH
)
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config, False, False, False
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config): # noqa: F811
gail_dummy_config_discrete = {
RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_DEMO_PATH)
}
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config_discrete, False, True, True
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, gail_dummy_config, True, False, False
)
reward_signal_eval(policy, "gail")
reward_signal_update(policy, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_cc(trainer_config, curiosity_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, False, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_dc(trainer_config, curiosity_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, True, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_visual(trainer_config, curiosity_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, False, True
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_rnn(trainer_config, curiosity_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, True, False, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, extrinsic_dummy_config): # noqa: F811
policy = create_optimizer_mock(
trainer_config, extrinsic_dummy_config, False, False, False
)
reward_signal_eval(policy, "extrinsic")
reward_signal_update(policy, "extrinsic")
if __name__ == "__main__":
pytest.main()

ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (520 changes)


import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,
MemoryEnvironment,
RecordEnvironment,
)
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
RewardSignalType,
EncoderType,
FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)
BRAIN_NAME = "1D"
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print(f"Last {last_n_rewards} rewards:", rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
def get_last_rewards(self):
return self._last_reward_summary
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self._last_reward_summary[category] = stats_summary.mean
def _check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
env_parameter_manager=None,
success_threshold=0.9,
env_manager=None,
):
if env_parameter_manager is None:
env_parameter_manager = EnvironmentParameterManager()
# Create controller and begin training.
with tempfile.TemporaryDirectory() as dir:
run_id = "id"
seed = 1337
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=dir,
train_model=True,
load_model=False,
seed=seed,
param_manager=env_parameter_manager,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=dir,
run_id=run_id,
param_manager=env_parameter_manager,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(PPO_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)
config = attr.evolve(
PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
max_steps=10000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_ppo(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(
PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(
PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=500,
summary_freq=100,
framework=FrameworkType.TENSORFLOW,
)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
new_network_settings = attr.evolve(
PPO_TF_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_TF_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=128,
)
config = attr.evolve(
PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=5000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(SAC_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(
SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
max_steps=10000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(
SAC_TF_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
)
config = attr.evolve(
SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
SAC_TF_CONFIG.hyperparameters,
batch_size=16,
learning_rate=3e-4,
buffer_init_steps=0,
)
config = attr.evolve(
SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
framework=FrameworkType.TENSORFLOW,
)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.5 if use_discrete else 0.2
env = MemoryEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TF_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
)
new_hyperparams = attr.evolve(
SAC_TF_CONFIG.hyperparameters,
batch_size=128,
learning_rate=1e-3,
buffer_init_steps=1000,
steps_per_update=2,
)
config = attr.evolve(
SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(
PPO_TF_CONFIG,
self_play=self_play_settings,
max_steps=2500,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(
PPO_TF_CONFIG,
self_play=self_play_settings,
max_steps=2500,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,
save_steps=10000,
swap_steps=10000,
team_change=400,
)
config = attr.evolve(
PPO_TF_CONFIG,
self_play=self_play_settings,
max_steps=4000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=0.0,
save_steps=5000,
swap_steps=5000,
team_change=2000,
)
config = attr.evolve(
PPO_TF_CONFIG,
self_play=self_play_settings,
max_steps=3000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
env = RecordEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,
)
# If we want to use true demos, we can solve the env in the usual way
# Otherwise, we can just call solve to execute the optimal policy
env.solve()
agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
brain_param_proto = BrainParametersProto(
vector_action_size=[2] if use_discrete else [1],
vector_action_descriptions=[""],
vector_action_space_type=discrete if use_discrete else continuous,
brain_name=BRAIN_NAME,
is_training=True,
)
action_type = "Discrete" if use_discrete else "Continuous"
demo_path_name = "1DTest" + action_type + ".demo"
demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
return demo_path
return record_demo
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_TF_CONFIG, SAC_TF_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
config = attr.evolve(
trainer_config,
reward_signals=reward_signals,
behavioral_cloning=bc_settings,
max_steps=500,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3e-4)
config = attr.evolve(
PPO_TF_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=1000,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(
SAC_TF_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
)
config = attr.evolve(
SAC_TF_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=500,
framework=FrameworkType.TENSORFLOW,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

ml-agents/mlagents/trainers/tests/test_bcmodule.py (129 changes)


import pytest
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
import os
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
TrainerSettings,
BehavioralCloningSettings,
NetworkSettings,
)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
# model_path = env.external_brain_names[0]
trainer_config = TrainerSettings()
trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = TFPolicy(
0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
)
with policy.graph.as_default():
bc_module = BCModule(
policy,
policy_learning_rate=trainer_config.hyperparameters.learning_rate,
default_batch_size=trainer_config.hyperparameters.batch_size,
default_num_epoch=3,
settings=bc_settings,
)
policy.initialize() # Normally the optimizer calls this after the BCModule is created
return bc_module
# Test default values
def test_bcmodule_defaults():
# See if default values match
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 3
assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
# Assign strange values and see if it overrides properly
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
num_epoch=100,
batch_size=10000,
)
bc_module = create_bc_module(mock_specs, bc_settings, False, False)
assert bc_module.num_epoch == 100
assert bc_module.batch_size == 10000
# Test with continuous control env and vector actions
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with constant pretraining learning rate
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
steps=0,
)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
old_learning_rate = bc_module.current_lr
_ = bc_module.update()
assert old_learning_rate == bc_module.current_lr
# Test with RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
mock_specs = mb.create_mock_3dball_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with discrete control and visual observations
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
mock_specs = mb.create_mock_banana_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
# Test with discrete control, visual observations and RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
mock_specs = mb.create_mock_banana_behavior_specs()
bc_settings = BehavioralCloningSettings(
demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
for _, item in stats.items():
assert isinstance(item, np.float32)
if __name__ == "__main__":
pytest.main()

ml-agents/mlagents/trainers/tests/test_reward_signals.py (197 changes)


import pytest
import copy
import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
from mlagents.trainers.settings import (
GAILSettings,
CuriositySettings,
RewardSignalSettings,
BehavioralCloningSettings,
NetworkSettings,
TrainerType,
RewardSignalType,
)
CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
def ppo_dummy_config():
return copy.deepcopy(PPO_CONFIG)
def sac_dummy_config():
return copy.deepcopy(SAC_CONFIG)
@pytest.fixture
def gail_dummy_config():
return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)}
@pytest.fixture
def curiosity_dummy_config():
return {RewardSignalType.CURIOSITY: CuriositySettings()}
@pytest.fixture
def extrinsic_dummy_config():
return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
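# Illustrative only (not part of the original file): reward_signals is a plain
# dict keyed by RewardSignalType, so several signals can be combined in a single
# trainer config; the fixtures above exercise them one at a time.
COMBINED_REWARDS_EXAMPLE = {
    RewardSignalType.EXTRINSIC: RewardSignalSettings(),
    RewardSignalType.CURIOSITY: CuriositySettings(),
}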
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
BATCH_SIZE = 12
NUM_AGENTS = 12
def create_optimizer_mock(
trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
mock_specs = mb.setup_test_behavior_specs(
use_discrete,
use_visual,
vector_action_space=DISCRETE_ACTION_SPACE
if use_discrete
else VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
)
trainer_settings = trainer_config
trainer_settings.reward_signals = reward_signal_config
trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
if use_rnn
else None
)
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)
else:
optimizer = PPOOptimizer(policy, trainer_settings)
optimizer.policy.initialize()
return optimizer
def reward_signal_eval(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
# Test evaluate
rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)
def reward_signal_update(optimizer, reward_signal_name):
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
optimizer.policy, buffer.make_mini_batch(0, 10), 2
)
out = optimizer.policy._execute_model(
feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
)
assert type(out) is dict
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):
trainer_config.behavioral_cloning = BehavioralCloningSettings(
demo_path=CONTINUOUS_PATH
)
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config, False, False, False
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config):
gail_dummy_config_discrete = {
RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH)
}
optimizer = create_optimizer_mock(
trainer_config, gail_dummy_config_discrete, False, True, True
)
reward_signal_eval(optimizer, "gail")
reward_signal_update(optimizer, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config):
policy = create_optimizer_mock(
trainer_config, gail_dummy_config, True, False, False
)
reward_signal_eval(policy, "gail")
reward_signal_update(policy, "gail")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_cc(trainer_config, curiosity_dummy_config):
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, False, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_dc(trainer_config, curiosity_dummy_config):
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, True, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_visual(trainer_config, curiosity_dummy_config):
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, False, False, True
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_rnn(trainer_config, curiosity_dummy_config):
policy = create_optimizer_mock(
trainer_config, curiosity_dummy_config, True, False, False
)
reward_signal_eval(policy, "curiosity")
reward_signal_update(policy, "curiosity")
@pytest.mark.parametrize(
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, extrinsic_dummy_config):
policy = create_optimizer_mock(
trainer_config, extrinsic_dummy_config, False, False, False
)
reward_signal_eval(policy, "extrinsic")
reward_signal_update(policy, "extrinsic")
if __name__ == "__main__":
pytest.main()

505
ml-agents/mlagents/trainers/tests/test_simple_rl.py


import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import (
SimpleEnvironment,
MemoryEnvironment,
RecordEnvironment,
)
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
SACSettings,
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
TrainerType,
RewardSignalType,
EncoderType,
ScheduleType,
FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
BRAIN_NAME = "1D"
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
framework=FrameworkType.TENSORFLOW,
)
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually,
# so that the final rewards can be post-processed differently for different algorithms.
# Custom reward processors should be built within the test function and passed to
# _check_environment_trains; a hedged sketch of one follows the default below.
# The default is the average of the last 5 final rewards.
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print(f"Last {last_n_rewards} rewards:", rewards_to_use)
return np.array(rewards_to_use, dtype=np.float32).mean()
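# A hedged sketch (not from the original file) of the custom-processor pattern
# described above: any callable over the final-reward list can be passed to
# _check_environment_trains via reward_processor. The name is hypothetical.
def min_reward_processor(rewards, last_n_rewards=5):
    # Score a run by the worst of its last N final rewards instead of the mean.
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).min()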
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
def get_last_rewards(self):
return self._last_reward_summary
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self._last_reward_summary[category] = stats_summary.mean
def _check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
env_parameter_manager=None,
success_threshold=0.9,
env_manager=None,
):
if env_parameter_manager is None:
env_parameter_manager = EnvironmentParameterManager()
# Create controller and begin training.
with tempfile.TemporaryDirectory() as dir:
run_id = "id"
seed = 1337
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=dir,
train_model=True,
load_model=False,
seed=seed,
param_manager=env_parameter_manager,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=dir,
run_id=run_id,
param_manager=env_parameter_manager,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(PPO_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
PPO_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_ppo(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=500,
summary_freq=100,
)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
new_network_settings = attr.evolve(
PPO_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
)
config = attr.evolve(
PPO_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(SAC_CONFIG)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
)
config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=16,
learning_rate=3e-4,
buffer_init_steps=0,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.5 if use_discrete else 0.2
env = MemoryEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
)
new_hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters,
batch_size=128,
learning_rate=1e-3,
buffer_init_steps=1000,
steps_per_update=2,
)
config = attr.evolve(
SAC_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=5000,
)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the ghosted policy is never swapped with a competent policy:
# swap_steps (4000) exceeds max_steps (2500), so the swap would only occur after training ends.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,
save_steps=10000,
swap_steps=10000,
team_change=400,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that is not learning when both have reached
# max_steps should still be executing the initial, untrained policy.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=0.0,
save_steps=5000,
swap_steps=5000,
team_change=2000,
)
config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=3000)
_check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
env = RecordEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,
)
# If we wanted true demos, we could train the env in the usual way.
# Instead, we just call solve() to step the env with the optimal policy.
env.solve()
agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
brain_param_proto = BrainParametersProto(
vector_action_size=[2] if use_discrete else [1],
vector_action_descriptions=[""],
vector_action_space_type=discrete if use_discrete else continuous,
brain_name=BRAIN_NAME,
is_training=True,
)
action_type = "Discrete" if use_discrete else "Continuous"
demo_path_name = "1DTest" + action_type + ".demo"
demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
return demo_path
return record_demo
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
config = attr.evolve(
trainer_config,
reward_signals=reward_signals,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4)
config = attr.evolve(
PPO_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=1000,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(
SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
)
config = attr.evolve(
SAC_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=500,
)
_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

/ml-agents/mlagents/trainers/tests/BasicLearning.pb → /ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb

/ml-agents/mlagents/trainers/tests/test_ghost.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py

/ml-agents/mlagents/trainers/tests/test_ppo.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py

/ml-agents/mlagents/trainers/tests/test_sac.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py

/ml-agents/mlagents/trainers/tests/test_tf_policy.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py

/ml-agents/mlagents/trainers/tests/test_barracuda_converter.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py

/ml-agents/mlagents/trainers/tests/test_distributions.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py

/ml-agents/mlagents/trainers/tests/test_models.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_models.py

/ml-agents/mlagents/trainers/tests/test_nn_policy.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py

/ml-agents/mlagents/trainers/tests/test_saver.py → /ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py
