Refactoring of the tests folder for the trainers (#4510)
* Refactoring of the tests folder for the trainers
* Fixing issues
* Fixing issues
* Fixing issues/MLA-1734-demo-provider
GitHub, 4 years ago
Current commit: e471bd8b
29 files changed, with 1090 insertions and 1043 deletions.
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (10 lines changed)
ml-agents/mlagents/trainers/tests/test_trainer_util.py (6 lines changed)
ml-agents/mlagents/trainers/tests/torch/test_ppo.py (12 lines changed)
ml-agents/mlagents/trainers/tests/torch/test_sac.py (10 lines changed)
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (251 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (21 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (10 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (2 lines changed)
ml-agents/mlagents/trainers/tests/check_env_trains.py (94 lines changed)
ml-agents/mlagents/trainers/tests/dummy_config.py (72 lines changed)
ml-agents/mlagents/trainers/tests/torch/__init__.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/__init__.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py (118 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py (176 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (520 lines changed)
ml-agents/mlagents/trainers/tests/test_bcmodule.py (129 lines changed)
ml-agents/mlagents/trainers/tests/test_reward_signals.py (197 lines changed)
ml-agents/mlagents/trainers/tests/test_simple_rl.py (505 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_models.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py (0 lines changed)
ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (0 lines changed)
ml-agents/mlagents/trainers/tests/check_env_trains.py:

import math
import tempfile
import numpy as np
from typing import Dict
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)


class DebugWriter(StatsWriter):
    """
    Print to stdout so stats can be viewed in pytest
    """

    def __init__(self):
        self._last_reward_summary: Dict[str, float] = {}

    def get_last_rewards(self):
        return self._last_reward_summary

    def write_stats(
        self, category: str, values: Dict[str, StatsSummary], step: int
    ) -> None:
        for val, stats_summary in values.items():
            if val == "Environment/Cumulative Reward":
                print(step, val, stats_summary.mean)
                self._last_reward_summary[category] = stats_summary.mean


# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print(f"Last {last_n_rewards} rewards:", rewards_to_use)
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()


def check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    env_parameter_manager=None,
    success_threshold=0.9,
    env_manager=None,
):
    if env_parameter_manager is None:
        env_parameter_manager = EnvironmentParameterManager()
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:
        run_id = "id"
        seed = 1337
        StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
        debug_writer = DebugWriter()
        StatsReporter.add_writer(debug_writer)
        if env_manager is None:
            env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
        trainer_factory = TrainerFactory(
            trainer_config=trainer_config,
            output_path=dir,
            train_model=True,
            load_model=False,
            seed=seed,
            param_manager=env_parameter_manager,
            multi_gpu=False,
        )

        tc = TrainerController(
            trainer_factory=trainer_factory,
            output_path=dir,
            run_id=run_id,
            param_manager=env_parameter_manager,
            train=True,
            training_seed=seed,
        )

        # Begin training
        tc.start_learning(env_manager)
        if (
            success_threshold is not None
        ):  # For tests where we are just checking setup and not reward
            processed_rewards = [
                reward_processor(rewards) for rewards in env.final_rewards.values()
            ]
            assert all(not math.isnan(reward) for reward in processed_rewards)
            assert all(reward > success_threshold for reward in processed_rewards)
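
For context, a minimal sketch of how a trainer test can drive this helper, assuming the SimpleEnvironment toy environment and the dummy_config helpers that appear later in this diff. It is an illustration, not part of the committed files, and the test name is hypothetical.

# Illustrative only: a hypothetical test built on check_environment_trains.
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.tests.check_env_trains import check_environment_trains
from mlagents.trainers.tests.dummy_config import ppo_dummy_config

BRAIN_NAME = "1D"


def test_ppo_trains_simple_env():  # hypothetical test name, for illustration
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=False)
    # Trains PPO on the toy env and asserts that the mean of the last 5 final
    # rewards exceeds the default success_threshold of 0.9 for every behavior.
    check_environment_trains(env, {BRAIN_NAME: ppo_dummy_config()})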

ml-agents/mlagents/trainers/tests/dummy_config.py:

import pytest
import copy
import os
from mlagents.trainers.settings import (
    TrainerSettings,
    PPOSettings,
    SACSettings,
    GAILSettings,
    CuriositySettings,
    RewardSignalSettings,
    NetworkSettings,
    TrainerType,
    RewardSignalType,
    ScheduleType,
)

CONTINUOUS_DEMO_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_DEMO_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"

_PPO_CONFIG = TrainerSettings(
    trainer_type=TrainerType.PPO,
    hyperparameters=PPOSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=32),
    summary_freq=500,
    max_steps=3000,
    threaded=False,
)

_SAC_CONFIG = TrainerSettings(
    trainer_type=TrainerType.SAC,
    hyperparameters=SACSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=8,
        buffer_init_steps=100,
        buffer_size=5000,
        tau=0.01,
        init_entcoef=0.01,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=16),
    summary_freq=100,
    max_steps=1000,
    threaded=False,
)


def ppo_dummy_config():
    return copy.deepcopy(_PPO_CONFIG)


def sac_dummy_config():
    return copy.deepcopy(_SAC_CONFIG)


@pytest.fixture
def gail_dummy_config():
    return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_DEMO_PATH)}


@pytest.fixture
def curiosity_dummy_config():
    return {RewardSignalType.CURIOSITY: CuriositySettings()}


@pytest.fixture
def extrinsic_dummy_config():
    return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
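
The test files below never mutate these shared settings in place; they derive per-test variants with attr.evolve. A minimal sketch of that pattern follows, with values taken from the tests later in this diff; it is illustrative only.

# Illustrative only: deriving a per-test config from the shared dummy settings
# without mutating them, using attr.evolve as the test files in this PR do.
import attr

from mlagents.trainers.settings import FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config

base = ppo_dummy_config()  # deep copy of _PPO_CONFIG, safe to modify per test
new_hyperparams = attr.evolve(base.hyperparameters, batch_size=64, buffer_size=640)
config = attr.evolve(
    base,
    hyperparameters=new_hyperparams,
    max_steps=10000,
    framework=FrameworkType.TENSORFLOW,
)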

ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py:

import pytest
import mlagents.trainers.tests.mock_brain as mb

import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
    TrainerSettings,
    BehavioralCloningSettings,
    NetworkSettings,
)

from mlagents.trainers.tests.dummy_config import (
    DISCRETE_DEMO_PATH,
    CONTINUOUS_DEMO_PATH,
)


def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize()  # Normally the optimizer calls this after the BCModule is created
    return bc_module


# Test default values
def test_bcmodule_defaults():
    # See if default values match
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 3
    assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
    # Assign strange values and see if it overrides properly
    bc_settings = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH, num_epoch=100, batch_size=10000
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 100
    assert bc_module.batch_size == 10000


# Test with continuous control env and vector actions
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with constant pretraining learning rate
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH, steps=0)
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)
    old_learning_rate = bc_module.current_lr

    _ = bc_module.update()
    assert old_learning_rate == bc_module.current_lr


# Test with RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with discrete control and visual observations
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
    mock_specs = mb.create_mock_banana_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with discrete control, visual observations and RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
    mock_specs = mb.create_mock_banana_behavior_specs()
    bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
    bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


if __name__ == "__main__":
    pytest.main()

ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py:

import pytest

import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.tests.dummy_config import (  # noqa: F401; pylint: disable=unused-variable
    ppo_dummy_config,
    sac_dummy_config,
    gail_dummy_config,
    curiosity_dummy_config,
    extrinsic_dummy_config,
    DISCRETE_DEMO_PATH,
    CONTINUOUS_DEMO_PATH,
)
from mlagents.trainers.settings import (
    GAILSettings,
    BehavioralCloningSettings,
    NetworkSettings,
    TrainerType,
    RewardSignalType,
)


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
BATCH_SIZE = 12
NUM_AGENTS = 12


def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    optimizer.policy.initialize()
    return optimizer


def reward_signal_eval(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
    # Test evaluate
    rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
    assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)


def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):  # noqa: F811
    trainer_config.behavioral_cloning = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH
    )
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config):  # noqa: F811
    gail_dummy_config_discrete = {
        RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_DEMO_PATH)
    }
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config_discrete, False, True, True
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, gail_dummy_config, True, False, False
    )
    reward_signal_eval(policy, "gail")
    reward_signal_update(policy, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_cc(trainer_config, curiosity_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_dc(trainer_config, curiosity_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, True, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_visual(trainer_config, curiosity_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, True
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_rnn(trainer_config, curiosity_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, True, False, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, extrinsic_dummy_config):  # noqa: F811
    policy = create_optimizer_mock(
        trainer_config, extrinsic_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "extrinsic")
    reward_signal_update(policy, "extrinsic")


if __name__ == "__main__":
    pytest.main()

ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py:

import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
    RecordEnvironment,
)
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents.trainers.settings import (
    NetworkSettings,
    SelfPlaySettings,
    BehavioralCloningSettings,
    GAILSettings,
    RewardSignalType,
    EncoderType,
    FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config

PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)

BRAIN_NAME = "1D"


# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print(f"Last {last_n_rewards} rewards:", rewards_to_use)
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()


class DebugWriter(StatsWriter):
    """
    Print to stdout so stats can be viewed in pytest
    """

    def __init__(self):
        self._last_reward_summary: Dict[str, float] = {}

    def get_last_rewards(self):
        return self._last_reward_summary

    def write_stats(
        self, category: str, values: Dict[str, StatsSummary], step: int
    ) -> None:
        for val, stats_summary in values.items():
            if val == "Environment/Cumulative Reward":
                print(step, val, stats_summary.mean)
                self._last_reward_summary[category] = stats_summary.mean


def _check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    env_parameter_manager=None,
    success_threshold=0.9,
    env_manager=None,
):
    if env_parameter_manager is None:
        env_parameter_manager = EnvironmentParameterManager()
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:
        run_id = "id"
        seed = 1337
        StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
        debug_writer = DebugWriter()
        StatsReporter.add_writer(debug_writer)
        if env_manager is None:
            env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
        trainer_factory = TrainerFactory(
            trainer_config=trainer_config,
            output_path=dir,
            train_model=True,
            load_model=False,
            seed=seed,
            param_manager=env_parameter_manager,
            multi_gpu=False,
        )

        tc = TrainerController(
            trainer_factory=trainer_factory,
            output_path=dir,
            run_id=run_id,
            param_manager=env_parameter_manager,
            train=True,
            training_seed=seed,
        )

        # Begin training
        tc.start_learning(env_manager)
        if (
            success_threshold is not None
        ):  # For tests where we are just checking setup and not reward
            processed_rewards = [
                reward_processor(rewards) for rewards in env.final_rewards.values()
            ]
            assert all(not math.isnan(reward) for reward in processed_rewards)
            assert all(reward > success_threshold for reward in processed_rewards)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = attr.evolve(PPO_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
    )
    new_hyperparams = attr.evolve(
        PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=new_hyperparams,
        max_steps=10000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_ppo(num_visual, use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=use_discrete,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=new_hyperparams,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=True,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
    )
    new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=500,
        summary_freq=100,
        framework=FrameworkType.TENSORFLOW,
    )
    # The number of steps is pretty small for these encoders
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    new_network_settings = attr.evolve(
        PPO_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TF_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = attr.evolve(SAC_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
    )
    new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        max_steps=10000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)


@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=use_discrete,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=True,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=16,
        learning_rate=3e-4,
        buffer_init_steps=0,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=100,
        framework=FrameworkType.TENSORFLOW,
    )
    # The number of steps is pretty small for these encoders
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
    step_size = 0.5 if use_discrete else 0.2
    env = MemoryEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
    )
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        self_play=self_play_settings,
        max_steps=2500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    # This config should fail because the ghosted policy is never swapped with a competent policy.
    # Swap occurs after max step is reached.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        self_play=self_play_settings,
        max_steps=2500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0,
        save_steps=10000,
        swap_steps=10000,
        team_change=400,
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        self_play=self_play_settings,
        max_steps=4000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
    )
    # This config should fail because the team that is not learning when both have reached
    # max step should be executing the initial, untrained policy.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=0.0,
        save_steps=5000,
        swap_steps=5000,
        team_change=2000,
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        self_play=self_play_settings,
        max_steps=3000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(
        env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
    )
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
    def record_demo(use_discrete, num_visual=0, num_vector=1):
        env = RecordEnvironment(
            [BRAIN_NAME],
            use_discrete=use_discrete,
            num_visual=num_visual,
            num_vector=num_vector,
            n_demos=100,
        )
        # If we want to use true demos, we can solve the env in the usual way
        # Otherwise, we can just call solve to execute the optimal policy
        env.solve()
        agent_info_protos = env.demonstration_protos[BRAIN_NAME]
        meta_data_proto = DemonstrationMetaProto()
        brain_param_proto = BrainParametersProto(
            vector_action_size=[2] if use_discrete else [1],
            vector_action_descriptions=[""],
            vector_action_space_type=discrete if use_discrete else continuous,
            brain_name=BRAIN_NAME,
            is_training=True,
        )
        action_type = "Discrete" if use_discrete else "Continuous"
        demo_path_name = "1DTest" + action_type + ".demo"
        demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
        write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
        return demo_path

    return record_demo


@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_TF_CONFIG, SAC_TF_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
    demo_path = simple_record(use_discrete)
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    config = attr.evolve(
        trainer_config,
        reward_signals=reward_signals,
        behavioral_cloning=bc_settings,
        max_steps=500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        use_discrete=use_discrete,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3e-4)
    config = attr.evolve(
        PPO_TF_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=1000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        use_discrete=use_discrete,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
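
The file list above also includes a torch/test_simple_rl.py added by this refactor; the backend is selected through the framework field on the trainer settings. A hedged sketch of how such a variant would differ, assuming FrameworkType also exposes a PYTORCH member (not shown anywhere in this diff):

# Assumption: FrameworkType.PYTORCH exists alongside FrameworkType.TENSORFLOW.
# Illustration of the framework switch only, not the contents of torch/test_simple_rl.py.
import attr

from mlagents.trainers.settings import FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)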

ml-agents/mlagents/trainers/tests/test_bcmodule.py:

import pytest
import mlagents.trainers.tests.mock_brain as mb

import numpy as np
import os

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
    TrainerSettings,
    BehavioralCloningSettings,
    NetworkSettings,
)


def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize()  # Normally the optimizer calls this after the BCModule is created
    return bc_module


# Test default values
def test_bcmodule_defaults():
    # See if default values match
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 3
    assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
    # Assign strange values and see if it overrides properly
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
        num_epoch=100,
        batch_size=10000,
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, False)
    assert bc_module.num_epoch == 100
    assert bc_module.batch_size == 10000


# Test with continuous control env and vector actions
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with constant pretraining learning rate
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_constant_lr_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo",
        steps=0,
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)
    old_learning_rate = bc_module.current_lr

    _ = bc_module.update()
    assert old_learning_rate == bc_module.current_lr


# Test with RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_update(is_sac):
    mock_specs = mb.create_mock_3dball_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "test.demo"
    )
    bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with discrete control and visual observations
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_dc_visual_update(is_sac):
    mock_specs = mb.create_mock_banana_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
    )
    bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


# Test with discrete control, visual observations and RNN
@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
def test_bcmodule_rnn_dc_update(is_sac):
    mock_specs = mb.create_mock_banana_behavior_specs()
    bc_settings = BehavioralCloningSettings(
        demo_path=os.path.dirname(os.path.abspath(__file__)) + "/" + "testdcvis.demo"
    )
    bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
    stats = bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)


if __name__ == "__main__":
    pytest.main()

ml-agents/mlagents/trainers/tests/test_reward_signals.py:

import pytest
import copy
import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer_tf import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG
from mlagents.trainers.settings import (
    GAILSettings,
    CuriositySettings,
    RewardSignalSettings,
    BehavioralCloningSettings,
    NetworkSettings,
    TrainerType,
    RewardSignalType,
)

CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"


def ppo_dummy_config():
    return copy.deepcopy(PPO_CONFIG)


def sac_dummy_config():
    return copy.deepcopy(SAC_CONFIG)


@pytest.fixture
def gail_dummy_config():
    return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)}


@pytest.fixture
def curiosity_dummy_config():
    return {RewardSignalType.CURIOSITY: CuriositySettings()}


@pytest.fixture
def extrinsic_dummy_config():
    return {RewardSignalType.EXTRINSIC: RewardSignalSettings()}


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
BATCH_SIZE = 12
NUM_AGENTS = 12


def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    optimizer.policy.initialize()
    return optimizer


def reward_signal_eval(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
    # Test evaluate
    rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
    assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)


def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):
    trainer_config.behavioral_cloning = BehavioralCloningSettings(
        demo_path=CONTINUOUS_PATH
    )
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_dc_visual(trainer_config, gail_dummy_config):
    gail_dummy_config_discrete = {
        RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH)
    }
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config_discrete, False, True, True
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_rnn(trainer_config, gail_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, gail_dummy_config, True, False, False
    )
    reward_signal_eval(policy, "gail")
    reward_signal_update(policy, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_cc(trainer_config, curiosity_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_dc(trainer_config, curiosity_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, True, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_visual(trainer_config, curiosity_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, True
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_curiosity_rnn(trainer_config, curiosity_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, True, False, False
    )
    reward_signal_eval(policy, "curiosity")
    reward_signal_update(policy, "curiosity")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic(trainer_config, extrinsic_dummy_config):
    policy = create_optimizer_mock(
        trainer_config, extrinsic_dummy_config, False, False, False
    )
    reward_signal_eval(policy, "extrinsic")
    reward_signal_update(policy, "extrinsic")


if __name__ == "__main__":
    pytest.main()
|||
import math |
|||
import tempfile |
|||
import pytest |
|||
import numpy as np |
|||
import attr |
|||
from typing import Dict |
|||
|
|||
from mlagents.trainers.tests.simple_test_envs import ( |
|||
SimpleEnvironment, |
|||
MemoryEnvironment, |
|||
RecordEnvironment, |
|||
) |
|||
from mlagents.trainers.trainer_controller import TrainerController |
|||
from mlagents.trainers.trainer_util import TrainerFactory |
|||
from mlagents.trainers.simple_env_manager import SimpleEnvManager |
|||
from mlagents.trainers.demo_loader import write_demo |
|||
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary |
|||
from mlagents.trainers.settings import ( |
|||
TrainerSettings, |
|||
PPOSettings, |
|||
SACSettings, |
|||
NetworkSettings, |
|||
SelfPlaySettings, |
|||
BehavioralCloningSettings, |
|||
GAILSettings, |
|||
TrainerType, |
|||
RewardSignalType, |
|||
EncoderType, |
|||
ScheduleType, |
|||
FrameworkType, |
|||
) |
|||
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager |
|||
from mlagents_envs.side_channel.environment_parameters_channel import ( |
|||
EnvironmentParametersChannel, |
|||
) |
|||
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import ( |
|||
DemonstrationMetaProto, |
|||
) |
|||
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto |
|||
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous |
|||
|
|||
BRAIN_NAME = "1D" |
|||
|
|||
|
|||
PPO_CONFIG = TrainerSettings( |
|||
trainer_type=TrainerType.PPO, |
|||
hyperparameters=PPOSettings( |
|||
learning_rate=5.0e-3, |
|||
learning_rate_schedule=ScheduleType.CONSTANT, |
|||
batch_size=16, |
|||
buffer_size=64, |
|||
), |
|||
network_settings=NetworkSettings(num_layers=1, hidden_units=32), |
|||
summary_freq=500, |
|||
max_steps=3000, |
|||
threaded=False, |
|||
framework=FrameworkType.TENSORFLOW, |
|||
) |
|||
|
|||
SAC_CONFIG = TrainerSettings( |
|||
trainer_type=TrainerType.SAC, |
|||
hyperparameters=SACSettings( |
|||
learning_rate=5.0e-3, |
|||
learning_rate_schedule=ScheduleType.CONSTANT, |
|||
batch_size=8, |
|||
buffer_init_steps=100, |
|||
buffer_size=5000, |
|||
tau=0.01, |
|||
init_entcoef=0.01, |
|||
), |
|||
network_settings=NetworkSettings(num_layers=1, hidden_units=16), |
|||
summary_freq=100, |
|||
max_steps=1000, |
|||
threaded=False, |
|||
) |
|||
|
|||
|
|||
# The reward processor is passed as an argument to _check_environment_trains. |
|||
# It is applied to the list of all final rewards for each brain individually. |
|||
# This is so that we can process all final rewards in different ways for different algorithms. |
|||
# Custom reward processors should be built within the test function and passed to _check_environment_trains |
|||
# Default is average over the last 5 final rewards |
|||
def default_reward_processor(rewards, last_n_rewards=5): |
|||
rewards_to_use = rewards[-last_n_rewards:] |
|||
# For debugging tests |
|||
print(f"Last {last_n_rewards} rewards:", rewards_to_use) |
|||
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean() |
|||
|
|||
|
|||
class DebugWriter(StatsWriter): |
|||
""" |
|||
Print to stdout so stats can be viewed in pytest |
|||
""" |
|||
|
|||
def __init__(self): |
|||
self._last_reward_summary: Dict[str, float] = {} |
|||
|
|||
def get_last_rewards(self): |
|||
return self._last_reward_summary |
|||
|
|||
def write_stats( |
|||
self, category: str, values: Dict[str, StatsSummary], step: int |
|||
) -> None: |
|||
for val, stats_summary in values.items(): |
|||
if val == "Environment/Cumulative Reward": |
|||
print(step, val, stats_summary.mean) |
|||
self._last_reward_summary[category] = stats_summary.mean |
|||
|
|||
|
|||
def _check_environment_trains( |
|||
env, |
|||
trainer_config, |
|||
reward_processor=default_reward_processor, |
|||
env_parameter_manager=None, |
|||
success_threshold=0.9, |
|||
env_manager=None, |
|||
): |
|||
if env_parameter_manager is None: |
|||
env_parameter_manager = EnvironmentParameterManager() |
|||
# Create controller and begin training. |
|||
with tempfile.TemporaryDirectory() as dir: |
|||
run_id = "id" |
|||
seed = 1337 |
|||
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file |
|||
debug_writer = DebugWriter() |
|||
StatsReporter.add_writer(debug_writer) |
|||
if env_manager is None: |
|||
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel()) |
|||
trainer_factory = TrainerFactory( |
|||
trainer_config=trainer_config, |
|||
output_path=dir, |
|||
train_model=True, |
|||
load_model=False, |
|||
seed=seed, |
|||
param_manager=env_parameter_manager, |
|||
multi_gpu=False, |
|||
) |
|||
|
|||
tc = TrainerController( |
|||
trainer_factory=trainer_factory, |
|||
output_path=dir, |
|||
run_id=run_id, |
|||
param_manager=env_parameter_manager, |
|||
train=True, |
|||
training_seed=seed, |
|||
) |
|||
|
|||
# Begin training |
|||
tc.start_learning(env_manager) |
|||
if ( |
|||
success_threshold is not None |
|||
): # For tests where we are just checking setup and not reward |
|||
processed_rewards = [ |
|||
reward_processor(rewards) for rewards in env.final_rewards.values() |
|||
] |
|||
assert all(not math.isnan(reward) for reward in processed_rewards) |
|||
assert all(reward > success_threshold for reward in processed_rewards) |
|||
|
|||
|
|||
@pytest.mark.parametrize("use_discrete", [True, False]) |
|||
def test_simple_ppo(use_discrete): |
|||
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete) |
|||
config = attr.evolve(PPO_CONFIG) |
|||
_check_environment_trains(env, {BRAIN_NAME: config}) |
|||
|
|||
|
|||
@pytest.mark.parametrize("use_discrete", [True, False]) |
|||
def test_2d_ppo(use_discrete): |
|||
env = SimpleEnvironment( |
|||
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8 |
|||
) |
|||
new_hyperparams = attr.evolve( |
|||
PPO_CONFIG.hyperparameters, batch_size=64, buffer_size=640 |
|||
) |
|||
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=10000) |
|||
_check_environment_trains(env, {BRAIN_NAME: config}) |
|||
|
|||
|
|||
@pytest.mark.parametrize("use_discrete", [True, False]) |
|||
@pytest.mark.parametrize("num_visual", [1, 2]) |
|||
def test_visual_ppo(num_visual, use_discrete): |
|||
env = SimpleEnvironment( |
|||
[BRAIN_NAME], |
|||
use_discrete=use_discrete, |
|||
num_visual=num_visual, |
|||
num_vector=0, |
|||
step_size=0.2, |
|||
) |
|||
new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4) |
|||
config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams) |
|||
_check_environment_trains(env, {BRAIN_NAME: config}) |
|||
|
|||
|
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=True,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
    )
    new_hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3.0e-4)
    config = attr.evolve(
        PPO_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=500,
        summary_freq=100,
    )
    # The number of steps is pretty small for these encoders
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


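# Memory environment: PPO is given recurrent memory (MemorySettings with
# memory_size=16) along with batch_size=64, buffer_size=128, and a 1.0e-3
# learning rate.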
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    new_network_settings = attr.evolve(
        PPO_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
    )
    config = attr.evolve(
        PPO_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


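# SAC counterparts of the PPO tests above, starting with the basic 1D
# environment and SAC_CONFIG unchanged.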
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = attr.evolve(SAC_CONFIG)
    _check_environment_trains(env, {BRAIN_NAME: config})


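# Two-dimensional action space with SAC; the replay buffer is pre-filled
# (buffer_init_steps=2000) and the success threshold is relaxed to 0.8.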
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
    )
    new_hyperparams = attr.evolve(SAC_CONFIG.hyperparameters, buffer_init_steps=2000)
    config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams, max_steps=10000)
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)


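# Visual observations with SAC, using batch_size=16 and a 3e-4 learning rate.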
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=use_discrete,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
    )
    config = attr.evolve(SAC_CONFIG, hyperparameters=new_hyperparams)
    _check_environment_trains(env, {BRAIN_NAME: config})


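# Alternative visual encoders with SAC; as with PPO, only a short run
# (max_steps=100) is done, so the success threshold is 0.5.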
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        use_discrete=True,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        SAC_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
    )
    new_hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters,
        batch_size=16,
        learning_rate=3e-4,
        buffer_init_steps=0,
    )
    config = attr.evolve(
        SAC_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=100,
    )
    # The number of steps is pretty small for these encoders
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


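# Memory environment with SAC: recurrent memory (memory_size=16,
# sequence_length=16), batch_size=128, buffer_init_steps=1000,
# steps_per_update=2, and a 1e-3 learning rate.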
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
    step_size = 0.5 if use_discrete else 0.2
    env = MemoryEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
    )
    new_networksettings = attr.evolve(
        SAC_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})


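# Self-play (ghost trainer) tests: the same brain plays on two teams and
# trains against snapshots of itself.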
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
    )
    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
    _check_environment_trains(env, {BRAIN_NAME: config})


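# Negative case: swap_steps is larger than max_steps, so the ghosted policy is
# never swapped in and only one team learns; the test asserts one reward above
# and one below 0.9.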
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    # This config should fail because the ghosted policy is never swapped with a competent policy.
    # Swap occurs after max step is reached.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
    )
    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=2500)
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


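# Asymmetric self-play: the opponent is a separate brain with its own config,
# and team_change=400 alternates which team is learning.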
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0,
        save_steps=10000,
        swap_steps=10000,
        team_change=400,
    )
    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=4000)
    _check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})


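# Negative asymmetric case: with play_against_latest_model_ratio=0.0 the
# non-learning team keeps executing the initial snapshot, so only one of the
# two brains should end up above the 0.9 threshold.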
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
    )
    # This config should fail because the team that is not learning when both have reached
    # max step should be executing the initial, untrained policy.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=0.0,
        save_steps=5000,
        swap_steps=5000,
        team_change=2000,
    )
    config = attr.evolve(PPO_CONFIG, self_play=self_play_settings, max_steps=3000)
    _check_environment_trains(
        env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
    )
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


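# Session-scoped fixture that records a demonstration: it runs the optimal
# policy in a RecordEnvironment via env.solve(), then writes the collected
# protos to a .demo file with write_demo(). The fixture returns a record_demo
# function so tests can choose discrete/continuous actions and the observation
# types; record_demo returns the path of the written .demo file.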
@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
    def record_demo(use_discrete, num_visual=0, num_vector=1):
        env = RecordEnvironment(
            [BRAIN_NAME],
            use_discrete=use_discrete,
            num_visual=num_visual,
            num_vector=num_vector,
            n_demos=100,
        )
        # If we want to use true demos, we can solve the env in the usual way.
        # Otherwise, we can just call solve to execute the optimal policy.
        env.solve()
        agent_info_protos = env.demonstration_protos[BRAIN_NAME]
        meta_data_proto = DemonstrationMetaProto()
        brain_param_proto = BrainParametersProto(
            vector_action_size=[2] if use_discrete else [1],
            vector_action_descriptions=[""],
            vector_action_space_type=discrete if use_discrete else continuous,
            brain_name=BRAIN_NAME,
            is_training=True,
        )
        action_type = "Discrete" if use_discrete else "Continuous"
        demo_path_name = "1DTest" + action_type + ".demo"
        demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
        write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
        return demo_path

    return record_demo


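# GAIL plus behavioral cloning from the recorded demo, parametrized over both
# the PPO and SAC configs.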
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
    demo_path = simple_record(use_discrete)
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    config = attr.evolve(
        trainer_config,
        reward_signals=reward_signals,
        behavioral_cloning=bc_settings,
        max_steps=500,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


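# GAIL with visual observations and PPO; the demo is recorded with one camera
# and no vector observations.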
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        use_discrete=use_discrete,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(PPO_CONFIG.hyperparameters, learning_rate=3e-4)
    config = attr.evolve(
        PPO_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=1000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


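# GAIL with visual observations and SAC, using batch_size=16 and a 500-step
# run.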
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        use_discrete=use_discrete,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
    )
    config = attr.evolve(
        SAC_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=500,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)