import pytest
import yaml
import numpy as np
from typing import Dict, Any

from mlagents.trainers.tests.simple_test_envs import (
    Simple1DEnvironment,
    Memory1DEnvironment,
)
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents_envs.side_channel.float_properties_channel import FloatPropertiesChannel

BRAIN_NAME = "1D"
|
|
PPO_CONFIG = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 16
        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        learning_rate_schedule: constant
        max_steps: 1500
        memory_size: 16
        normalize: false
        num_epoch: 3
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """
|
|
PPO_CONFIG_RECURRENT = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 64
        beta: 5.0e-3
        buffer_size: 128
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        learning_rate_schedule: constant
        max_steps: 3000
        memory_size: 16
        normalize: false
        num_epoch: 3
        num_layers: 1
        time_horizon: 64
        sequence_length: 32
        summary_freq: 500
        use_recurrent: true
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """
|
|
|
|
|
|
|
SAC_CONFIG = f"""
    {BRAIN_NAME}:
        trainer: sac
        batch_size: 8
        buffer_size: 500
        buffer_init_steps: 100
        hidden_units: 16
        init_entcoef: 0.01
        learning_rate: 5.0e-3
        max_steps: 1000
        memory_size: 256
        normalize: false
        num_update: 1
        train_interval: 1
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 100
        tau: 0.01
        use_recurrent: false
        curiosity_enc_size: 128
        demo_path: None
        vis_encode_type: simple
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """
|
|
|
|
|
|
|
SAC_CONFIG_RECURRENT = f"""
    {BRAIN_NAME}:
        trainer: sac
        batch_size: 32
        buffer_size: 500
        buffer_init_steps: 100
        hidden_units: 16
        init_entcoef: 0.01
        learning_rate: 5.0e-3
        max_steps: 1000
        memory_size: 16
        normalize: false
        num_update: 1
        train_interval: 1
        num_layers: 1
        time_horizon: 64
        sequence_length: 32
        summary_freq: 100
        tau: 0.01
        use_recurrent: true
        curiosity_enc_size: 128
        demo_path: None
        vis_encode_type: simple
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """
|
|
GHOST_CONFIG_PASS = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 16
        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500
        memory_size: 256
        normalize: false
        num_epoch: 3
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
        self_play:
            play_against_current_self_ratio: 1.0
            save_steps: 2000
            swap_steps: 2000
    """
|
|
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
GHOST_CONFIG_FAIL = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 16
        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500
        memory_size: 256
        normalize: false
        num_epoch: 3
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
        self_play:
            play_against_current_self_ratio: 1.0
            save_steps: 2000
            swap_steps: 4000
    """
|
|
|
def generate_config(
    config: str, override_vals: Dict[str, Any] = None
) -> Dict[str, Any]:
    trainer_config = yaml.safe_load(config)
    if override_vals is not None:
        trainer_config[BRAIN_NAME].update(override_vals)
    return trainer_config
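# Example (hypothetical values): generate_config(PPO_CONFIG, {"max_steps": 500})
# parses the YAML string above and overrides max_steps in BRAIN_NAME's section.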
|
|
|
|
|
|
|
|
|
|
|
# The reward processor is passed as an argument to _check_environment_trains. |
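# The helper and writer below are referenced by _check_environment_trains but
# were missing from this file. These are minimal sketches: the processor is
# assumed to reduce an episode's reward list to a score by averaging the last
# few rewards, and DebugWriter assumes the StatsWriter interface from
# mlagents.trainers.stats.
def default_reward_processor(rewards, last_n_rewards=5):
    # Mean of the last `last_n_rewards` rewards, as a float32 scalar.
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()


class DebugWriter(StatsWriter):
    """
    Prints the mean cumulative reward to stdout so it is visible in pytest,
    and records the last reward summary per category for assertions.
    """

    def __init__(self):
        self._last_reward_summary: Dict[str, float] = {}

    def get_last_rewards(self) -> Dict[str, float]:
        return self._last_reward_summary

    def write_stats(
        self, category: str, values: Dict[str, StatsSummary], step: int
    ) -> None:
        for val, stats_summary in values.items():
            if val == "Environment/Cumulative Reward":
                print(step, val, stats_summary.mean)
                self._last_reward_summary[category] = stats_summary.mean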
|
|
|
|
|
|
|
|
|
|
def _check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    meta_curriculum=None,
    success_threshold=0.99,
):
    StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
    debug_writer = DebugWriter()
    StatsReporter.add_writer(debug_writer)
    env_manager = SimpleEnvManager(env, FloatPropertiesChannel())
    trainer_factory = TrainerFactory(
        trainer_config=trainer_config,
        # ... (the remaining TrainerFactory arguments, the training run, and the
        # success_threshold check on the processed rewards follow here but were
        # truncated in this file)
    )
|
|
@pytest.mark.parametrize("use_discrete", [True, False]) |
|
|
|
def test_simple_ppo(use_discrete): |
|
|
|
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete) |
|
|
|
_check_environment_trains(env, PPO_CONFIG) |
|
|
|
config = generate_config(PPO_CONFIG) |
|
|
|
_check_environment_trains(env, config) |
|
|
|
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
    env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = generate_config(PPO_CONFIG_RECURRENT)
    _check_environment_trains(env, config)
|
|
@pytest.mark.parametrize("num_visual", [1, 2]) |
|
|
|
def test_visual_ppo(num_visual, use_discrete): |
|
|
|
env = Simple1DEnvironment( |
|
|
|
[BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0 |
|
|
|
) |
|
|
|
override_vals = {"learning_rate": 3.0e-4} |
|
|
|
config = generate_config(PPO_CONFIG, override_vals) |
|
|
|
_check_environment_trains(env, config) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("num_visual", [1, 2]) |
|
|
|
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"]) |
|
|
|
def test_visual_advanced_ppo(vis_encode_type, num_visual): |
|
|
|
env = Simple1DEnvironment( |
|
|
|
[BRAIN_NAME], |
|
|
|
use_discrete=True, |
|
|
|
num_visual=num_visual, |
|
|
|
num_vector=0, |
|
|
|
step_size=0.5, |
|
|
|
vis_obs_size=(36, 36, 3), |
|
|
|
) |
|
|
|
override_vals = { |
|
|
|
"learning_rate": 3.0e-4, |
|
|
|
"vis_encode_type": vis_encode_type, |
|
|
|
"max_steps": 500, |
|
|
|
"summary_freq": 100, |
|
|
|
} |
|
|
|
config = generate_config(PPO_CONFIG, override_vals) |
|
|
|
# The number of steps is pretty small for these encoders |
|
|
|
_check_environment_trains(env, config, success_threshold=0.9) |
|
|
|
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    config = generate_config(SAC_CONFIG)
    _check_environment_trains(env, config)
|
|
@pytest.mark.parametrize("use_discrete", [True, False]) |
|
|
|
@pytest.mark.parametrize("num_visual", [1, 2]) |
|
|
|
def test_visual_sac(num_visual, use_discrete): |
|
|
|
env = Simple1DEnvironment( |
|
|
|
[BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0 |
|
|
|
) |
|
|
|
override_vals = {"batch_size": 16, "learning_rate": 3e-4} |
|
|
|
config = generate_config(SAC_CONFIG, override_vals) |
|
|
|
_check_environment_trains(env, config) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("num_visual", [1, 2]) |
|
|
|
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"]) |
|
|
|
def test_visual_advanced_sac(vis_encode_type, num_visual): |
|
|
|
env = Simple1DEnvironment( |
|
|
|
[BRAIN_NAME], |
|
|
|
use_discrete=True, |
|
|
|
num_visual=num_visual, |
|
|
|
num_vector=0, |
|
|
|
step_size=0.5, |
|
|
|
vis_obs_size=(36, 36, 3), |
|
|
|
) |
|
|
|
override_vals = { |
|
|
|
"batch_size": 16, |
|
|
|
"learning_rate": 3.0e-4, |
|
|
|
"vis_encode_type": vis_encode_type, |
|
|
|
"buffer_init_steps": 0, |
|
|
|
"max_steps": 100, |
|
|
|
} |
|
|
|
config = generate_config(SAC_CONFIG, override_vals) |
|
|
|
# The number of steps is pretty small for these encoders |
|
|
|
_check_environment_trains(env, config, success_threshold=0.9) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("use_discrete", [True, False]) |
|
|
|
def test_recurrent_sac(use_discrete): |
|
|
|
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete) |
|
|
|
override_vals = {"batch_size": 32, "use_recurrent": True} |
|
|
|
config = generate_config(SAC_CONFIG, override_vals) |
|
|
|
_check_environment_trains(env, config) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("use_discrete", [True, False]) |
|
|
|
|
|
|
) |
|
|
|
_check_environment_trains(env, GHOST_CONFIG_PASS) |
|
|
|
override_vals = { |
|
|
|
"max_steps": 2500, |
|
|
|
"self_play": { |
|
|
|
"play_against_current_self_ratio": 1.0, |
|
|
|
"save_steps": 2000, |
|
|
|
"swap_steps": 2000, |
|
|
|
}, |
|
|
|
} |
|
|
|
config = generate_config(PPO_CONFIG, override_vals) |
|
|
|
_check_environment_trains(env, config) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("use_discrete", [True, False]) |
|
|
|
|
|
|
) |
|
|
|
_check_environment_trains(env, GHOST_CONFIG_FAIL, success_threshold=None) |
|
|
|
# This config should fail because the ghosted policy is never swapped with a competent policy. |
|
|
|
# Swap occurs after max step is reached. |
|
|
|
override_vals = { |
|
|
|
"max_steps": 2500, |
|
|
|
"self_play": { |
|
|
|
"play_against_current_self_ratio": 1.0, |
|
|
|
"save_steps": 2000, |
|
|
|
"swap_steps": 4000, |
|
|
|
}, |
|
|
|
} |
|
|
|
config = generate_config(PPO_CONFIG, override_vals) |
|
|
|
_check_environment_trains(env, config, success_threshold=None) |
|
|
|
processed_rewards = [ |
|
|
|
default_reward_processor(rewards) for rewards in env.final_rewards.values() |
|
|
|
] |
|
|
|