|
|
|
|
|
|
from unittest import mock

import numpy as np
import pytest

import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.sac.policy import SACPolicy
|
|
|
|
|
|
VECTOR_OBS_SPACE = 8 |
|
|
|
DISCRETE_ACTION_SPACE = [3, 3, 3, 2] |
|
|
|
BUFFER_INIT_SAMPLES = 20 |
|
|
|
BATCH_SIZE = 12 |
|
|
|
mock_env, trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual |
|
|
|
trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual |
|
|
|
env, mock_brain, _ = mb.setup_mock_env_and_brains( |
|
|
|
mock_env, |
|
|
|
mock_brain = mb.setup_mock_brain( |
|
|
|
num_agents=NUM_AGENTS, |
|
|
|
vector_action_space=VECTOR_ACTION_SPACE, |
|
|
|
vector_obs_space=VECTOR_OBS_SPACE, |
|
|
|
discrete_action_space=DISCRETE_ACTION_SPACE, |
|
|
|
|
|
|
model_path = env.external_brain_names[0] |
|
|
|
model_path = "testpath" |
|
|
|
trainer_parameters["model_path"] = model_path |
|
|
|
trainer_parameters["keep_checkpoints"] = 3 |
|
|
|
trainer_parameters["reward_signals"].update(reward_signal_config) |
|
|
|
|
|
|
else: |
|
|
|
policy = SACPolicy(0, mock_brain, trainer_parameters, False, False) |
|
|
|
return env, policy |
|
|
|
return policy |
|
|
|
def reward_signal_eval(env, policy, reward_signal_name): |
|
|
|
brain_infos = env.reset() |
|
|
|
brain_info = brain_infos[env.external_brain_names[0]] |
|
|
|
next_brain_info = env.step()[env.external_brain_names[0]] |
|
|
|
def reward_signal_eval(policy, reward_signal_name): |
|
|
|
buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain) |
|
|
|
action = np.ones((len(brain_info.agents), policy.num_branches), dtype=np.float32) |
|
|
|
rsig_result = policy.reward_signals[reward_signal_name].evaluate( |
|
|
|
brain_info, action, next_brain_info |
|
|
|
) |
|
|
|
assert rsig_result.scaled_reward.shape == (NUM_AGENTS,) |
|
|
|
assert rsig_result.unscaled_reward.shape == (NUM_AGENTS,) |
|
|
|
rsig_result = policy.reward_signals[reward_signal_name].evaluate_batch(buffer) |
|
|
|
assert rsig_result.scaled_reward.shape == (BATCH_SIZE,) |
|
|
|
assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,) |
|
|
|
def reward_signal_update(env, policy, reward_signal_name): |
|
|
|
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
def reward_signal_update(policy, reward_signal_name): |
|
|
|
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain) |
|
|
|
feed_dict = policy.reward_signals[reward_signal_name].prepare_update( |
|
|
|
policy.model, buffer.make_mini_batch(0, 10), 2 |
|
|
|
) |
|
|
|
|
|
|
@pytest.mark.parametrize( |
|
|
|
"trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"] |
|
|
|
) |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_gail_cc(mock_env, trainer_config, gail_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, gail_dummy_config, False, False, False |
|
|
|
) |
|
|
|
reward_signal_eval(env, policy, "gail") |
|
|
|
reward_signal_update(env, policy, "gail") |
|
|
|
def test_gail_cc(trainer_config, gail_dummy_config): |
|
|
|
policy = create_policy_mock(trainer_config, gail_dummy_config, False, False, False) |
|
|
|
reward_signal_eval(policy, "gail") |
|
|
|
reward_signal_update(policy, "gail") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_gail_dc_visual(mock_env, trainer_config, gail_dummy_config): |
|
|
|
def test_gail_dc_visual(trainer_config, gail_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, gail_dummy_config, False, True, True |
|
|
|
) |
|
|
|
reward_signal_eval(env, policy, "gail") |
|
|
|
reward_signal_update(env, policy, "gail") |
|
|
|
policy = create_policy_mock(trainer_config, gail_dummy_config, False, True, True) |
|
|
|
reward_signal_eval(policy, "gail") |
|
|
|
reward_signal_update(policy, "gail") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_gail_rnn(mock_env, trainer_config, gail_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, gail_dummy_config, True, False, False |
|
|
|
) |
|
|
|
reward_signal_eval(env, policy, "gail") |
|
|
|
reward_signal_update(env, policy, "gail") |
|
|
|
def test_gail_rnn(trainer_config, gail_dummy_config): |
|
|
|
policy = create_policy_mock(trainer_config, gail_dummy_config, True, False, False) |
|
|
|
reward_signal_eval(policy, "gail") |
|
|
|
reward_signal_update(policy, "gail") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_curiosity_cc(mock_env, trainer_config, curiosity_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, curiosity_dummy_config, False, False, False |
|
|
|
def test_curiosity_cc(trainer_config, curiosity_dummy_config): |
|
|
|
policy = create_policy_mock( |
|
|
|
trainer_config, curiosity_dummy_config, False, False, False |
|
|
|
reward_signal_eval(env, policy, "curiosity") |
|
|
|
reward_signal_update(env, policy, "curiosity") |
|
|
|
reward_signal_eval(policy, "curiosity") |
|
|
|
reward_signal_update(policy, "curiosity") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_curiosity_dc(mock_env, trainer_config, curiosity_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, curiosity_dummy_config, False, True, False |
|
|
|
def test_curiosity_dc(trainer_config, curiosity_dummy_config): |
|
|
|
policy = create_policy_mock( |
|
|
|
trainer_config, curiosity_dummy_config, False, True, False |
|
|
|
reward_signal_eval(env, policy, "curiosity") |
|
|
|
reward_signal_update(env, policy, "curiosity") |
|
|
|
reward_signal_eval(policy, "curiosity") |
|
|
|
reward_signal_update(policy, "curiosity") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_curiosity_visual(mock_env, trainer_config, curiosity_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, curiosity_dummy_config, False, False, True |
|
|
|
def test_curiosity_visual(trainer_config, curiosity_dummy_config): |
|
|
|
policy = create_policy_mock( |
|
|
|
trainer_config, curiosity_dummy_config, False, False, True |
|
|
|
reward_signal_eval(env, policy, "curiosity") |
|
|
|
reward_signal_update(env, policy, "curiosity") |
|
|
|
reward_signal_eval(policy, "curiosity") |
|
|
|
reward_signal_update(policy, "curiosity") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_curiosity_rnn(mock_env, trainer_config, curiosity_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, curiosity_dummy_config, True, False, False |
|
|
|
def test_curiosity_rnn(trainer_config, curiosity_dummy_config): |
|
|
|
policy = create_policy_mock( |
|
|
|
trainer_config, curiosity_dummy_config, True, False, False |
|
|
|
reward_signal_eval(env, policy, "curiosity") |
|
|
|
reward_signal_update(env, policy, "curiosity") |
|
|
|
reward_signal_eval(policy, "curiosity") |
|
|
|
reward_signal_update(policy, "curiosity") |
|
|
|
@mock.patch("mlagents_envs.environment.UnityEnvironment") |
|
|
|
def test_extrinsic(mock_env, trainer_config, curiosity_dummy_config): |
|
|
|
env, policy = create_policy_mock( |
|
|
|
mock_env, trainer_config, curiosity_dummy_config, False, False, False |
|
|
|
def test_extrinsic(trainer_config, curiosity_dummy_config): |
|
|
|
policy = create_policy_mock( |
|
|
|
trainer_config, curiosity_dummy_config, False, False, False |
|
|
|
reward_signal_eval(env, policy, "extrinsic") |
|
|
|
reward_signal_update(env, policy, "extrinsic") |
|
|
|
reward_signal_eval(policy, "extrinsic") |
|
|
|
reward_signal_update(policy, "extrinsic") |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |