
[tests] Visual observation tests (#3549)

/bug-failed-api-check
GitHub, 5 years ago
Current commit
bcce774f
3 files changed, 156 insertions and 133 deletions
  1. ml-agents/mlagents/trainers/tests/simple_test_envs.py (40 changes)
  2. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (4 changes)
  3. ml-agents/mlagents/trainers/tests/test_simple_rl.py (245 changes)

ml-agents/mlagents/trainers/tests/simple_test_envs.py (40 changes)


import random
from typing import Dict, List
from typing import Dict, List, Any
import numpy as np
from mlagents_envs.base_env import (

)
OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.001

it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
"""
def __init__(self, brain_names, use_discrete, step_size=STEP_SIZE):
def __init__(
self,
brain_names,
use_discrete,
step_size=STEP_SIZE,
num_visual=0,
num_vector=1,
vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
):
self.num_visual = num_visual
self.num_vector = num_vector
self.vis_obs_size = vis_obs_size
self.vec_obs_size = vec_obs_size
[(OBS_SIZE,)], action_type, (2,) if use_discrete else 1
self._make_obs_spec(), action_type, (2,) if use_discrete else 1
)
self.names = brain_names
self.position: Dict[str, float] = {}

self.action[name] = None
self.step_result[name] = None
def _make_obs_spec(self) -> List[Any]:
obs_spec: List[Any] = []
for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
for _ in range(self.num_visual):
obs_spec.append(self.vis_obs_size)
return obs_spec
def _make_obs(self, value: float) -> List[np.ndarray]:
obs = []
for _ in range(self.num_vector):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value)
for _ in range(self.num_visual):
obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs
def get_agent_groups(self):
return self.names

def _make_batched_step(
self, name: str, done: bool, reward: float
) -> BatchedStepResult:
m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]]
m_vector_obs = self._make_obs(self.goal[name])
m_reward = np.array([reward], dtype=np.float32)
m_done = np.array([done], dtype=np.bool)
m_agent_id = np.array([0], dtype=np.int32)

recurrent_obs_val = (
self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
)
m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * recurrent_obs_val]
m_vector_obs = self._make_obs(recurrent_obs_val)
m_reward = np.array([reward], dtype=np.float32)
m_done = np.array([done], dtype=np.bool)
m_agent_id = np.array([0], dtype=np.int32)
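(For reference, not part of this commit: a self-contained sketch of what the new observation helpers produce, assuming one vector and one visual observation per agent and the constants defined above.)

import numpy as np

OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)

def make_obs(value, num_vector=1, num_visual=1,
             vec_obs_size=OBS_SIZE, vis_obs_size=VIS_OBS_SIZE):
    # Mirrors _make_obs above: one (1, vec_obs_size) array per vector
    # observation and one (1, H, W, C) array per visual observation,
    # all filled with the same value.
    obs = []
    for _ in range(num_vector):
        obs.append(np.ones((1, vec_obs_size), dtype=np.float32) * value)
    for _ in range(num_visual):
        obs.append(np.ones((1,) + vis_obs_size, dtype=np.float32) * value)
    return obs

print([o.shape for o in make_obs(0.5)])  # [(1, 1), (1, 20, 20, 3)]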

ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (4 changes)


from mlagents.trainers.meta_curriculum import MetaCurriculum
import json
import yaml
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME

env = Simple1DEnvironment([BRAIN_NAME], use_discrete=False)
curriculum_config = json.loads(dummy_curriculum_json_str)
mc = MetaCurriculum({curriculum_brain_name: curriculum_config})
trainer_config = yaml.safe_load(TRAINER_CONFIG)
env, TRAINER_CONFIG, meta_curriculum=mc, success_threshold=None
env, trainer_config, meta_curriculum=mc, success_threshold=None
)
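(Not part of this commit: the change above passes a dict parsed with yaml.safe_load to _check_environment_trains instead of the raw YAML string. A quick illustration with a made-up minimal config; the brain name and values here are stand-ins.)

import yaml

config_str = """
Simple:
    trainer: ppo
    batch_size: 16
"""
trainer_config = yaml.safe_load(config_str)
print(trainer_config["Simple"]["batch_size"])  # prints 16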

ml-agents/mlagents/trainers/tests/test_simple_rl.py (245 changes)


import pytest
import yaml
import numpy as np
from typing import Dict
from typing import Dict, Any
from mlagents.trainers.tests.simple_test_envs import (
Simple1DEnvironment,

learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 1500
memory_size: 256
memory_size: 16
normalize: false
num_epoch: 3
num_layers: 1

gamma: 0.99
"""
PPO_CONFIG_RECURRENT = f"""
{BRAIN_NAME}:
trainer: ppo
batch_size: 64
beta: 5.0e-3
buffer_size: 128
epsilon: 0.2
hidden_units: 32
lambd: 0.95
learning_rate: 5.0e-3
max_steps: 3000
memory_size: 16
normalize: false
learning_rate_schedule: constant
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 32
summary_freq: 500
use_recurrent: true
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
SAC_CONFIG = f"""
{BRAIN_NAME}:
trainer: sac

init_entcoef: 0.01
learning_rate: 5.0e-3
max_steps: 1000
memory_size: 256
normalize: false
num_update: 1
train_interval: 1
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 100
tau: 0.01
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
SAC_CONFIG_RECURRENT = f"""
{BRAIN_NAME}:
trainer: sac
batch_size: 32
buffer_size: 500
buffer_init_steps: 100
hidden_units: 16
init_entcoef: 0.01
learning_rate: 5.0e-3
max_steps: 1000
memory_size: 16
normalize: false
num_update: 1

sequence_length: 32
summary_freq: 100
tau: 0.01
use_recurrent: true
use_recurrent: false
curiosity_enc_size: 128
demo_path: None
vis_encode_type: simple

gamma: 0.99
"""
GHOST_CONFIG_PASS = f"""
{BRAIN_NAME}:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 32
lambd: 0.95
learning_rate: 5.0e-3
max_steps: 2500
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 500
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
play_against_current_self_ratio: 1.0
save_steps: 2000
swap_steps: 2000
"""
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
GHOST_CONFIG_FAIL = f"""
{BRAIN_NAME}:
trainer: ppo
batch_size: 16
beta: 5.0e-3
buffer_size: 64
epsilon: 0.2
hidden_units: 32
lambd: 0.95
learning_rate: 5.0e-3
max_steps: 2500
memory_size: 256
normalize: false
num_epoch: 3
num_layers: 1
time_horizon: 64
sequence_length: 64
summary_freq: 500
use_recurrent: false
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
self_play:
play_against_current_self_ratio: 1.0
save_steps: 2000
swap_steps: 4000
"""
def generate_config(
config: str, override_vals: Dict[str, Any] = None
) -> Dict[str, Any]:
trainer_config = yaml.safe_load(config)
if override_vals is not None:
trainer_config[BRAIN_NAME].update(override_vals)
return trainer_config
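(Aside, not part of this commit: generate_config applies overrides with a shallow dict.update on the brain's entry, so nested keys such as self_play or reward_signals are replaced wholesale rather than merged. A standalone illustration; the keys are stand-ins.)

base = {"Brain": {"max_steps": 1500, "use_recurrent": False}}
override = {"max_steps": 3000, "use_recurrent": True}
base["Brain"].update(override)
print(base["Brain"])  # {'max_steps': 3000, 'use_recurrent': True}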
# The reward processor is passed as an argument to _check_environment_trains.

def _check_environment_trains(
env,
config,
trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.99,

StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
trainer_config = yaml.safe_load(config)
env_manager = SimpleEnvManager(env, FloatPropertiesChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,

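(Aside, not part of this commit: the reward_processor argument above reduces each agent's final rewards to a single score that is compared against success_threshold. The sketch below is an illustration of that idea, not necessarily the repo's default_reward_processor.)

import numpy as np

def mean_of_last_rewards(rewards, last_n: int = 5) -> float:
    # Reduce a sequence of episode rewards to one score, here the mean
    # of the most recent few, so early exploration does not dominate.
    return float(np.mean(rewards[-last_n:]))

print(mean_of_last_rewards([0.0, 0.5, 1.0, 1.0, 1.0, 1.0]))  # 0.9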
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
_check_environment_trains(env, PPO_CONFIG)
config = generate_config(PPO_CONFIG)
_check_environment_trains(env, config)
def test_recurrent_ppo(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
_check_environment_trains(env, PPO_CONFIG_RECURRENT)
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_ppo(num_visual, use_discrete):
env = Simple1DEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0
)
override_vals = {"learning_rate": 3.0e-4}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = Simple1DEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"max_steps": 500,
"summary_freq": 100,
}
config = generate_config(PPO_CONFIG, override_vals)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, config, success_threshold=0.9)
def test_recurrent_sac(use_discrete):
def test_recurrent_ppo(use_discrete):
_check_environment_trains(env, SAC_CONFIG_RECURRENT)
override_vals = {
"max_steps": 3000,
"batch_size": 64,
"buffer_size": 128,
"use_recurrent": True,
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
_check_environment_trains(env, SAC_CONFIG)
config = generate_config(SAC_CONFIG)
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
env = Simple1DEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0
)
override_vals = {"batch_size": 16, "learning_rate": 3e-4}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = Simple1DEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(36, 36, 3),
)
override_vals = {
"batch_size": 16,
"learning_rate": 3.0e-4,
"vis_encode_type": vis_encode_type,
"buffer_init_steps": 0,
"max_steps": 100,
}
config = generate_config(SAC_CONFIG, override_vals)
# The number of steps is pretty small for these encoders
_check_environment_trains(env, config, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {"batch_size": 32, "use_recurrent": True}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])

)
_check_environment_trains(env, GHOST_CONFIG_PASS)
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 2000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config)
@pytest.mark.parametrize("use_discrete", [True, False])

)
_check_environment_trains(env, GHOST_CONFIG_FAIL, success_threshold=None)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
override_vals = {
"max_steps": 2500,
"self_play": {
"play_against_current_self_ratio": 1.0,
"save_steps": 2000,
"swap_steps": 4000,
},
}
config = generate_config(PPO_CONFIG, override_vals)
_check_environment_trains(env, config, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
