
Simple1DEnv refactor and additional ghost trainer tests (#3537)

/bug-failed-api-check
GitHub · 5 years ago
Current commit: f469cbb0
3 files changed, with 221 insertions and 147 deletions
  1. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (13 changes)
  2. ml-agents/mlagents/trainers/tests/test_simple_rl.py (218 changes)
  3. ml-agents/mlagents/trainers/tests/simple_test_envs.py (137 changes)

ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (13 changes)


from mlagents.trainers.meta_curriculum import MetaCurriculum
import json
from mlagents.trainers.tests.test_simple_rl import (
    Simple1DEnvironment,
    _check_environment_trains,
    BRAIN_NAME,
)
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents.trainers.tests.test_simple_rl import _check_environment_trains, BRAIN_NAME
from mlagents.trainers.tests.test_curriculum import dummy_curriculum_json_str


@pytest.mark.parametrize("curriculum_brain_name", [BRAIN_NAME, "WrongBrainName"])
def test_simple_metacurriculum(curriculum_brain_name):
    env = Simple1DEnvironment(use_discrete=False)
    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=False)
    _check_environment_trains(env, TRAINER_CONFIG, mc, None)
    _check_environment_trains(
        env, TRAINER_CONFIG, meta_curriculum=mc, success_threshold=None
    )
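
The refactor's effect on callers, in one place (an illustrative sketch, not part of this diff): Simple1DEnvironment now lives in simple_test_envs.py and takes a list of brain names as its first argument, which is why the test above switches to Simple1DEnvironment([BRAIN_NAME], use_discrete=False).

from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment

# Hypothetical standalone check of the new constructor signature.
env = Simple1DEnvironment(["1D"], use_discrete=False)
assert env.get_agent_groups() == ["1D"]
env.reset()
first_step = env.get_step_result("1D")  # one BatchedStepResult per registered brain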

ml-agents/mlagents/trainers/tests/test_simple_rl.py (218 changes)


import math
import random
from typing import Dict
from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment
from mlagents_envs.base_env import (
    BaseEnv,
    AgentGroupSpec,
    BatchedStepResult,
    ActionType,
)
from mlagents.trainers.brain import BrainParameters

BRAIN_NAME = __name__
OBS_SIZE = 1
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY


def clamp(x, min_val, max_val):
    return max(min_val, min(x, max_val))


class Simple1DEnvironment(BaseEnv):
    """
    Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1,
    and a reward of -1 if it reaches -1. The position is incremented by the action amount
    (clamped to [-step_size, step_size]).
    """

    def __init__(self, use_discrete):
        super().__init__()
        self.discrete = use_discrete
        action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
        self.group_spec = AgentGroupSpec(
            [(OBS_SIZE,)], action_type, (2,) if use_discrete else 1
        )
        # state
        self.position = 0.0
        self.step_count = 0
        self.random = random.Random(str(self.group_spec))
        self.goal = self.random.choice([-1, 1])
        self.action = None
        self.step_result = None

    def get_agent_groups(self):
        return [BRAIN_NAME]

    def get_agent_group_spec(self, name):
        return self.group_spec

    def set_action_for_agent(self, name, id, data):
        pass

    def set_actions(self, name, data):
        self.action = data

    def get_step_result(self, name):
        return self.step_result

    def step(self) -> None:
        assert self.action is not None
        if self.discrete:
            act = self.action[0][0]
            delta = 1 if act else -1
        else:
            delta = self.action[0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY
        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()
        if done:
            self._reset_agent()
        self.step_result = BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )

    def _generate_mask(self):
        if self.discrete:
            # LL-Python API will return an empty dim if there is only 1 agent.
            ndmask = np.array(2 * [False], dtype=np.bool)
            ndmask = np.expand_dims(ndmask, axis=0)
            action_mask = [ndmask]
        else:
            action_mask = None
        return action_mask

    def _reset_agent(self):
        self.position = 0.0
        self.step_count = 0
        self.goal = self.random.choice([-1, 1])

    def reset(self) -> None:  # type: ignore
        self._reset_agent()
        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([0], dtype=np.float32)
        m_done = np.array([False], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()
        self.step_result = BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )

    @property
    def external_brains(self) -> Dict[str, BrainParameters]:
        return self._brains

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass

BRAIN_NAME = "1D"

PPO_CONFIG = f"""
    {BRAIN_NAME}:

                gamma: 0.99
    """
GHOST_CONFIG = f"""
GHOST_CONFIG_PASS = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 16
        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500
        memory_size: 256
        normalize: false
        num_epoch: 3
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500
        use_recurrent: false
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
        self_play:
            play_against_current_self_ratio: 1.0
            save_steps: 2000
            swap_steps: 2000
    """

# This config should fail because the ghosted policy is never swapped with a competent policy.
# The swap would only occur after max_steps has already been reached (see the comparison
# after test_simple_ghost_fails below).
GHOST_CONFIG_FAIL = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 16

                strength: 1.0
                gamma: 0.99
        self_play:
            save_step: 1000
            play_against_current_self_ratio: 1.0
            save_steps: 2000
            swap_steps: 4000

# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains.
# Default is average over the last 5 final rewards.
def default_reward_processor(rewards, last_n_rewards=5):
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
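
# Illustration of the hook described above (hypothetical, not part of this diff): a test
# that cares about worst-case episodes could pass its own processor instead of the
# default mean-over-the-last-5.
def min_reward_processor(rewards, last_n_rewards=5):
    # Hypothetical helper: judge a brain by its worst recent episode rather than the mean.
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).min()
# e.g. _check_environment_trains(env, PPO_CONFIG, reward_processor=min_reward_processor)
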
    env, config, meta_curriculum=None, success_threshold=0.99
    env,
    config,
    reward_processor=default_reward_processor,
    meta_curriculum=None,
    success_threshold=0.99,
):
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as dir:

        # Begin training
        tc.start_learning(env_manager)
        print(tc._get_measure_vals())
        for mean_reward in tc._get_measure_vals().values():
            assert not math.isnan(mean_reward)
            assert mean_reward > success_threshold
        processed_rewards = [
            reward_processor(rewards) for rewards in env.final_rewards.values()
        ]
        assert all(not math.isnan(reward) for reward in processed_rewards)
        assert all(reward > success_threshold for reward in processed_rewards)

    env = Simple1DEnvironment(use_discrete=use_discrete)
    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)

    env = Simple1DEnvironment(use_discrete=use_discrete)
    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)

    env = Simple1DEnvironment(use_discrete=use_discrete)
    _check_environment_trains(env, GHOST_CONFIG)
    env = Simple1DEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    _check_environment_trains(env, GHOST_CONFIG_PASS)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
    env = Simple1DEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
    )
    _check_environment_trains(env, GHOST_CONFIG_FAIL, success_threshold=None)
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.99
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )
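
# Reading the two self-play configs side by side (values copied from the strings above;
# these constants are introduced here purely for illustration):
GHOST_PASS_SWAP_STEPS = 2000  # swap_steps in GHOST_CONFIG_PASS
GHOST_FAIL_SWAP_STEPS = 4000  # swap_steps in GHOST_CONFIG_FAIL
MAX_STEPS = 2500  # max_steps in both configs
assert GHOST_PASS_SWAP_STEPS < MAX_STEPS  # the frozen opponent is refreshed during training
assert GHOST_FAIL_SWAP_STEPS > MAX_STEPS  # the swap never happens, so only one team learns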

ml-agents/mlagents/trainers/tests/simple_test_envs.py (137 changes)


import random
from typing import Dict, List

import numpy as np

from mlagents_envs.base_env import (
    BaseEnv,
    AgentGroupSpec,
    BatchedStepResult,
    ActionType,
)

OBS_SIZE = 1
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY
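
# Worked numbers for the constants above (an illustrative note, not part of the file):
# MIN_STEPS = int(1.0 / 0.1) + 1 = 11 and SUCCESS_REWARD = 1.0 + 11 * 0.001 = 1.011.
# A perfect agent reaches the boundary on its 11th step, paying -TIME_PENALTY on the
# 10 steps before it and earning SUCCESS_REWARD on the last one, so its episode return
# is roughly 1.011 - 0.010 = 1.001, just above 1.0 and safely over the 0.99
# success_threshold that _check_environment_trains asserts against.
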
def clamp(x, min_val, max_val):
    return max(min_val, min(x, max_val))


class Simple1DEnvironment(BaseEnv):
    """
    Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1,
    and a reward of -1 if it reaches -1. The position is incremented by the action amount
    (clamped to [-step_size, step_size]).
    """

    def __init__(self, brain_names, use_discrete):
        super().__init__()
        self.discrete = use_discrete
        action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
        self.group_spec = AgentGroupSpec(
            [(OBS_SIZE,)], action_type, (2,) if use_discrete else 1
        )
        self.names = brain_names
        self.position: Dict[str, float] = {}
        self.step_count: Dict[str, float] = {}
        self.random = random.Random(str(self.group_spec))
        self.goal = self.random.choice([-1, 1])
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, BatchedStepResult] = {}
        for name in self.names:
            self.rewards[name] = 0
            self.final_rewards[name] = []
            self._reset_agent(name)
            self.action[name] = None
            self.step_result[name] = None

    def get_agent_groups(self):
        return self.names

    def get_agent_group_spec(self, name):
        return self.group_spec

    def set_action_for_agent(self, name, id, data):
        pass

    def set_actions(self, name, data):
        self.action[name] = data

    def get_step_result(self, name):
        return self.step_result[name]

    def step(self) -> None:
        assert all(action is not None for action in self.action.values())
        for name in self.names:
            if self.discrete:
                act = self.action[name][0][0]
                delta = 1 if act else -1
            else:
                delta = self.action[name][0][0]
            delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
            self.position[name] += delta
            self.position[name] = clamp(self.position[name], -1, 1)
            self.step_count[name] += 1
            done = self.position[name] >= 1.0 or self.position[name] <= -1.0
            if done:
                reward = SUCCESS_REWARD * self.position[name] * self.goal
            else:
                reward = -TIME_PENALTY
            self.rewards[name] += reward
            m_vector_obs = [np.zeros((1, OBS_SIZE), dtype=np.float32)]
            m_reward = np.array([reward], dtype=np.float32)
            m_done = np.array([done], dtype=np.bool)
            m_agent_id = np.array([0], dtype=np.int32)
            action_mask = self._generate_mask()
            if done:
                self._reset_agent(name)
            self.step_result[name] = BatchedStepResult(
                m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
            )

    def _generate_mask(self):
        if self.discrete:
            # LL-Python API will return an empty dim if there is only 1 agent.
            ndmask = np.array(2 * [False], dtype=np.bool)
            ndmask = np.expand_dims(ndmask, axis=0)
            action_mask = [ndmask]
        else:
            action_mask = None
        return action_mask

    def _reset_agent(self, name):
        self.position[name] = 0.0
        self.step_count[name] = 0
        self.final_rewards[name].append(self.rewards[name])
        self.rewards[name] = 0

    def reset(self) -> None:  # type: ignore
        for name in self.names:
            self._reset_agent(name)
            m_vector_obs = [np.zeros((1, OBS_SIZE), dtype=np.float32)]
            m_reward = np.array([0], dtype=np.float32)
            m_done = np.array([False], dtype=np.bool)
            m_agent_id = np.array([0], dtype=np.int32)
            action_mask = self._generate_mask()
            self.step_result[name] = BatchedStepResult(
                m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
            )

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass
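
To make the multi-brain bookkeeping concrete, here is a minimal driving loop (a hypothetical sketch, not part of the new file) that steps two brains the way the ghost tests do and then reads back the per-brain episode returns:

import numpy as np

from mlagents.trainers.tests.simple_test_envs import Simple1DEnvironment

env = Simple1DEnvironment(["1D?team=0", "1D?team=1"], use_discrete=False)
env.reset()
for _ in range(30):
    for name in env.get_agent_groups():
        # Always push toward +1; step() clamps the delta to STEP_SIZE internally.
        env.set_actions(name, np.array([[1.0]], dtype=np.float32))
    env.step()
# final_rewards keeps one list of completed-episode returns per brain name; this is
# exactly what _check_environment_trains feeds to the reward processor.
print(env.final_rewards)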