
[tests] LSTM end-to-end tests (#3544)

GitHub · 5 years ago
Current commit: 323f104c
2 files changed, with 161 insertions and 45 deletions
1. ml-agents/mlagents/trainers/tests/simple_test_envs.py (98 changes)
2. ml-agents/mlagents/trainers/tests/test_simple_rl.py (108 changes)

ml-agents/mlagents/trainers/tests/simple_test_envs.py (98 changes)


    it reaches -1. The position is incremented by the action amount (clamped to
    [-step_size, step_size]).
    """

    def __init__(self, brain_names, use_discrete, step_size=STEP_SIZE):
        super().__init__()
        self.discrete = use_discrete
        action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS

        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, BatchedStepResult] = {}
        self.step_size = step_size  # defines the difficulty of the test
        for name in self.names:
            self.goal[name] = self.random.choice([-1, 1])

    def get_step_result(self, name):
        return self.step_result[name]

    def _take_action(self, name: str) -> bool:
        if self.discrete:
            act = self.action[name][0][0]
            delta = 1 if act else -1
        else:
            delta = self.action[name][0][0]
        delta = clamp(delta, -self.step_size, self.step_size)
        self.position[name] += delta
        self.position[name] = clamp(self.position[name], -1, 1)
        self.step_count[name] += 1
        done = self.position[name] >= 1.0 or self.position[name] <= -1.0
        return done

    def _compute_reward(self, name: str, done: bool) -> float:
        if done:
            reward = SUCCESS_REWARD * self.position[name] * self.goal[name]
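            # E.g. ending at position +1 with goal +1 gives +SUCCESS_REWARD,
            # while ending at position +1 with goal -1 gives -SUCCESS_REWARD.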
        else:
            reward = -TIME_PENALTY
        return reward

    def _make_batched_step(
        self, name: str, done: bool, reward: float
    ) -> BatchedStepResult:
        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal[name]]
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()
        return BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )

        done = self._take_action(name)
        reward = self._compute_reward(name, done)
        self.step_result[name] = self._make_batched_step(name, done, reward)
    def _generate_mask(self):
        if self.discrete:
            # LL-Python API will return an empty dim if there is only 1 agent.

    def reset(self) -> None:  # type: ignore
        for name in self.names:
            self._reset_agent(name)
            self.step_result[name] = self._make_batched_step(name, False, 0.0)
    @property
    def reset_parameters(self) -> Dict[str, str]:

        pass

class Memory1DEnvironment(Simple1DEnvironment):
    def __init__(self, brain_names, use_discrete, step_size=0.2):
        super().__init__(brain_names, use_discrete, step_size=0.2)
        # Number of steps to reveal the goal for. Lower is harder. Should be
        # less than 1/step_size to force agent to use memory.
        self.num_show_steps = 2
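        # With step_size=0.2 an episode needs at least 1/0.2 = 5 steps to reach
        # a goal, so revealing the goal for only the first 2 steps means the
        # policy has to remember it for the rest of the episode.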

    def _make_batched_step(
        self, name: str, done: bool, reward: float
    ) -> BatchedStepResult:
        recurrent_obs_val = (
            self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
        )
        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * recurrent_obs_val]
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=np.bool)
        m_agent_id = np.array([0], dtype=np.int32)
        action_mask = self._generate_mask()
        return BatchedStepResult(
            m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
        )
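
For intuition, here is a minimal standalone sketch (not part of the diff; the loop and variable names are illustrative only) of the observation value this override produces when the goal is +1 and num_show_steps is 2:

num_show_steps = 2
goal = 1  # chosen at reset; may also be -1
for step_count in range(1, 7):
    obs_val = goal if step_count <= num_show_steps else 0
    print(step_count, obs_val)  # steps 1 and 2 print 1, steps 3 through 6 print 0

After the first two steps the observation no longer carries the goal, so only a policy with memory, such as the LSTM under test, can keep moving toward the correct end of the line.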

ml-agents/mlagents/trainers/tests/test_simple_rl.py (108 changes)


import pytest
import yaml
import numpy as np
from typing import Dict

from mlagents.trainers.tests.simple_test_envs import (
    Simple1DEnvironment,
    Memory1DEnvironment,
)
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
from mlagents_envs.side_channel.float_properties_channel import FloatPropertiesChannel

BRAIN_NAME = "1D"

        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 32
        learning_rate_schedule: constant
        max_steps: 1500
        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500

                gamma: 0.99
    """

PPO_CONFIG_RECURRENT = f"""
    {BRAIN_NAME}:
        trainer: ppo
        batch_size: 64
        beta: 5.0e-3
        buffer_size: 128
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 3000
        memory_size: 16
        normalize: false
        learning_rate_schedule: constant
        num_epoch: 3
        num_layers: 1
        time_horizon: 64
        sequence_length: 32
        summary_freq: 500
        use_recurrent: true
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """

SAC_CONFIG = f"""
    {BRAIN_NAME}:
        trainer: sac

                gamma: 0.99
    """

SAC_CONFIG_RECURRENT = f"""
    {BRAIN_NAME}:
        trainer: sac
        batch_size: 32
        buffer_size: 500
        buffer_init_steps: 100
        hidden_units: 16
        init_entcoef: 0.01
        learning_rate: 5.0e-3
        max_steps: 1000
        memory_size: 16
        normalize: false
        num_update: 1
        train_interval: 1
        num_layers: 1
        time_horizon: 64
        sequence_length: 32
        summary_freq: 100
        tau: 0.01
        use_recurrent: true
        curiosity_enc_size: 128
        demo_path: None
        vis_encode_type: simple
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
    """

GHOST_CONFIG_PASS = f"""
    {BRAIN_NAME}:
        trainer: ppo

        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500

        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500

        beta: 5.0e-3
        buffer_size: 64
        epsilon: 0.2
        hidden_units: 32
        lambd: 0.95
        learning_rate: 5.0e-3
        max_steps: 2500

        num_layers: 1
        time_horizon: 64
        sequence_length: 64
        summary_freq: 500

    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()

class DebugWriter(StatsWriter):
    """
    Print to stdout so stats can be viewed in pytest
    """

    def write_stats(
        self, category: str, values: Dict[str, StatsSummary], step: int
    ) -> None:
        for val, stats_summary in values.items():
            if val == "Environment/Cumulative Reward":
                print(step, val, stats_summary.mean)

    def write_text(self, category: str, text: str, step: int) -> None:
        pass

def _check_environment_trains(
    env,
    config,

    save_freq = 99999
    seed = 1337
    StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
    debug_writer = DebugWriter()
    StatsReporter.add_writer(debug_writer)
    trainer_config = yaml.safe_load(config)
    env_manager = SimpleEnvManager(env, FloatPropertiesChannel())
    trainer_factory = TrainerFactory(

def test_simple_ppo(use_discrete):
    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    _check_environment_trains(env, PPO_CONFIG)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
    env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    _check_environment_trains(env, PPO_CONFIG_RECURRENT)


@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
    env = Memory1DEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    _check_environment_trains(env, SAC_CONFIG_RECURRENT)


@pytest.mark.parametrize("use_discrete", [True, False])
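
The new cases run as ordinary pytest tests. For example (an illustrative command, assuming pytest is installed in the ml-agents development environment), the recurrent tests can be selected by name:

pytest ml-agents/mlagents/trainers/tests/test_simple_rl.py -k recurrent -s

The -s flag disables output capture so DebugWriter's cumulative-reward prints are visible while the policies train.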
