Develop model transfer test (#4214)

* test env, and code integration
* delete results/develop/bisim-review

GitHub
4 years ago
Current commit 839eb2cb
6 changed files with 560 additions and 80 deletions
Changed files:

- ml-agents/mlagents/trainers/policy/transfer_policy.py (29 lines changed)
- ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (136 lines changed)
- ml-agents/mlagents/trainers/ppo_transfer/trainer.py (4 lines changed)
- ml-agents/mlagents/trainers/settings.py (10 lines changed)
- ml-agents/mlagents/trainers/tests/test_simple_transfer.py (223 lines changed)
- ml-agents/mlagents/trainers/tests/transfer_test_envs.py (238 lines changed)

ml-agents/mlagents/trainers/tests/test_simple_transfer.py:
import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict

from mlagents.trainers.tests.transfer_test_envs import SimpleTransferEnvironment
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary, TensorboardWriter, CSVWriter
from mlagents.trainers.settings import (
    TrainerSettings,
    PPOSettings,
    PPOTransferSettings,
    SACSettings,
    NetworkSettings,
    SelfPlaySettings,
    BehavioralCloningSettings,
    GAILSettings,
    TrainerType,
    RewardSignalType,
)
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

BRAIN_NAME = "Simple"


PPO_CONFIG = TrainerSettings(
    trainer_type=TrainerType.PPO,
    hyperparameters=PPOSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
    ),
    network_settings=NetworkSettings(num_layers=2, hidden_units=32),
    summary_freq=500,
    max_steps=3000,
    threaded=False,
)

SAC_CONFIG = TrainerSettings(
    trainer_type=TrainerType.SAC,
    hyperparameters=SACSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=8,
        buffer_init_steps=100,
        buffer_size=5000,
        tau=0.01,
        init_entcoef=0.01,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=16),
    summary_freq=100,
    max_steps=1000,
    threaded=False,
)

Transfer_CONFIG = TrainerSettings(
    trainer_type=TrainerType.PPO_Transfer,
    hyperparameters=PPOTransferSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
        feature_size=2,
        reuse_encoder=True,
        in_epoch_alter=True,
        in_batch_alter=False,
        use_op_buffer=True,
        policy_layers=1,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=32),
    summary_freq=500,
    max_steps=3000,
    threaded=False,
)

# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains.
# Default is the average over the last 5 final rewards.
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
    return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
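
# Illustrative sketch (not part of the original file): a custom processor, e.g. one
# that uses the median of the last few episode rewards, would be defined inside a
# test and passed via the reward_processor argument of _check_environment_trains:
#
#     def median_reward_processor(rewards, last_n_rewards=5):
#         return float(np.median(np.array(rewards[-last_n_rewards:], dtype=np.float32)))
#
#     _check_environment_trains(
#         env, {BRAIN_NAME: config}, reward_processor=median_reward_processor
#     )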


class DebugWriter(StatsWriter):
    """
    Print to stdout so stats can be viewed in pytest
    """

    def __init__(self):
        self._last_reward_summary: Dict[str, float] = {}
        self.stats = {}

    def get_last_rewards(self):
        return self._last_reward_summary

    def write_stats(
        self, category: str, values: Dict[str, StatsSummary], step: int
    ) -> None:
        for val, stats_summary in values.items():
            if val == "Environment/Cumulative Reward":
                print(step, val, stats_summary.mean)
                self.stats[step] = stats_summary.mean
                self._last_reward_summary[category] = stats_summary.mean

    def write2file(self, filename):
        with open(filename, "w") as reward_file:
            for step in self.stats.keys():
                reward_file.write(str(step) + ":" + str(self.stats[step]) + "\n")


def _check_environment_trains(
    env,
    trainer_config,
    reward_processor=default_reward_processor,
    meta_curriculum=None,
    success_threshold=0.9,
    env_manager=None,
    run_id="id",
    seed=1337,
):
    # Create controller and begin training.
    model_dir = "./transfer_results/" + run_id
    StatsReporter.writers.clear()  # Clear StatsReporters so we don't write to file
    debug_writer = DebugWriter()
    StatsReporter.add_writer(debug_writer)

    csv_writer = CSVWriter(
        model_dir,
        required_fields=[
            "Environment/Cumulative Reward",
            "Environment/Episode Length",
        ],
    )
    tb_writer = TensorboardWriter(
        model_dir, clear_past_data=True
    )
    StatsReporter.add_writer(tb_writer)
    StatsReporter.add_writer(csv_writer)

    if env_manager is None:
        env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
    trainer_factory = TrainerFactory(
        trainer_config=trainer_config,
        output_path=model_dir,
        train_model=True,
        load_model=False,
        seed=seed,
        meta_curriculum=meta_curriculum,
        multi_gpu=False,
    )

    tc = TrainerController(
        trainer_factory=trainer_factory,
        output_path=model_dir,
        run_id=run_id,
        meta_curriculum=meta_curriculum,
        train=True,
        training_seed=seed,
    )

    # Begin training
    tc.start_learning(env_manager)
    # debug_writer.write2file(model_dir+"/reward.txt")

    if (
        success_threshold is not None
    ):  # For tests where we are just checking setup and not reward
        processed_rewards = [
            reward_processor(rewards) for rewards in env.final_rewards.values()
        ]
        assert all(not math.isnan(reward) for reward in processed_rewards)
        assert all(reward > success_threshold for reward in processed_rewards)


def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich", run_id="modelbased_rich_5e-4", seed=1337):
    env = SimpleTransferEnvironment(
        [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
    )
    new_hyperparams = attr.evolve(
        config.hyperparameters, batch_size=64, buffer_size=640, learning_rate=5.0e-4,
    )
    config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
    _check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)


def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich", run_id="transfer_rich_iealter_retrain-enc_5e-4", seed=1337):
    env = SimpleTransferEnvironment(
        [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
    )
    new_hyperparams = attr.evolve(
        config.hyperparameters, batch_size=64, buffer_size=640, use_transfer=True,
        transfer_path="./transfer_results/modelbased_normal_opbuf_ibalter_s2/Simple",
        use_op_buffer=True, in_epoch_alter=True, learning_rate=5.0e-4, train_policy=False,
        train_value=False, train_model=False,
    )
    config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
    _check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)


if __name__ == "__main__":
    # test_2d_model(seed=0)
    test_2d_transfer(seed=0)
    # for i in range(5):
    #     test_2d_model(seed=i)

ml-agents/mlagents/trainers/tests/transfer_test_envs.py:

import random
from typing import Dict, List, Any, Tuple
import numpy as np

from mlagents_envs.base_env import (
    BaseEnv,
    BehaviorSpec,
    DecisionSteps,
    TerminalSteps,
    ActionType,
    BehaviorMapping,
)
from mlagents_envs.tests.test_rpc_utils import proto_from_steps_and_action
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
    AgentInfoActionPairProto,
)

OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1

TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY
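# Note (added for clarity, not in the original file): with the defaults above,
# MIN_STEPS = int(1.0 / 0.1) + 1 = 11 and SUCCESS_REWARD = 1.0 + 11 * 0.01 = 1.11,
# so the terminal reward more than offsets the TIME_PENALTY accumulated over the
# shortest possible successful episode.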


def clamp(x, min_val, max_val):
    return max(min_val, min(x, max_val))


class SimpleTransferEnvironment(BaseEnv):
    """
    Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
    it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
    """

    def __init__(
        self,
        brain_names,
        use_discrete,
        step_size=STEP_SIZE,
        num_visual=0,
        num_vector=1,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        action_size=1,
        obs_spec_type="normal",  # normal: (x,y); rich: (x+y, x-y, x*y)
    ):
        super().__init__()
        self.discrete = use_discrete
        self.num_visual = num_visual
        self.num_vector = num_vector
        self.vis_obs_size = vis_obs_size
        self.vec_obs_size = vec_obs_size
        self.obs_spec_type = obs_spec_type
        action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
        self.behavior_spec = BehaviorSpec(
            self._make_obs_spec(),
            action_type,
            tuple(2 for _ in range(action_size)) if use_discrete else action_size,
        )
        self.action_size = action_size
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}
        self.random = random.Random(str(self.behavior_spec))
        self.goal: Dict[str, int] = {}
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
        self.agent_id: Dict[str, int] = {}
        self.step_size = step_size  # defines the difficulty of the test

        for name in self.names:
            self.agent_id[name] = 0
            self.goal[name] = self.random.choice([-1, 1])
            self.rewards[name] = 0
            self.final_rewards[name] = []
            self._reset_agent(name)
            self.action[name] = None
            self.step_result[name] = None

    def _make_obs_spec(self) -> List[Any]:
        obs_spec: List[Any] = []
        # goal
        for _ in range(self.num_vector):
            obs_spec.append((self.vec_obs_size,))
        for _ in range(self.num_visual):
            obs_spec.append(self.vis_obs_size)
        # position
        if self.obs_spec_type == "normal":
            for _ in range(self.num_vector):
                obs_spec.append((self.vec_obs_size,))
        # composed position
        if self.obs_spec_type == "rich":
            for _ in range(self.num_vector + 1):
                obs_spec.append((self.vec_obs_size,))
        print("obs_spec:", obs_spec)
        return obs_spec

    def _make_obs(self, value: float) -> List[np.ndarray]:
        obs = []
        for _ in range(self.num_vector):
            obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value)
        if self.obs_spec_type == "normal":
            for name in self.names:
                for i in self.positions[name]:
                    obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * i)
        elif self.obs_spec_type == "rich":
            for name in self.names:
                i = self.positions[name][0]
                j = self.positions[name][1]
                obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i + j))
                obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i - j))
                obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i * j))
        for _ in range(self.num_visual):
            obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
        return obs

    @property
    def behavior_specs(self):
        behavior_dict = {}
        for n in self.names:
            behavior_dict[n] = self.behavior_spec
        return BehaviorMapping(behavior_dict)

    def set_action_for_agent(self, behavior_name, agent_id, action):
        pass

    def set_actions(self, behavior_name, action):
        self.action[behavior_name] = action

    def get_steps(self, behavior_name):
        return self.step_result[behavior_name]

    def _take_action(self, name: str) -> bool:
        deltas = []
        for _act in self.action[name][0]:
            if self.discrete:
                deltas.append(1 if _act else -1)
            else:
                deltas.append(_act)
        for i, _delta in enumerate(deltas):
            _delta = clamp(_delta, -self.step_size, self.step_size)
            self.positions[name][i] += _delta
            self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
            self.step_count[name] += 1
            # Both must be in 1.0 to be done
            done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
        # print(self.positions)
        return done

    def _generate_mask(self):
        if self.discrete:
            # LL-Python API will return an empty dim if there is only 1 agent.
            ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
            ndmask = np.expand_dims(ndmask, axis=0)
            action_mask = [ndmask]
        else:
            action_mask = None
        return action_mask

    def _compute_reward(self, name: str, done: bool) -> float:
        if done:
            reward = 0.0
            for _pos in self.positions[name]:
                reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
                    self.positions[name]
                )
        else:
            reward = -TIME_PENALTY
        return reward

    def _reset_agent(self, name):
        self.goal[name] = self.random.choice([-1, 1])
        self.positions[name] = [0.0 for _ in range(self.action_size)]
        self.step_count[name] = 0
        self.rewards[name] = 0
        self.agent_id[name] = self.agent_id[name] + 1

    def _make_batched_step(
        self, name: str, done: bool, reward: float
    ) -> Tuple[DecisionSteps, TerminalSteps]:
        m_vector_obs = self._make_obs(self.goal[name])
        m_reward = np.array([reward], dtype=np.float32)
        m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
        action_mask = self._generate_mask()
        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        if done:
            self.final_rewards[name].append(self.rewards[name])
            self._reset_agent(name)
            new_vector_obs = self._make_obs(self.goal[name])
            (
                new_reward,
                new_done,
                new_agent_id,
                new_action_mask,
            ) = self._construct_reset_step(name)

            decision_step = DecisionSteps(
                new_vector_obs, new_reward, new_agent_id, new_action_mask
            )
            terminal_step = TerminalSteps(
                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
            )
        return (decision_step, terminal_step)

    def _construct_reset_step(
        self, name: str
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        new_reward = np.array([0.0], dtype=np.float32)
        new_done = np.array([False], dtype=np.bool)
        new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
        new_action_mask = self._generate_mask()
        return new_reward, new_done, new_agent_id, new_action_mask

    def step(self) -> None:
        assert all(action is not None for action in self.action.values())
        for name in self.names:
            done = self._take_action(name)
            reward = self._compute_reward(name, done)
            self.rewards[name] += reward
            self.step_result[name] = self._make_batched_step(name, done, reward)

    def reset(self) -> None:  # type: ignore
        for name in self.names:
            self._reset_agent(name)
            self.step_result[name] = self._make_batched_step(name, False, 0.0)

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass
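
A minimal usage sketch (not part of the diff) for driving SimpleTransferEnvironment by hand, assuming the file is importable as mlagents.trainers.tests.transfer_test_envs per the file list above; the behavior name and the zero action are arbitrary illustration choices:

import numpy as np
from mlagents.trainers.tests.transfer_test_envs import SimpleTransferEnvironment

# One behavior, continuous actions, two position components, "rich" observations
# (x+y, x-y, x*y), matching the setup used by test_2d_transfer.
env = SimpleTransferEnvironment(
    ["Simple"], use_discrete=False, action_size=2, num_vector=2, obs_spec_type="rich"
)
env.reset()
decision_steps, terminal_steps = env.get_steps("Simple")
print("observation arrays per step:", len(decision_steps.obs))

# Apply a zero action for the single agent and advance the environment one step.
env.set_actions("Simple", np.zeros((1, 2), dtype=np.float32))
env.step()
decision_steps, terminal_steps = env.get_steps("Simple")
print("reward after one step:", decision_steps.reward)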