浏览代码

Very simple environment for testing (#2266)

* WIP doesn't crash

* return stats and assert convergence

* pass lint checks

* rename

* fix-reset-params

* add time penalty

* _get_measure_vals always returns something

* fix tests

* unused import

* single env, fix double step

* move LocalEnvManager to ml-agents-envs

* move and rename EnvManager

* remove obsolete docstring and method

* clean up
/develop-generalizationTraining-TrainerController
GitHub 5 年前
当前提交
19283bfa
共有 9 个文件被更改,包括 296 次插入和 32 次删除
  1. 6
      ml-agents-envs/mlagents/envs/base_unity_environment.py
  2. 1
      ml-agents-envs/mlagents/envs/env_manager.py
  3. 6
      ml-agents/mlagents/trainers/tests/test_trainer_controller.py
  4. 4
      ml-agents/mlagents/trainers/trainer.py
  5. 55
      ml-agents/mlagents/trainers/trainer_controller.py
  6. 81
      ml-agents-envs/mlagents/envs/simple_env_manager.py
  7. 0
      ml-agents/mlagents/trainers/tests/test_environments/__init__.py
  8. 175
      ml-agents/mlagents/trainers/tests/test_environments/test_simple.py

6
ml-agents-envs/mlagents/envs/base_unity_environment.py


pass
@abstractmethod
def reset(self, config=None, train_mode=True) -> AllBrainInfo:
def reset(
self, config=None, train_mode=True, custom_reset_parameters=None
) -> AllBrainInfo:
pass
@property

@property
@abstractmethod
def reset_parameters(self) -> Dict[str, str]:
def reset_parameters(self) -> Dict[str, float]:
pass
@abstractmethod

1
ml-agents-envs/mlagents/envs/env_manager.py


def reset(self, config=None, train_mode=True) -> List[StepInfo]:
pass
@property
@abstractmethod
def external_brains(self) -> Dict[str, BrainParameters]:
pass

6
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


):
external_brains = {"testbrain": expected_brain_params}
def mock_constructor(self, brain, trainer_params, training, load, seed, run_id):
def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id):
assert trainer_params == expected_config
assert trainer_parameters == expected_config
assert training == tc.train_model
assert load == tc.load_model
assert seed == tc.seed

def assert_ppo_trainer_constructed(
input_config, tc, expected_brain_params, expected_config, expected_reward_buff_cap=0
input_config, tc, expected_brain_params, expected_config, expected_reward_buff_cap=1
):
external_brains = {"testbrain": expected_brain_params}

4
ml-agents/mlagents/trainers/trainer.py


"""
self.trainer_metrics.write_training_metrics()
def write_summary(self, global_step, delta_train_start, lesson_num=0):
def write_summary(
self, global_step: int, delta_train_start: float, lesson_num: int = 0
) -> None:
"""
Saves training statistics to Tensorboard.
:param delta_train_start: Time elapsed since training started.

55
ml-agents/mlagents/trainers/trainer_controller.py


from mlagents.envs import BrainParameters
from mlagents.envs.env_manager import StepInfo
from mlagents.envs.env_manager import EnvManager
from mlagents.envs.subprocess_env_manager import SubprocessEnvManager
from mlagents.envs.exception import UnityEnvironmentException
from mlagents.envs.timers import hierarchical_timer, get_timer_tree, timed

tf.set_random_seed(self.seed)
def _get_measure_vals(self):
brain_names_to_measure_vals = {}
brain_names_to_measure_vals = {}
for (
brain_name,
curriculum,

elif curriculum.measure == "reward":
measure_val = np.mean(self.trainers[brain_name].reward_buffer)
brain_names_to_measure_vals[brain_name] = measure_val
return brain_names_to_measure_vals
return None
for brain_name, trainer in self.trainers.items():
measure_val = np.mean(trainer.reward_buffer)
brain_names_to_measure_vals[brain_name] = measure_val
return brain_names_to_measure_vals
def _save_model(self, steps=0):
"""

for brain_name in external_brains:
if trainer_parameters_dict[brain_name]["trainer"] == "offline_bc":
self.trainers[brain_name] = OfflineBCTrainer(
external_brains[brain_name],
trainer_parameters_dict[brain_name],
self.train_model,
self.load_model,
self.seed,
self.run_id,
brain=external_brains[brain_name],
trainer_parameters=trainer_parameters_dict[brain_name],
training=self.train_model,
load=self.load_model,
seed=self.seed,
run_id=self.run_id,
external_brains[brain_name],
trainer_parameters_dict[brain_name],
self.train_model,
self.load_model,
self.seed,
self.run_id,
brain=external_brains[brain_name],
trainer_parameters=trainer_parameters_dict[brain_name],
training=self.train_model,
load=self.load_model,
seed=self.seed,
run_id=self.run_id,
external_brains[brain_name],
self.meta_curriculum.brains_to_curriculums[
brain=external_brains[brain_name],
reward_buff_cap=self.meta_curriculum.brains_to_curriculums[
else 0,
trainer_parameters_dict[brain_name],
self.train_model,
self.load_model,
self.seed,
self.run_id,
else 1,
trainer_parameters=trainer_parameters_dict[brain_name],
training=self.train_model,
load=self.load_model,
seed=self.seed,
run_id=self.run_id,
)
self.trainer_metrics[brain_name] = self.trainers[
brain_name

"permissions are set correctly.".format(model_path)
)
def _reset_env(self, env: SubprocessEnvManager) -> List[StepInfo]:
def _reset_env(self, env: EnvManager) -> List[StepInfo]:
"""Resets the environment.
Returns:

trainer.write_summary(global_step, delta_train_start)
def start_learning(
self, env_manager: SubprocessEnvManager, trainer_config: Dict[str, Any]
self, env_manager: EnvManager, trainer_config: Dict[str, Any]
) -> None:
# TODO: Should be able to start learning at different lesson numbers
# for each curriculum.

self._write_timing_tree()
@timed
def advance(self, env: SubprocessEnvManager) -> int:
def advance(self, env: EnvManager) -> int:
if self.meta_curriculum:
# Get the sizes of the reward buffers.
reward_buff_sizes = {

81
ml-agents-envs/mlagents/envs/simple_env_manager.py


from typing import Any, Dict, List
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs.env_manager import EnvManager, StepInfo
from mlagents.envs.timers import timed
from mlagents.envs import ActionInfo, BrainParameters
class SimpleEnvManager(EnvManager):
    """
    Minimal EnvManager implementation that wraps exactly one BaseUnityEnvironment.
    Mostly useful for tests; SubprocessEnvManager is the production-quality implementation.
    """

    def __init__(self, env: BaseUnityEnvironment):
        """
        :param env: The single environment that this manager steps and resets.
        """
        super().__init__()
        self.env = env
        # Placeholder step with no observations; reset() replaces it with real data.
        self.previous_step: StepInfo = StepInfo(None, {}, None)
        self.previous_all_action_info: Dict[str, ActionInfo] = {}

    def step(self) -> List[StepInfo]:
        """
        Chooses actions from the previous step's observations, advances (or resets)
        the wrapped environment once, and records the outcome as the new previous step.
        :return: A one-element list holding the StepInfo for this step.
        """
        chosen_actions = self._take_step(self.previous_step)
        self.previous_all_action_info = chosen_actions

        if not self.env.global_done:
            # Split the per-brain ActionInfo objects into the four per-brain
            # dictionaries that the environment's step() takes.
            new_brain_info = self.env.step(
                {name: info.action for name, info in chosen_actions.items()},
                {name: info.memory for name, info in chosen_actions.items()},
                {name: info.text for name, info in chosen_actions.items()},
                {name: info.value for name, info in chosen_actions.items()},
            )
        else:
            new_brain_info = self.env.reset()

        self.previous_step = StepInfo(
            self.previous_step.current_all_brain_info,
            new_brain_info,
            self.previous_all_action_info,
        )
        return [self.previous_step]

    def reset(
        self,
        config: Dict[str, float] = None,
        train_mode: bool = True,
        custom_reset_parameters: Any = None,
    ) -> List[StepInfo]:  # type: ignore
        """
        Resets the wrapped environment, forwarding every argument unchanged, and
        starts a fresh step history from the observations it returns.
        :return: A one-element list holding the initial StepInfo.
        """
        initial_brain_info = self.env.reset(
            config=config,
            train_mode=train_mode,
            custom_reset_parameters=custom_reset_parameters,
        )
        self.previous_step = StepInfo(None, initial_brain_info, None)
        return [self.previous_step]

    @property
    def external_brains(self) -> Dict[str, BrainParameters]:
        """The wrapped environment's external brains."""
        return self.env.external_brains

    @property
    def reset_parameters(self) -> Dict[str, float]:
        """The wrapped environment's reset parameters."""
        return self.env.reset_parameters

    def close(self):
        """Closes the wrapped environment."""
        self.env.close()

    @timed
    def _take_step(self, last_step: StepInfo) -> Dict[str, ActionInfo]:
        """
        Asks each brain's policy for an action given that brain's latest observations.
        :param last_step: Step whose current_all_brain_info supplies the observations.
        :return: Mapping from brain name to the ActionInfo produced by its policy.
        """
        # NOTE(review): self.policies is not assigned in this class; presumably
        # it is provided by EnvManager - confirm.
        return {
            brain_name: self.policies[brain_name].get_action(brain_info)
            for brain_name, brain_info in last_step.current_all_brain_info.items()
        }

0
ml-agents/mlagents/trainers/tests/test_environments/__init__.py

175
ml-agents/mlagents/trainers/tests/test_environments/test_simple.py


import yaml
import math
import tempfile
from typing import Any, Dict
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.envs.base_unity_environment import BaseUnityEnvironment
from mlagents.envs import BrainInfo, AllBrainInfo, BrainParameters
from mlagents.envs.communicator_objects import AgentInfoProto
from mlagents.envs.simple_env_manager import SimpleEnvManager
# Name of the single brain exposed by the test environment (this module's name).
BRAIN_NAME = __name__
# Number of entries in the agent's vector observation.
OBS_SIZE = 1
# Largest change in position that a single action may cause.
STEP_SIZE = 0.1
# Amount subtracted as reward on every step that does not end the episode.
TIME_PENALTY = 0.001
# NOTE(review): presumably the number of STEP_SIZE moves needed to travel from
# the start (0) to an end of the range, plus one - confirm the intent of the +1.
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
# Magnitude of the reward paid when the agent reaches an end of the range.
# NOTE(review): looks sized so that time penalties accumulated over MIN_STEPS
# steps are offset by the final reward - confirm.
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY
def clamp(x, min_val, max_val):
    """Return x limited to the range [min_val, max_val]."""
    # Apply the upper bound first, then the lower bound.
    capped = max_val if max_val < x else x
    return capped if capped > min_val else min_val
class Simple1DEnvironment(BaseUnityEnvironment):
    """
    Very simple "game" for exercising the training loop.

    The single agent has a position on [-1, 1] and starts at 0. On every step the
    action is clamped to [-STEP_SIZE, STEP_SIZE] and added to the position.
    Reaching either end of the range finishes the episode with a reward of
    SUCCESS_REWARD * position (positive at +1, negative at -1); every other step
    is rewarded with -TIME_PENALTY.
    """

    def __init__(self):
        self._brains: Dict[str, BrainParameters] = {}
        self._brains[BRAIN_NAME] = BrainParameters(
            brain_name=BRAIN_NAME,
            vector_observation_space_size=OBS_SIZE,
            num_stacked_vector_observations=1,
            camera_resolutions=[],
            vector_action_space_size=[1],
            vector_action_descriptions=["moveDirection"],
            vector_action_space_type=1,  # "continuous"
        )

        # state
        self.position = 0.0
        self.step_count = 0

    def step(
        self,
        vector_action: Dict[str, Any] = None,
        memory: Dict[str, Any] = None,
        text_action: Dict[str, Any] = None,
        value: Dict[str, Any] = None,
    ) -> AllBrainInfo:
        """
        Moves the agent by the (clamped) action and reports the outcome.

        :param vector_action: Per-brain actions; the move is read from
            vector_action[BRAIN_NAME][0][0]. Must not be None.
        :param memory: Unused.
        :param text_action: Unused.
        :param value: Unused.
        :return: The observation, reward and done flag for the single agent.
        """
        assert vector_action is not None

        delta = clamp(vector_action[BRAIN_NAME][0][0], -STEP_SIZE, STEP_SIZE)
        self.position = clamp(self.position + delta, -1, 1)
        self.step_count += 1

        done = self.position >= 1.0 or self.position <= -1.0
        # The sign of the final position decides whether the terminal reward
        # is a success (+) or a failure (-).
        reward = SUCCESS_REWARD * self.position if done else -TIME_PENALTY

        # Build the agent info before resetting so the terminal observation
        # still carries the final position.
        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.position] * OBS_SIZE,
            reward=reward,
            done=done,
        )
        if done:
            self._reset_agent()
        return self._all_brain_info(agent_info)

    def _reset_agent(self):
        """Puts the agent back at the start position and clears the step counter."""
        self.position = 0.0
        self.step_count = 0

    def _all_brain_info(self, agent_info: AgentInfoProto) -> AllBrainInfo:
        """Wraps one agent's info as the AllBrainInfo mapping for the only brain."""
        return {
            BRAIN_NAME: BrainInfo.from_agent_proto(
                0, [agent_info], self._brains[BRAIN_NAME]
            )
        }

    def reset(
        self,
        config: Dict[str, float] = None,
        train_mode: bool = True,
        custom_reset_parameters: Any = None,
    ) -> AllBrainInfo:  # type: ignore
        """
        Restarts the episode and returns the initial observation.
        All three arguments are accepted for interface compatibility and ignored.
        """
        self._reset_agent()
        agent_info = AgentInfoProto(
            stacked_vector_observation=[self.position] * OBS_SIZE,
            done=False,
            max_step_reached=False,
        )
        return self._all_brain_info(agent_info)

    @property
    def global_done(self) -> bool:
        """Always False: finished episodes are restarted inside step()."""
        return False

    @property
    def external_brains(self) -> Dict[str, BrainParameters]:
        """Mapping from brain name to parameters for the single brain."""
        return self._brains

    @property
    def reset_parameters(self) -> Dict[str, float]:
        """This environment has no reset parameters."""
        return {}

    def close(self):
        """Nothing to release."""
        pass
def test_simple():
    """
    Trains PPO on Simple1DEnvironment through TrainerController and checks that
    the mean reward reported for every brain ends up above 0.99.
    """
    # NOTE(review): the nesting of this YAML document was restored by hand.
    # `vis_encode_type` is assumed to be a trainer-level key under `default`
    # rather than part of the `extrinsic` reward signal - confirm.
    config = """
        default:
            trainer: ppo
            batch_size: 16
            beta: 5.0e-3
            buffer_size: 64
            epsilon: 0.2
            hidden_units: 128
            lambd: 0.95
            learning_rate: 5.0e-3
            max_steps: 2500
            memory_size: 256
            normalize: false
            num_epoch: 3
            num_layers: 2
            time_horizon: 64
            sequence_length: 64
            summary_freq: 500
            use_recurrent: false
            reward_signals:
                extrinsic:
                    strength: 1.0
                    gamma: 0.99
            vis_encode_type: default
    """
    # Create controller and begin training.
    with tempfile.TemporaryDirectory() as tmp_dir:
        run_id = "id"
        save_freq = 99999
        tc = TrainerController(
            tmp_dir,
            tmp_dir,
            run_id,
            save_freq,
            meta_curriculum=None,
            load=False,
            train=True,
            keep_checkpoints=1,
            lesson=None,
            training_seed=1337,
            fast_simulation=True,
        )

        # Begin training
        env = Simple1DEnvironment()
        env_manager = SimpleEnvManager(env)
        trainer_config = yaml.safe_load(config)
        tc.start_learning(env_manager, trainer_config)

        measure_vals = tc._get_measure_vals()
        # Guard against a vacuous pass when no trainer reported a value.
        assert measure_vals
        for mean_reward in measure_vals.values():
            assert not math.isnan(mean_reward)
            assert mean_reward > 0.99
正在加载...
取消
保存