
Initial commit of running active learning code

Active learning code is running on the Walker variable-speed task. It still needs to be tested to confirm that it works.
/active-variablespeed
Scott Jordan, 4 years ago
Current commit
56745026
10 changed files with 147 additions and 147 deletions
  1. Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (2 changes)
  2. config/ppo/WalkerStaticVariableSpeed.yaml (7 changes)
  3. ml-agents/mlagents/trainers/active_learning.py (5 changes)
  4. ml-agents/mlagents/trainers/active_learning_manager.py (172 changes)
  5. ml-agents/mlagents/trainers/agent_processor.py (47 changes)
  6. ml-agents/mlagents/trainers/learn.py (6 changes)
  7. ml-agents/mlagents/trainers/settings.py (27 changes)
  8. ml-agents/mlagents/trainers/subprocess_env_manager.py (9 changes)
  9. ml-agents/mlagents/trainers/trainer_controller.py (18 changes)
  10. ml-agents/mlagents/trainers/trainer_util.py (1 change)

Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (2 changes)


public void SetResetParameters()
{
    targetWalkingSpeed = GetParameterWithDefault("target_walkingspeed", 10.0f);
    targetWalkingSpeed = GetParameterWithDefault("targetWalkingSpeed", 10.0f);
    // target_headheight = GetParameterWithDefault("target_height", 0.5497f);
    SetTorsoMass();
}

config/ppo/WalkerStaticVariableSpeed.yaml (7 changes)


time_horizon: 1000
summary_freq: 30000
threaded: true
agent_parameters:
  WalkerStaticVariableSpeed:
    targetWalkingSpeed:
      sampler_type: uniform
      sampler_parameters:
        min_value: 0.0
        max_value: 10
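A minimal sketch of how this agent_parameters block can be read into per-parameter bounds on the Python side, using plain PyYAML and a dict instead of the full AgentParameterSettings machinery (variable names here are illustrative):

import yaml  # assumes PyYAML is installed

config_text = """
agent_parameters:
  WalkerStaticVariableSpeed:
    targetWalkingSpeed:
      sampler_type: uniform
      sampler_parameters:
        min_value: 0.0
        max_value: 10
"""

config = yaml.safe_load(config_text)

# Collect (min, max) bounds per behavior and parameter, mirroring how the
# ActiveLearningTaskManager below builds its task_ranges tensor.
bounds = {}
for behavior, params in config["agent_parameters"].items():
    bounds[behavior] = {
        name: (cfg["sampler_parameters"]["min_value"],
               cfg["sampler_parameters"]["max_value"])
        for name, cfg in params.items()
    }

print(bounds)  # {'WalkerStaticVariableSpeed': {'targetWalkingSpeed': (0.0, 10)}}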

ml-agents/mlagents/trainers/active_learning.py (5 changes)


        self.X = self.X[-T:, :]
        self.Y = self.Y[-T:, :]
        if self.X.shape[0] < 5:  # TODO seems to throw an error if only one sample is present. Refitting should probably only happen every N data points anyways
            return None
        if refit:
            model = StandardActiveLearningGP(self.X, self.Y, bounds=self.bounds)
            mll = ExactMarginalLogLikelihood(model.likelihood, model)

            fit_gpytorch_model(mll)
        else:
            self.model.set_train_data(self.X, self.Y)
            # self.model = self.model.condition_on_observations(new_X, new_Y)
            # self.model = self.model.condition_on_observations(new_X, new_Y)  # TODO: might be faster than setting the data, need to test

    def get_design_points(self, num_points: int = 1, time=None):
        if not self.model or time < 30:
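For context, a self-contained sketch of the fit-and-propose cycle this file implements, assuming BoTorch's stock SingleTaskGP in place of the custom StandardActiveLearningGP and an upper-confidence-bound acquisition for choosing design points:

import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.acquisition import UpperConfidenceBound
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

# Toy data: a 1-D task parameter (target walking speed) vs. observed return.
bounds = torch.tensor([[0.0], [10.0]])
X = torch.rand(20, 1) * 10.0
Y = -(X - 6.0).pow(2) + 0.1 * torch.randn(20, 1)

model = SingleTaskGP(X, Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)  # the same fitting helper the snippet above calls on refit

# Propose the next design point(s) by maximizing an acquisition function.
acq = UpperConfidenceBound(model, beta=2.0)
candidates, _ = optimize_acqf(acq, bounds=bounds, q=1, num_restarts=5, raw_samples=32)
print(candidates)  # the next target walking speed to train on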

ml-agents/mlagents/trainers/active_learning_manager.py (172 changes)


from typing import Dict, List, Tuple, Optional
from mlagents.trainers.settings import (
EnvironmentParameterSettings,
AgentParameterSettings,
ParameterRandomizationSettings,
)
from collections import defaultdict

logger = get_logger(__name__)
import torch
class ActiveLearningTaskManager(EnvironmentParameterManager):
class ActiveLearningTaskManager:
settings: Optional[Dict[str, AgentParameterSettings]] = None,
run_seed: int = -1,
settings: Optional[Dict[str,AgentParameterSettings]] = None,
restore: bool = False,
):
"""

:param settings: A dictionary from environment parameter to
EnvironmentParameterSettings.
:param run_seed: When the seed is not provided for an environment parameter,
this seed will be used instead.
:param restore: If true, the EnvironmentParameterManager will use the
GlobalTrainingStatus to try and reload the lesson status of each environment
parameter.

self._dict_settings = settings
lows = []
highs = []
for parameter_name in self._dict_settings.keys():
self._dict_settings[parameter_name].
self._smoothed_values: Dict[str, float] = defaultdict(float)
for key in self._dict_settings.keys():
self._smoothed_values[key] = 0.0
# Update the seeds of the samplers
self._set_sampler_seeds(run_seed)
task_ranges = []
self._taskSampler = ActiveLearningTaskSampler(task_ranges)
def _set_sampler_seeds(self, seed):
"""
Sets the seeds for the samplers (if no seed was already present). Note that
using the provided seed.
"""
offset = 0
for settings in self._dict_settings.values():
for lesson in settings.curriculum:
if lesson.value.seed == -1:
lesson.value.seed = seed + offset
offset += 1
self.behavior_names = list(self._dict_settings.keys())
self.param_names = {name: list(self._dict_settings[name].parameters.keys()) for name in self.behavior_names}
self._taskSamplers = {}
for behavior_name in self.behavior_names:
lows = []
highs = []
parameters = self._dict_settings[behavior_name].parameters
for parameter_name in self.param_names[behavior_name]:
low = parameters[parameter_name].min_value
high = parameters[parameter_name].max_value
lows.append(low)
highs.append(high)
task_ranges = torch.tensor([lows, highs]).float().T
self._taskSamplers[behavior_name] = ActiveLearningTaskSampler(task_ranges)
self.t = {name: 0.0 for name in self.behavior_names}
def get_minimum_reward_buffer_size(self, behavior_name: str) -> int:
"""
Calculates the minimum size of the reward buffer a behavior must use. This
method uses the 'min_lesson_length' sampler_parameter to determine this value.
:param behavior_name: The name of the behavior the minimum reward buffer
size corresponds to.
"""
result = 1
for settings in self._dict_settings.values():
for lesson in settings.curriculum:
if lesson.completion_criteria is not None:
if lesson.completion_criteria.behavior == behavior_name:
result = max(
result, lesson.completion_criteria.min_lesson_length
)
return result
def _make_task(self, behavior_name, tau):
task = {}
for i, name in enumerate(self.param_names[behavior_name]):
task[name] = tau[i]
return task
def get_current_samplers(self) -> Dict[str, ParameterRandomizationSettings]:
"""
Creates a dictionary from environment parameter name to their corresponding
ParameterRandomizationSettings. If curriculum is used, the
ParameterRandomizationSettings corresponds to the sampler of the current lesson.
"""
samplers: Dict[str, ParameterRandomizationSettings] = {}
for param_name, settings in self._dict_settings.items():
lesson_num = GlobalTrainingStatus.get_parameter_state(
param_name, StatusType.LESSON_NUM
)
lesson = settings.curriculum[lesson_num]
samplers[param_name] = lesson.value
return samplers
def _build_tau(self, behavior_name, task, time):
tau = []
for name in self.param_names[behavior_name]:
tau.append(task[name])
tau.append(time)
return torch.tensor(tau).float()
def get_current_lesson_number(self) -> Dict[str, int]:
def get_tasks(self, behavior_name, num_samples) -> Dict[str, ParameterRandomizationSettings]:
Creates a dictionary from environment parameter to the current lesson number.
If not using curriculum, this number is always 0 for that environment parameter.
TODO
result: Dict[str, int] = {}
for parameter_name in self._dict_settings.keys():
result[parameter_name] = GlobalTrainingStatus.get_parameter_state(
parameter_name, StatusType.LESSON_NUM
)
return result
behavior_name = [bname for bname in self.behavior_names if bname in behavior_name][0] # TODO make work with actual behavior names
current_time = self.t[behavior_name] + 1
taus = self._taskSamplers[behavior_name].get_design_points(num_points=num_samples, time=current_time).data.numpy().tolist()
tasks = [self._make_task(behavior_name, tau) for tau in taus]
return tasks
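A standalone illustration of how sampled design points are turned into task dictionaries by _make_task above, using the Walker parameter name as an example (the tau values here are made up):

import torch

# Stand-in for get_design_points output: one row per proposed task.
param_names = ["targetWalkingSpeed"]
taus = torch.tensor([[2.4], [6.1], [9.3]])

tasks = [{name: tau[i].item() for i, name in enumerate(param_names)} for tau in taus]
print(tasks)  # [{'targetWalkingSpeed': 2.4}, {'targetWalkingSpeed': 6.1}, {'targetWalkingSpeed': 9.3}]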
def update_lessons(
self,
trainer_steps: Dict[str, int],
trainer_max_steps: Dict[str, int],
trainer_reward_buffer: Dict[str, List[float]],
def update(self, behavior_name: str, task_perfs: List[Tuple[Dict, float]]):
Given progress metrics, calculates if at least one environment parameter is
in a new lesson and if at least one environment parameter requires the env
to reset.
:param trainer_steps: A dictionary from behavior_name to the number of training
steps this behavior's trainer has performed.
:param trainer_max_steps: A dictionary from behavior_name to the maximum number
of training steps this behavior's trainer has performed.
:param trainer_reward_buffer: A dictionary from behavior_name to the list of
the most recent episode returns for this behavior's trainer.
:returns: A tuple of two booleans : (True if any lesson has changed, True if
environment needs to reset)
TODO
updated = False
for param_name, settings in self._dict_settings.items():
lesson_num = GlobalTrainingStatus.get_parameter_state(
param_name, StatusType.LESSON_NUM
)
lesson = settings.curriculum[lesson_num]
if (
lesson.completion_criteria is not None
and len(settings.curriculum) > lesson_num + 1
):
behavior_to_consider = lesson.completion_criteria.behavior
if behavior_to_consider in trainer_steps:
must_increment, new_smoothing = lesson.completion_criteria.need_increment(
float(trainer_steps[behavior_to_consider])
/ float(trainer_max_steps[behavior_to_consider]),
trainer_reward_buffer[behavior_to_consider],
self._smoothed_values[param_name],
)
self._smoothed_values[param_name] = new_smoothing
if must_increment:
GlobalTrainingStatus.set_parameter_state(
param_name, StatusType.LESSON_NUM, lesson_num + 1
)
new_lesson_name = settings.curriculum[lesson_num + 1].name
logger.info(
f"Parameter '{param_name}' has changed. Now in lesson '{new_lesson_name}'"
)
updated = True
if lesson.completion_criteria.require_reset:
must_reset = True
updated = True
behavior_name = [bname for bname in self.behavior_names if bname in behavior_name][0] # TODO make work with actual behavior names
taus = []
perfs = []
for task, perf in task_perfs:
perfs.append(perf)
self.t[behavior_name] = self.t[behavior_name] + 1
tau = self._build_tau(behavior_name, task, self.t[behavior_name])
taus.append(tau)
X = torch.stack(taus, dim=0)
Y = torch.tensor(perfs).float()
self._taskSamplers[behavior_name].update_model(X, Y, refit=True)
return updated, must_reset
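A condensed sketch of the update path above: completed (task, performance) pairs become the X/Y training set handed to the task sampler, with each row of X holding the parameter values followed by a time index, mirroring _build_tau:

import torch

param_names = ["targetWalkingSpeed"]
task_perfs = [({"targetWalkingSpeed": 2.0}, 310.0),
              ({"targetWalkingSpeed": 7.5}, 255.0)]

t = 0.0
taus, perfs = [], []
for task, perf in task_perfs:
    t += 1
    taus.append(torch.tensor([task[n] for n in param_names] + [t]).float())
    perfs.append(perf)

X = torch.stack(taus, dim=0)     # shape: (num_episodes, num_params + 1)
Y = torch.tensor(perfs).float()  # shape: (num_episodes,)
# self._taskSamplers[behavior_name].update_model(X, Y, refit=True), as in update() above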

ml-agents/mlagents/trainers/agent_processor.py (47 changes)


behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
set_task_params_fn = None
):
"""
Create an AgentProcessor.

self.behavior_id = behavior_id
self.task_queue: List[Dict[str, float]] = []
self.task_perf_queue: List[Tuple[Dict[str, float],float]] = []
self.task_to_set: Dict[str, List] = defaultdict(list)
self.set_task_params_fn = set_task_params_fn
self.tasks_needed: Dict[str, Tuple[str, str]] = {}
def add_experiences(
self,

action_global_agent_ids = [
get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
]
for global_id in action_global_agent_ids:
for global_id, local_id in zip(action_global_agent_ids, previous_action.agent_ids):
if global_id not in self.episode_tasks.keys():
self._assign_task(worker_id, global_id, local_id)
# Iterate over all the terminal steps
for terminal_step in terminal_steps.values():

terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
)
if global_id not in self.episode_tasks.keys():
self._assign_task(worker_id, global_id, local_id)
# Iterate over all the decision steps
for ongoing_step in decision_steps.values():
local_id = ongoing_step.agent_id

)
if global_id not in self.episode_tasks.keys():
self._assign_task(worker_id, global_id, local_id)
for _gid in action_global_agent_ids:
# If the ID doesn't have a last step result, the agent just reset,

self.policy.save_previous_action(
[_gid], take_action_outputs["action"]
)
task = self.task_queue.pop(0)
if len(self.task_queue) > 0:
task = self.task_queue.pop(0)
self.episode_tasks[global_id] = task
self.set_task_params_fn(worker_id, local_id, task)
else:
if global_id not in self.tasks_needed.keys():
self.tasks_needed[global_id] = (worker_id, local_id)
if len(self.task_queue) == 0:  # if the task queue is empty, put a copy of this task back on the queue so other agents don't miss out
self.task_queue.append(task)
self.episode_tasks[global_id] = task
self.task_to_set[worker_id].append((local_id, task))
# agent_params = AgentParametersChannel()
# for param, value in task.items():
# self.task_params_channel.set_float_parameter(local_id, param, value)
def get_num_tasks_needed(self):
return len(self.tasks_needed)
def add_new_tasks(self, tasks):
self.task_queue.extend(tasks)
to_del = []
for global_id, (worker_id, local_id) in self.tasks_needed.items():
if len(self.task_queue) > 0:
self._assign_task(worker_id, global_id, local_id)
to_del.append(global_id)
for key in to_del:
self._safe_delete(self.tasks_needed, key)
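The queue bookkeeping above, sketched standalone (plain Python, no Unity side): agents pull a task when a new episode starts and are parked in tasks_needed until the trainer supplies more tasks.

from typing import Dict, List, Tuple

task_queue: List[Dict[str, float]] = []
tasks_needed: Dict[str, Tuple[int, int]] = {}      # global_id -> (worker_id, local_id)
episode_tasks: Dict[str, Dict[str, float]] = {}

def assign_task(worker_id: int, global_id: str, local_id: int) -> None:
    if task_queue:
        task = task_queue.pop(0)
        if not task_queue:
            task_queue.append(task)  # keep a copy so other agents don't go without
        episode_tasks[global_id] = task
        # here the real processor calls set_task_params_fn(worker_id, local_id, task)
    elif global_id not in tasks_needed:
        tasks_needed[global_id] = (worker_id, local_id)

def add_new_tasks(tasks: List[Dict[str, float]]) -> None:
    task_queue.extend(tasks)
    for global_id, (worker_id, local_id) in list(tasks_needed.items()):
        if task_queue:
            assign_task(worker_id, global_id, local_id)
            del tasks_needed[global_id]

assign_task(0, "worker0-agent7", 7)           # queue is empty, so the agent waits
add_new_tasks([{"targetWalkingSpeed": 4.2}])  # a task arrives and is assigned
print(episode_tasks)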
def _process_step(
self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int

interrupted=interrupted,
memory=memory,
)
# Add the value outputs if needed
self.experience_buffers[global_id].append(experience)
self.episode_rewards[global_id] += step.reward

self._queue.put(item)
# TODO: Callback new agent, callback episode end
class AgentManager(AgentProcessor):
"""
An AgentManager is an AgentProcessor that also holds a single trajectory and policy queue.

stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
threaded: bool = True,
**kwargs
super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length)
super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length, **kwargs)
trajectory_queue_len = 20 if threaded else 0
self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
self.behavior_id, maxlen=trajectory_queue_len

ml-agents/mlagents/trainers/learn.py (6 changes)


from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.active_learning_manager import ActiveLearningTaskManager
from mlagents.trainers.trainer_util import TrainerFactory, handle_existing_directories
from mlagents.trainers.stats import (
TensorboardWriter,

options.environment_parameters, run_seed, restore=checkpoint_settings.resume
)
task_parameter_manager = ActiveLearningTaskManager(
options.agent_parameters, restore=checkpoint_settings.resume
)
trainer_factory = TrainerFactory(
options.behaviors,
write_path,

write_path,
checkpoint_settings.run_id,
env_parameter_manager,
task_parameter_manager,
not checkpoint_settings.inference,
run_seed,
)

ml-agents/mlagents/trainers/settings.py (27 changes)


parameters: Dict[str, UniformSettings]
@staticmethod
def structure(d: Mapping, t: type) -> AgentParameterSettings:
def structure(d: Mapping, t: type):  # -> Dict[str, AgentParameterSettings]
"""
Helper method to structure a Dict of AgentParameterSettings classes. Meant
to be registered with cattr.register_structure_hook() and called with

raise TrainerConfigError(
f"Unsupported agent environment parameter settings {d}."
)
d_final: Dict[str, ParameterRandomizationSettings] = {}
for agent_parameter, agent_parameter_config in d.items():
sampler = ParameterRandomizationSettings.structure(
agent_parameter_config, ParameterRandomizationSettings
)
d_final[agent_parameter] = sampler
print(agent_parameter)
settings = AgentParameterSettings(parameters=d_final)
return settings
d_final: Dict[str, AgentParameterSettings] = {}
for behavior_name, behavior_config in d.items():
tmp_settings: Dict[str, UniformSettings] = {}
for agent_parameter, agent_parameter_config in behavior_config.items():
sampler = ParameterRandomizationSettings.structure(
agent_parameter_config, ParameterRandomizationSettings
)
tmp_settings[agent_parameter] = sampler
d_final[behavior_name] = AgentParameterSettings(parameters=tmp_settings)
# settings = AgentParameterSettings(parameters=d_final)
return d_final
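A simplified, dependency-free sketch of what the new hook does: walk a behavior -> parameter -> sampler-config mapping and produce one settings object per behavior (the class and function names here are illustrative, not the real mlagents types):

from typing import Dict, Mapping

class UniformRange:
    def __init__(self, min_value: float, max_value: float):
        self.min_value = min_value
        self.max_value = max_value

def structure_agent_parameters(d: Mapping) -> Dict[str, Dict[str, UniformRange]]:
    result: Dict[str, Dict[str, UniformRange]] = {}
    for behavior_name, behavior_config in d.items():
        result[behavior_name] = {
            param: UniformRange(cfg["sampler_parameters"]["min_value"],
                                cfg["sampler_parameters"]["max_value"])
            for param, cfg in behavior_config.items()
        }
    return result

raw = {"WalkerStaticVariableSpeed": {
    "targetWalkingSpeed": {"sampler_type": "uniform",
                           "sampler_parameters": {"min_value": 0.0, "max_value": 10}}}}
print(structure_agent_parameters(raw)["WalkerStaticVariableSpeed"]["targetWalkingSpeed"].max_value)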
@attr.s(auto_attribs=True)
class EnvironmentParameterSettings:

env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings)
engine_settings: EngineSettings = attr.ib(factory=EngineSettings)
environment_parameters: Optional[Dict[str, EnvironmentParameterSettings]] = None
agent_parameters: Optional[Dict[str, AgentParameterSettings]] = None
checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings)
# These are options that are relevant to the run itself, and not the engine or environment.

cattr.register_structure_hook(CheckpointSettings, strict_to_cls)
cattr.register_structure_hook(
Dict[str, EnvironmentParameterSettings], EnvironmentParameterSettings.structure
)
cattr.register_structure_hook(
Dict[str, AgentParameterSettings], AgentParameterSettings.structure
)
cattr.register_structure_hook(Lesson, strict_to_cls)
cattr.register_structure_hook(

ml-agents/mlagents/trainers/subprocess_env_manager.py (9 changes)


for ew in self.env_workers:
ew.send(EnvironmentCommand.ENVIRONMENT_PARAMETERS, config)
def set_agent_parameters(self) -> None:
def set_agent_parameters(self, worker_id, local_id, task) -> None:
for worker_id, ew in enumerate(self.env_workers):
for brain_name in self.agent_managers.keys():
tasks = self.agent_managers[brain_name].task_to_set[worker_id]
ew.send(EnvironmentCommand.AGENT_PARAMETERS, tasks)
self.agent_managers[brain_name].task_to_set[worker_id].empty()
self.env_workers[worker_id].send(EnvironmentCommand.AGENT_PARAMETERS, (local_id, task))
@property
def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]:

ml-agents/mlagents/trainers/trainer_controller.py (18 changes)


)
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.active_learning_manager import ActiveLearningTaskManager
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManager

output_path: str,
run_id: str,
param_manager: EnvironmentParameterManager,
task_manager: ActiveLearningTaskManager,
train: bool,
training_seed: int,
):

self.run_id = run_id
self.train_model = train
self.param_manager = param_manager
self.task_manager = task_manager
self.ghost_controller = self.trainer_factory.ghost_controller
self.registered_behavior_ids: Set[str] = set()

trainer.stats_reporter,
trainer.parameters.time_horizon,
threaded=trainer.threaded,
set_task_params_fn = lambda worker_id, local_id, task: env_manager.set_agent_parameters(worker_id, local_id, task)
)
env_manager.set_agent_manager(name_behavior_id, agent_manager)
env_manager.set_policy(name_behavior_id, policy)

task_perf = {}
for k, v in env.agent_managers.items():
perfs = v.task_perf_queue
v.task_perf_queue.empty()
v.task_perf_queue.clear()
# Attempt to increment the lessons of the brains who
# were ready.
updated, param_must_reset = self.param_manager.update_lessons(

trainer.stats_reporter.set_stat(
f"Environment/Lesson Number/{param_name}", lesson_number
)
for behavior_name, manager in env_manager.agent_managers.items():
task_perf = manager.task_perf_queue
manager.task_perf_queue = []
N = len(task_perf) # TODO get info about how many are active if no tasks are completed
if N > 0:
self.task_manager.update(behavior_name, task_perf)
K = manager.get_num_tasks_needed()
if K > 0:
new_tasks = self.task_manager.get_tasks(behavior_name, K)
manager.add_new_tasks(new_tasks)
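In isolation, the per-step maintenance above amounts to the following helper (a sketch; env_manager and task_manager stand in for the real objects wired up in this file):

def maintain_tasks(env_manager, task_manager) -> None:
    """Sketch of the task maintenance performed each advance() step."""
    for behavior_name, manager in env_manager.agent_managers.items():
        # Drain completed (task, performance) pairs and feed them to the sampler.
        task_perf = manager.task_perf_queue
        manager.task_perf_queue = []
        if task_perf:
            task_manager.update(behavior_name, task_perf)
        # Request exactly as many new tasks as agents are currently waiting for.
        num_needed = manager.get_num_tasks_needed()
        if num_needed > 0:
            manager.add_new_tasks(task_manager.get_tasks(behavior_name, num_needed))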
for trainer in self.trainers.values():
if not trainer.threaded:

ml-agents/mlagents/trainers/trainer_util.py (1 change)


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.active_learning_manager import ActiveLearningTaskManager
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
