
Don't block one policy queue

Only put policies when policy is actually updated
/develop/sac-apex
Ervin Teng, 4 years ago
Current commit
f29b17a9
6 files changed, 47 insertions and 30 deletions
1. ml-agents/mlagents/trainers/agent_processor.py (6 changes)
2. ml-agents/mlagents/trainers/env_manager.py (14 changes)
3. ml-agents/mlagents/trainers/ppo/trainer.py (1 change)
4. ml-agents/mlagents/trainers/sac/trainer.py (43 changes)
5. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)
6. ml-agents/mlagents/trainers/trainer/rl_trainer.py (11 changes)

ml-agents/mlagents/trainers/agent_processor.py (6 changes)


 pass
-def __init__(self, behavior_id: str, maxlen: int = 1):
+def __init__(self, behavior_id: str, maxlen: int = 20):
     """
     Initializes an AgentManagerQueue. Note that we can give it a behavior_id so that it can be identified
     separately from an AgentManager.

 self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
     self.behavior_id
 )
+# NOTE: we make policy queues of infinite length to avoid lockups of the trainers.
+# In the environment manager, we make sure to empty the policy queue before continuing to produce steps.
-    self.behavior_id
+    self.behavior_id, maxlen=0
 )
 self.publish_trajectory_queue(self.trajectory_queue)
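
The reason maxlen=0 avoids trainer lockups: Python's standard queue.Queue treats maxsize <= 0 as unbounded, so put() never blocks. A minimal stand-alone sketch of that behavior (a toy class for illustration, assuming AgentManagerQueue wraps queue.Queue; this is not the repository's implementation):

import queue


class SimplePolicyQueue:
    # Toy stand-in for AgentManagerQueue; assumes the real class wraps queue.Queue.
    class Empty(Exception):
        pass

    def __init__(self, behavior_id: str, maxlen: int = 20):
        # queue.Queue treats maxsize <= 0 as unbounded, so put() can never block.
        self.behavior_id = behavior_id
        self._queue: queue.Queue = queue.Queue(maxsize=maxlen)

    def put(self, item) -> None:
        self._queue.put(item)

    def get_nowait(self):
        try:
            return self._queue.get_nowait()
        except queue.Empty:
            raise self.Empty()


# With maxlen=0 the trainer can enqueue policies faster than the environment loop
# drains them without ever blocking; a bounded queue could stall the second put().
pq = SimplePolicyQueue("SomeBehavior", maxlen=0)
pq.put("policy-v1")
pq.put("policy-v2")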

ml-agents/mlagents/trainers/env_manager.py (14 changes)


 if self.first_step_infos is not None:
     self._process_step_infos(self.first_step_infos)
     self.first_step_infos = None
-# Get new policies if found
+# Get new policies if found. Always get the latest policy.
+_policy = None
-    _policy = self.agent_managers[brain_name].policy_queue.get(block=False)
-    self.set_policy(brain_name, _policy)
+    # We make sure to empty the policy queue before continuing to produce steps.
+    # This halts the trainers until the policy queue is empty.
+    while True:
+        _policy = self.agent_managers[brain_name].policy_queue.get(
+            block=False
+        )
     pass
+if _policy is not None:
+    self.set_policy(brain_name, _policy)
 # Step the environment
 new_step_infos = self._step()
 # Add to AgentProcessor
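
The drain loop above relies on get(block=False) raising an Empty exception once the queue is exhausted; looping until that exception keeps only the most recent policy and discards stale ones. A standalone sketch of the same pattern with a plain queue.Queue (illustrative only, not the repository's EnvManager code):

import queue


def drain_latest(policy_queue: queue.Queue):
    # Return the newest item in the queue, or None if it was already empty.
    latest = None
    try:
        # Keep pulling until get() raises Empty; the last successful get() wins,
        # so any stale policies that piled up are simply discarded.
        while True:
            latest = policy_queue.get(block=False)
    except queue.Empty:
        pass
    return latest


q: queue.Queue = queue.Queue()
q.put("policy-v1")
q.put("policy-v2")
assert drain_latest(q) == "policy-v2"  # only the most recent policy is applied
assert drain_latest(q) is None  # an empty queue leaves the current policy in place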

ml-agents/mlagents/trainers/ppo/trainer.py (1 change)


 for stat, val in update_stats.items():
     self._stats_reporter.add_stat(stat, val)
 self._clear_update_buffer()
+return True
 def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
     """

ml-agents/mlagents/trainers/sac/trainer.py (43 changes)


 )
 @timed
-def _update_policy(self) -> None:
+def _update_policy(self) -> bool:
+    :return: Whether or not the policy was updated.
-    self.update_sac_policy()
-    self.update_reward_signals()
+    policy_was_updated = self._update_sac_policy()
+    self._update_reward_signals()
+    return policy_was_updated
 def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
     policy = NNPolicy(

     return policy
-def update_sac_policy(self) -> None:
+def _update_sac_policy(self) -> bool:
+    has_updated = False
     self.cumulative_returns_since_policy_update.clear()
     n_sequences = max(
         int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1

     for stat_name, value in update_stats.items():
         batch_update_stats[stat_name].append(value)
-    self.update_steps += 1
+    self.update_steps += 1
+    for stat, stat_list in batch_update_stats.items():
+        self._stats_reporter.add_stat(stat, np.mean(stat_list))
+    has_updated = True
+    if self.optimizer.bc_module:
+        update_stats = self.optimizer.bc_module.update()
+        for stat, val in update_stats.items():
+            self._stats_reporter.add_stat(stat, val)
     # Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating
     # a large buffer at each update.

     )
+    return has_updated
-    for stat, stat_list in batch_update_stats.items():
-        self._stats_reporter.add_stat(stat, np.mean(stat_list))
-    if self.optimizer.bc_module:
-        update_stats = self.optimizer.bc_module.update()
-        for stat, val in update_stats.items():
-            self._stats_reporter.add_stat(stat, val)
-def update_reward_signals(self) -> None:
+def _update_reward_signals(self) -> None:
     """
     Iterate through the reward signals and update them. Unlike in PPO,
     do it separate from the policy so that it can be done at a different

         int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
     )
     batch_update_stats: Dict[str, list] = defaultdict(list)
-    while self.step / self.reward_signal_update_steps > self.steps_per_update:
+    while (
+        self.step / self.reward_signal_update_steps
+        > self.reward_signal_steps_per_update
+    ):
         # Get minibatches for reward signal update if needed
         reward_signal_minibatches = {}
         for name, signal in self.optimizer.reward_signals.items():

             batch_update_stats[stat_name].append(value)
         self.reward_signal_update_steps += 1
-    for stat, stat_list in batch_update_stats.items():
-        self._stats_reporter.add_stat(stat, np.mean(stat_list))
+    for stat, stat_list in batch_update_stats.items():
+        self._stats_reporter.add_stat(stat, np.mean(stat_list))
 def add_policy(
     self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
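
In both update methods a steps-per-update ratio decides how many update passes to run, and in _update_sac_policy the has_updated flag only flips to True if at least one pass actually runs, which is what later prevents a redundant policy from being queued. A compact, self-contained sketch of that control flow (hypothetical function and parameter names, not the trainer itself):

def run_updates(step: int, update_steps: int, steps_per_update: float) -> bool:
    # Return True only if at least one update pass actually ran.
    has_updated = False
    while step / update_steps > steps_per_update:
        # ...sample a replay-buffer minibatch and update the networks here...
        update_steps += 1
        has_updated = True
    return has_updated


# Plenty of new environment steps since the last update: several passes run.
assert run_updates(step=100, update_steps=5, steps_per_update=10) is True
# Too few new steps: nothing runs, and the caller skips the policy-queue put.
assert run_updates(step=100, update_steps=20, steps_per_update=10) is False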

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (2 changes)


     return True
 def _update_policy(self):
-    pass
+    return True
 def add_policy(self):
     pass

ml-agents/mlagents/trainers/trainer/rl_trainer.py (11 changes)


 return False
 @abc.abstractmethod
-def _update_policy(self):
+def _update_policy(self) -> bool:
+    :return: Whether or not the policy was updated.
     """
     pass

 if self.should_still_train:
     if self._is_ready_update():
         with hierarchical_timer("_update_policy"):
-            self._update_policy()
-            for q in self.policy_queues:
-                # Get policies that correspond to the policy queue in question
-                q.put(self.get_policy(q.behavior_id))
+            if self._update_policy():
+                for q in self.policy_queues:
+                    # Get policies that correspond to the policy queue in question
+                    q.put(self.get_policy(q.behavior_id))
 else:
     self._clear_update_buffer()
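
Combined with the env-manager change, the effect is that a policy object only crosses the queue when _update_policy() reports that real work happened; otherwise the environment keeps acting with the policy it already has. A small sketch of that gating with stand-in callables (illustrative, not the RLTrainer class):

import queue


def advance_once(update_policy, get_policy, policy_queues) -> None:
    # Sketch of the new gating: publish a policy only after a real update.
    if update_policy():
        for q in policy_queues:
            q.put(get_policy())


policy_queue: queue.Queue = queue.Queue()

# An update happened: the fresh policy is published for the env manager to pick up.
advance_once(lambda: True, lambda: "policy-v2", [policy_queue])
assert policy_queue.get_nowait() == "policy-v2"

# No update: nothing is enqueued, so the env manager keeps its current policy.
advance_once(lambda: False, lambda: "policy-v2", [policy_queue])
assert policy_queue.empty()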