GitHub
5 years ago
Current commit
2fd305e7
32 files changed, with 1,261 insertions and 837 deletions
Files changed (changed lines per file in parentheses):
- ml-agents/mlagents/trainers/action_info.py (3)
- ml-agents/mlagents/trainers/agent_processor.py (198)
- ml-agents/mlagents/trainers/buffer.py (29)
- ml-agents/mlagents/trainers/components/bc/module.py (1)
- ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2)
- ml-agents/mlagents/trainers/curriculum.py (13)
- ml-agents/mlagents/trainers/demo_loader.py (32)
- ml-agents/mlagents/trainers/learn.py (6)
- ml-agents/mlagents/trainers/models.py (26)
- ml-agents/mlagents/trainers/ppo/policy.py (45)
- ml-agents/mlagents/trainers/ppo/trainer.py (212)
- ml-agents/mlagents/trainers/rl_trainer.py (246)
- ml-agents/mlagents/trainers/sac/policy.py (2)
- ml-agents/mlagents/trainers/sac/trainer.py (140)
- ml-agents/mlagents/trainers/tests/mock_brain.py (45)
- ml-agents/mlagents/trainers/tests/test_buffer.py (95)
- ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (2)
- ml-agents/mlagents/trainers/tests/test_ppo.py (174)
- ml-agents/mlagents/trainers/tests/test_rl_trainer.py (48)
- ml-agents/mlagents/trainers/tests/test_sac.py (38)
- ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)
- ml-agents/mlagents/trainers/tests/test_trainer_controller.py (22)
- ml-agents/mlagents/trainers/tests/test_trainer_util.py (4)
- ml-agents/mlagents/trainers/tf_policy.py (62)
- ml-agents/mlagents/trainers/trainer.py (108)
- ml-agents/mlagents/trainers/trainer_controller.py (39)
- ml-agents/mlagents/trainers/trainer_util.py (4)
- ml-agents/mlagents/trainers/stats.py (118)
- ml-agents/mlagents/trainers/tests/test_agent_processor.py (63)
- ml-agents/mlagents/trainers/tests/test_stats.py (80)
- ml-agents/mlagents/trainers/tests/test_trajectory.py (110)
- ml-agents/mlagents/trainers/trajectory.py (128)
ml-agents/mlagents/trainers/agent_processor.py

Removed (old ProcessingBuffer implementation):

from typing import List, Union

from mlagents.trainers.buffer import AgentBuffer, BufferException


class ProcessingBuffer(dict):
    """
    ProcessingBuffer contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
    Buffer also contains an update_buffer that corresponds to the buffer used when updating the model.
    """

    def __str__(self):
        return "local_buffers :\n{0}".format(
            "\n".join(["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()])
        )

    def __getitem__(self, key):
        if key not in self.keys():
            self[key] = AgentBuffer()
        return super().__getitem__(key)

    def reset_local_buffers(self) -> None:
        """
        Resets all the local AgentBuffers.
        """
        for buf in self.values():
            buf.reset_agent()

    def append_to_update_buffer(
        self,
        update_buffer: AgentBuffer,
        agent_id: Union[int, str],
        key_list: List[str] = None,
        batch_size: int = None,
        training_length: int = None,
    ) -> None:
        """
        Appends the buffer of an agent to the update buffer.
        :param update_buffer: A reference to an AgentBuffer to append the agent's buffer to
        :param agent_id: The id of the agent which data will be appended
        :param key_list: The fields that must be added. If None: all fields will be appended.
        :param batch_size: The number of elements that must be appended. If None: All of them will be.
        :param training_length: The length of the samples that must be appended. If None: only takes one element.
        """
        if key_list is None:
            key_list = self[agent_id].keys()
        if not self[agent_id].check_length(key_list):
            raise BufferException(
                "The length of the fields {0} for agent {1} were not of same length".format(
                    key_list, agent_id
                )
            )
        for field_key in key_list:
            update_buffer[field_key].extend(
                self[agent_id][field_key].get_batch(
                    batch_size=batch_size, training_length=training_length
                )
            )

    def append_all_agent_batch_to_update_buffer(
        self,
        update_buffer: AgentBuffer,
        key_list: List[str] = None,
        batch_size: int = None,
        training_length: int = None,
    ) -> None:
        """
        Appends the buffer of all agents to the update buffer.
        :param key_list: The fields that must be added. If None: all fields will be appended.
        :param batch_size: The number of elements that must be appended. If None: All of them will be.
        :param training_length: The length of the samples that must be appended. If None: only takes one element.
        """
        for agent_id in self.keys():
            self.append_to_update_buffer(
                update_buffer, agent_id, key_list, batch_size, training_length
            )

Added (new AgentProcessor implementation):

import sys
from typing import List, Dict
from collections import defaultdict, Counter

from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter


class AgentProcessor:
    """
    AgentProcessor contains a dictionary per-agent trajectory buffers. The buffers are indexed by agent_id.
    One AgentProcessor should be created per agent group.
    """

    def __init__(
        self,
        trainer: Trainer,
        policy: TFPolicy,
        stats_reporter: StatsReporter,
        max_trajectory_length: int = sys.maxsize,
    ):
        """
        Create an AgentProcessor.
        :param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory
        when it is finished.
        :param policy: Policy instance associated with this AgentProcessor.
        :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
        :param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
        """
        self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
        self.last_brain_info: Dict[str, BrainInfo] = {}
        self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
        # Note: this is needed until we switch to AgentExperiences as the data input type.
        # We still need some info from the policy (memories, previous actions)
        # that really should be gathered by the env-manager.
        self.policy = policy
        self.episode_steps: Counter = Counter()
        self.episode_rewards: Dict[str, float] = defaultdict(float)
        self.stats_reporter = stats_reporter
        self.trainer = trainer
        self.max_trajectory_length = max_trajectory_length

    def add_experiences(
        self,
        curr_info: BrainInfo,
        next_info: BrainInfo,
        take_action_outputs: ActionInfoOutputs,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param curr_info: current BrainInfo.
        :param next_info: next BrainInfo.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        """
        if take_action_outputs:
            self.stats_reporter.add_stat(
                "Policy/Entropy", take_action_outputs["entropy"].mean()
            )
            self.stats_reporter.add_stat(
                "Policy/Learning Rate", take_action_outputs["learning_rate"]
            )

        for agent_id in curr_info.agents:
            self.last_brain_info[agent_id] = curr_info
            self.last_take_action_outputs[agent_id] = take_action_outputs

        # Store the environment reward
        tmp_environment_reward = next_info.rewards

        for next_idx, agent_id in enumerate(next_info.agents):
            stored_info = self.last_brain_info.get(agent_id, None)
            if stored_info is not None:
                stored_take_action_outputs = self.last_take_action_outputs[agent_id]
                idx = stored_info.agents.index(agent_id)
                obs = []
                if not stored_info.local_done[idx]:
                    for i, _ in enumerate(stored_info.visual_observations):
                        obs.append(stored_info.visual_observations[i][idx])
                    if self.policy.use_vec_obs:
                        obs.append(stored_info.vector_observations[idx])
                    if self.policy.use_recurrent:
                        memory = self.policy.retrieve_memories([agent_id])[0, :]
                    else:
                        memory = None

                    done = next_info.local_done[next_idx]
                    max_step = next_info.max_reached[next_idx]

                    # Add the outputs of the last eval
                    action = stored_take_action_outputs["action"][idx]
                    if self.policy.use_continuous_act:
                        action_pre = stored_take_action_outputs["pre_action"][idx]
                    else:
                        action_pre = None
                    action_probs = stored_take_action_outputs["log_probs"][idx]
                    action_masks = stored_info.action_masks[idx]
                    prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]

                    experience = AgentExperience(
                        obs=obs,
                        reward=tmp_environment_reward[next_idx],
                        done=done,
                        action=action,
                        action_probs=action_probs,
                        action_pre=action_pre,
                        action_mask=action_masks,
                        prev_action=prev_action,
                        max_step=max_step,
                        memory=memory,
                    )
                    # Add the value outputs if needed
                    self.experience_buffers[agent_id].append(experience)
                    self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
                if (
                    next_info.local_done[next_idx]
                    or (
                        len(self.experience_buffers[agent_id])
                        >= self.max_trajectory_length
                    )
                ) and len(self.experience_buffers[agent_id]) > 0:
                    # Make next AgentExperience
                    next_obs = []
                    for i, _ in enumerate(next_info.visual_observations):
                        next_obs.append(next_info.visual_observations[i][next_idx])
                    if self.policy.use_vec_obs:
                        next_obs.append(next_info.vector_observations[next_idx])
                    trajectory = Trajectory(
                        steps=self.experience_buffers[agent_id],
                        agent_id=agent_id,
                        next_obs=next_obs,
                    )
                    # This will eventually be replaced with a queue
                    self.trainer.process_trajectory(trajectory)
                    self.experience_buffers[agent_id] = []
                if next_info.local_done[next_idx]:
                    self.stats_reporter.add_stat(
                        "Environment/Cumulative Reward",
                        self.episode_rewards.get(agent_id, 0),
                    )
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(agent_id, 0),
                    )
                    del self.episode_steps[agent_id]
                    del self.episode_rewards[agent_id]
                elif not next_info.local_done[next_idx]:
                    self.episode_steps[agent_id] += 1
        self.policy.save_previous_action(
            curr_info.agents, take_action_outputs["action"]
        )
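
For orientation, here is a minimal wiring sketch of how the new AgentProcessor might be driven each environment step, based only on the code above. The env_manager object and its reset()/step() calls are hypothetical placeholders, as is the StatsReporter category choice; only the AgentProcessor constructor arguments and the add_experiences() call pattern reflect this commit.

# Hypothetical usage sketch (not part of this commit). It shows the intended
# call pattern: buffer one AgentExperience per agent per step, and let the
# processor hand a Trajectory to the trainer when an agent finishes an episode
# or its buffer reaches max_trajectory_length.
from mlagents.trainers.agent_processor import AgentProcessor
from mlagents.trainers.stats import StatsReporter

def collect_experience(trainer, policy, env_manager, num_steps=1000):
    # Assumption: stats are grouped under the trainer's brain name.
    stats_reporter = StatsReporter(trainer.brain_name)
    processor = AgentProcessor(
        trainer=trainer,
        policy=policy,
        stats_reporter=stats_reporter,
        max_trajectory_length=64,  # cut trajectories after 64 steps
    )
    curr_info = env_manager.reset()  # hypothetical: returns a BrainInfo
    for _ in range(num_steps):
        action_info = policy.get_action(curr_info)
        next_info = env_manager.step(action_info.action)  # hypothetical step API
        # take_action_outputs is the "outputs" field of the policy's ActionInfo,
        # as described in the add_experiences docstring above.
        processor.add_experiences(curr_info, next_info, action_info.outputs)
        curr_info = next_info

The processor owns all per-agent bookkeeping (episode rewards, episode lengths, last BrainInfo), so the caller only forwards consecutive BrainInfo pairs plus the policy outputs that produced them.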