
No longer using ProcessingBuffer for PPO

/develop-newnormalization
Ervin Teng, 5 years ago
Current commit
f94365a2
4 files changed, 84 insertions, 136 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (161 changes)
  2. ml-agents/mlagents/trainers/ppo/policy.py (53 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)
  4. ml-agents/mlagents/trainers/rl_trainer.py (2 changes)

ml-agents/mlagents/trainers/agent_processor.py (161 changes)


      agent_id: str
+ class BootstrapExperience(NamedTuple):
+     """
+     A partial AgentExperience needed to bootstrap GAE.
+     """
+     obs: List[np.ndarray]
+     agent_id: str
  class SplitObservations(NamedTuple):
      vector_observations: np.ndarray
      visual_observations: List[np.ndarray]

      steps: Iterable[AgentExperience]
-     next_step: AgentExperience  # The next step after the trajectory. Used for GAE when time_horizon is reached.
+     bootstrap_step: BootstrapExperience  # The next step after the trajectory. Used for GAE.
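
Pieced together, the data classes this hunk touches end up roughly as below; a sketch assembled from the fragments above and from the AgentExperience constructor call later in this file's diff. The type annotations on the AgentExperience fields are assumptions, not verbatim from the commit.

```python
from typing import Iterable, List, NamedTuple, Optional

import numpy as np


class AgentExperience(NamedTuple):
    # Field names taken from the AgentExperience(...) call in add_experiences below;
    # the annotations here are assumed.
    obs: List[np.ndarray]
    reward: float
    done: bool
    action: np.ndarray
    action_probs: np.ndarray
    action_pre: Optional[np.ndarray]
    action_mask: Optional[np.ndarray]
    prev_action: np.ndarray
    agent_id: str
    memory: Optional[np.ndarray]
    epsilon: Optional[np.ndarray]


class BootstrapExperience(NamedTuple):
    """A partial AgentExperience needed to bootstrap GAE."""

    obs: List[np.ndarray]
    agent_id: str


class SplitObservations(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]


class Trajectory(NamedTuple):
    steps: Iterable[AgentExperience]
    # The step after the trajectory ends; only its observations are needed,
    # to compute the bootstrap value for GAE.
    bootstrap_step: BootstrapExperience
```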
  class AgentProcessorException(UnityException):

      """
      def __init__(self, trainer: Trainer):
-         self.processing_buffer = ProcessingBuffer()
+         self.experience_buffers: Dict[str, List] = defaultdict(list)
+         self.last_brain_info: Dict[str, BrainInfo] = defaultdict(BrainInfo)
+         self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = defaultdict(
+             ActionInfoOutputs
+         )
          self.stats: Dict[str, List] = defaultdict(list)
+         # Note: this is needed until we switch to AgentExperiences as the data input type.
+         # We still need some info from the policy (memories, previous actions)

return "local_buffers :\n{0}".format(
"\n".join(
[
"\tagent {0} :{1}".format(k, str(self.processing_buffer[k]))
for k in self.processing_buffer.keys()
"\tagent {0} :{1}".format(k, str(self.experience_buffers[k]))
for k in self.experience_buffers.keys()
      def reset_local_buffers(self) -> None:
          """
          Resets all the local local_buffers
          """
-         agent_ids = list(self.processing_buffer.keys())
-         for k in agent_ids:
-             self.processing_buffer[k].reset_agent()
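
The replacement body of reset_local_buffers is not visible in this hunk; a plausible version, assuming it simply re-creates the per-agent containers introduced in __init__ above, would be:

```python
def reset_local_buffers(self) -> None:
    """
    Resets all the local buffers (sketch; assumes the dicts from __init__ are re-created).
    """
    self.experience_buffers = defaultdict(list)
    self.last_brain_info = defaultdict(BrainInfo)
    self.last_take_action_outputs = defaultdict(ActionInfoOutputs)
```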
      def add_experiences(
          self,
          curr_info: BrainInfo,

          self.stats[name].append(np.mean(values))
          for agent_id in curr_info.agents:
-             self.processing_buffer[agent_id].last_brain_info = curr_info
-             self.processing_buffer[
-                 agent_id
-             ].last_take_action_outputs = take_action_outputs
+             self.last_brain_info[agent_id] = curr_info
+             self.last_take_action_outputs[agent_id] = take_action_outputs
-                 stored_info = self.processing_buffer[agent_id].last_brain_info
-                 stored_take_action_outputs = self.processing_buffer[
-                     agent_id
-                 ].last_take_action_outputs
+                 stored_info = self.last_brain_info[agent_id]
+                 stored_take_action_outputs = self.last_take_action_outputs[agent_id]
+                 obs = []
-                         self.processing_buffer[agent_id]["visual_obs%d" % i].append(
-                             stored_info.visual_observations[i][idx]
-                         )
-                         self.processing_buffer[agent_id][
-                             "next_visual_obs%d" % i
-                         ].append(next_info.visual_observations[i][next_idx])
+                         obs.append(stored_info.visual_observations[i][idx])
-                     self.processing_buffer[agent_id]["vector_obs"].append(
-                         stored_info.vector_observations[idx]
-                     )
-                     self.processing_buffer[agent_id]["next_vector_in"].append(
-                         next_info.vector_observations[next_idx]
-                     )
+                     obs.append(stored_info.vector_observations[idx])
-                     self.processing_buffer[agent_id]["memory"].append(
-                         self.policy.retrieve_memories([agent_id])[0, :]
-                     )
+                     memory = self.policy.retrieve_memories([agent_id])[0, :]
+                 else:
+                     memory = None
-                 self.processing_buffer[agent_id]["masks"].append(1.0)
-                 self.processing_buffer[agent_id]["done"].append(
-                     next_info.local_done[next_idx]
-                 )
+                 done = next_info.local_done[next_idx]
-                 self.add_policy_outputs(stored_take_action_outputs, agent_id, idx)
+                 action = take_action_outputs["action"][idx]
+                 if self.policy.use_continuous_act:
+                     action_pre = take_action_outputs["pre_action"][idx]
+                     epsilon = take_action_outputs["random_normal_epsilon"][idx]
+                 else:
+                     action_pre = None
+                     epsilon = None
+                 action_probs = take_action_outputs["log_probs"][idx]
+                 action_masks = stored_info.action_masks[idx]
+                 prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]
-                 # Store action masks if necessary. Eventually these will be
-                 # None for continuous actions
-                 if stored_info.action_masks[idx] is not None:
-                     self.processing_buffer[agent_id]["action_mask"].append(
-                         stored_info.action_masks[idx], padding_value=1
-                     )
-                 # TODO: This should be done by the env_manager, and put it in
-                 # the AgentExperience
-                 self.processing_buffer[agent_id]["prev_action"].append(
-                     self.policy.retrieve_previous_action([agent_id])[0, :]
+                 values = stored_take_action_outputs["value_heads"]
+                 experience = AgentExperience(
+                     obs=obs,
+                     reward=tmp_environment[next_idx],
+                     done=done,
+                     action=action,
+                     action_probs=action_probs,
+                     action_pre=action_pre,
+                     action_mask=action_masks,
+                     prev_action=prev_action,
+                     agent_id=agent_id,
+                     memory=memory,
+                     epsilon=epsilon,
-                 values = stored_take_action_outputs["value_heads"]
-                 self.processing_buffer[agent_id]["environment_rewards"].append(
-                     tmp_environment[next_idx]
-                 )
+                 self.experience_buffers[agent_id].append(experience)
                  for name, value in values.items():
-                     self.processing_buffer[agent_id][
-                         "{}_value_estimates".format(name)
-                     ].append(value[idx][0])
-                 agent_actions = self.processing_buffer[agent_id]["actions"]
-                     or len(agent_actions) > self.time_horizon
-                 ) and len(agent_actions) > 0:
-                     trajectory = self.processing_buffer.agent_to_trajectory(
-                         agent_id, training_length=self.policy.sequence_length
+                     or len(self.experience_buffers[agent_id]) > self.time_horizon
+                 ) and len(self.experience_buffers[agent_id]) > 0:
+                     # Make next AgentExperience
+                     next_obs = []
+                     for i, _ in enumerate(next_info.visual_observations):
+                         next_obs.append(next_info.visual_observations[i][next_idx])
+                     if self.policy.use_vec_obs:
+                         next_obs.append(next_info.vector_observations[next_idx])
+                     bootstrap_step = BootstrapExperience(
+                         obs=next_obs, agent_id=agent_id
+                     )
+                     trajectory = Trajectory(
+                         steps=self.experience_buffers[agent_id],
+                         bootstrap_step=bootstrap_step,
-                     self.processing_buffer[agent_id].reset_agent()
+                     self.experience_buffers[agent_id] = []
                  elif not next_info.local_done[next_idx]:
                      if agent_id not in self.episode_steps:
                          self.episode_steps[agent_id] = 0

          )
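
Condensed into one piece, the new flush path this hunk adds works as follows. This is a free-function restatement under a hypothetical name (build_trajectory_if_done); the truncated if-condition is approximated as `done or len(buffer) > time_horizon`, matching the visible fragments.

```python
def build_trajectory_if_done(
    experience_buffer, next_info, next_idx, agent_id, time_horizon, use_vec_obs
):
    """Return (Trajectory, emptied buffer) when the episode ended or the buffer
    exceeded time_horizon; otherwise return (None, unchanged buffer)."""
    done = next_info.local_done[next_idx]
    if (done or len(experience_buffer) > time_horizon) and len(experience_buffer) > 0:
        # Only the observations of the step *after* the last experience are kept;
        # they are used to bootstrap the value estimate for GAE.
        next_obs = [
            next_info.visual_observations[i][next_idx]
            for i, _ in enumerate(next_info.visual_observations)
        ]
        if use_vec_obs:
            next_obs.append(next_info.vector_observations[next_idx])
        bootstrap_step = BootstrapExperience(obs=next_obs, agent_id=agent_id)
        trajectory = Trajectory(steps=experience_buffer, bootstrap_step=bootstrap_step)
        return trajectory, []
    return None, experience_buffer
```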
-     def add_policy_outputs(
-         self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
-     ) -> None:
-         """
-         Takes the output of the last action and store it into the training buffer.
-         """
-         actions = take_action_outputs["action"]
-         if self.policy.use_continuous_act:
-             actions_pre = take_action_outputs["pre_action"]
-             self.processing_buffer[agent_id]["actions_pre"].append(
-                 actions_pre[agent_idx]
-             )
-             epsilons = take_action_outputs["random_normal_epsilon"]
-             self.processing_buffer[agent_id]["random_normal_epsilon"].append(
-                 epsilons[agent_idx]
-             )
-         a_dist = take_action_outputs["log_probs"]
-         # value is a dictionary from name of reward to value estimate of the value head
-         self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
-         self.processing_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])
+     TODO: Remove.
      """
      def __str__(self):

              memory=memory,
              epsilon=self[agent_id]["random_normal_epsilon"][_exp],
          )
+         bootstrap_step = BootstrapExperience(obs=obs, agent_id=agent_id)
-         trajectory = Trajectory(steps=trajectory_list, next_step=experience)
+         trajectory = Trajectory(steps=trajectory_list, bootstrap_step=bootstrap_step)
          return trajectory
      def append_all_agent_batch_to_update_buffer(

ml-agents/mlagents/trainers/ppo/policy.py (53 changes)


  from mlagents.tf_utils import tf
  from mlagents.envs.timers import timed
- from mlagents.envs.brain import BrainInfo, BrainParameters
+ from mlagents.envs.brain import BrainParameters
- from mlagents.trainers.agent_processor import AgentExperience, split_obs
+ from mlagents.trainers.agent_processor import BootstrapExperience, split_obs
  from mlagents.trainers.ppo.models import PPOModel
  from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.trainers.buffer import AgentBuffer

          feed_dict[model.memory_in] = mem_in
          return feed_dict
-     def get_value_estimates(
-         self, brain_info: BrainInfo, idx: int, done: bool
-     ) -> Dict[str, float]:
-         """
-         Generates value estimates for bootstrapping.
-         :param brain_info: BrainInfo to be used for bootstrapping.
-         :param idx: Index in BrainInfo of agent.
-         :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
-         :return: The value estimate dictionary with key being the name of the reward signal and the value the
-         corresponding value estimate.
-         """
-         feed_dict: Dict[tf.Tensor, Any] = {
-             self.model.batch_size: 1,
-             self.model.sequence_length: 1,
-         }
-         for i in range(len(brain_info.visual_observations)):
-             feed_dict[self.model.visual_in[i]] = [
-                 brain_info.visual_observations[i][idx]
-             ]
-         if self.use_vec_obs:
-             feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
-         if self.use_recurrent:
-             feed_dict[self.model.memory_in] = self.retrieve_memories([idx])
-         if not self.use_continuous_act and self.use_recurrent:
-             feed_dict[self.model.prev_action] = self.retrieve_previous_action([idx])
-         value_estimates = self.sess.run(self.model.value_heads, feed_dict)
-         value_estimates = {k: float(v) for k, v in value_estimates.items()}
-         # If we're done, reassign all of the value estimates that need terminal states.
-         if done:
-             for k in value_estimates:
-                 if self.reward_signals[k].use_terminal_states:
-                     value_estimates[k] = 0.0
-         return value_estimates
      def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
          feed_dict: Dict[tf.Tensor, Any] = {
              self.model.batch_size: batch.num_experiences,

          return value_estimates
-     def get_value_estimates2(self, experience: AgentExperience) -> Dict[str, float]:
+     def get_value_estimates(
+         self, experience: BootstrapExperience, done: bool
+     ) -> Dict[str, float]:
          """
          Generates value estimates for bootstrapping.
          :param brain_info: BrainInfo to be used for bootstrapping.

              self.model.batch_size: 1,
              self.model.sequence_length: 1,
          }
+         agent_id = experience.agent_id
          vec_vis_obs = split_obs(experience.obs)
          for i in range(len(vec_vis_obs.visual_observations)):
              feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]

          if self.use_recurrent:
-             feed_dict[self.model.memory_in] = [experience.memory]
+             feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
-             feed_dict[self.model.prev_action] = [experience.prev_action]
+             feed_dict[self.model.prev_action] = self.retrieve_memories([agent_id])
-         if experience.done:
+         if done:
              for k in value_estimates:
                  if self.reward_signals[k].use_terminal_states:
                      value_estimates[k] = 0.0
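
Read end to end, the reworked bootstrapping method looks roughly like the following. Lines the diff does not show (the vector-observation feed and the session run) are assumed to mirror the removed BrainInfo-based method above. Note that the diff feeds self.retrieve_memories([agent_id]) into model.prev_action; that is reproduced here as shown, though retrieve_previous_action may have been intended.

```python
def get_value_estimates(
    self, experience: BootstrapExperience, done: bool
) -> Dict[str, float]:
    # Sketch of a PPOPolicy method, assembled from the visible +/- lines of this diff.
    feed_dict: Dict[tf.Tensor, Any] = {
        self.model.batch_size: 1,
        self.model.sequence_length: 1,
    }
    agent_id = experience.agent_id
    vec_vis_obs = split_obs(experience.obs)
    for i in range(len(vec_vis_obs.visual_observations)):
        feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
    if self.use_vec_obs:  # assumed, mirroring the removed method
        feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
    if self.use_recurrent:
        feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
        # As shown in the diff; possibly meant to be retrieve_previous_action.
        feed_dict[self.model.prev_action] = self.retrieve_memories([agent_id])
    value_estimates = self.sess.run(self.model.value_heads, feed_dict)  # assumed
    value_estimates = {k: float(v) for k, v in value_estimates.items()}
    # Reward signals that use terminal states get a zero bootstrap when the episode ended.
    if done:
        for k in value_estimates:
            if self.reward_signals[k].use_terminal_states:
                value_estimates[k] = 0.0
    return value_estimates
```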

ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)


          for name, v in value_estimates.items():
              agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
-         value_next = self.policy.get_value_estimates2(trajectory.next_step)
+         value_next = self.policy.get_value_estimates(
+             trajectory.bootstrap_step, trajectory.steps[-1].done
+         )
          # Evaluate all reward functions
          self.collected_rewards["environment"][agent_id] += np.sum(

ml-agents/mlagents/trainers/rl_trainer.py (2 changes)


              if step < len(trajectory.steps) - 1:
                  next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
              else:
-                 next_vec_vis_obs = split_obs(trajectory.next_step.obs)
+                 next_vec_vis_obs = split_obs(trajectory.bootstrap_step.obs)
              for i, _ in enumerate(vec_vis_obs.visual_observations):
                  agent_buffer_trajectory["visual_obs%d" % i].append(
                      vec_vis_obs.visual_observations[i]
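
split_obs, used here on both the in-trajectory observations and bootstrap_step.obs, is not part of this diff. A plausible sketch, assuming visual observations are the rank-3 (H, W, C) arrays in the obs list and everything else forms the vector observation:

```python
from typing import List

import numpy as np


def split_obs(obs: List[np.ndarray]) -> SplitObservations:
    # Assumed behaviour: rank-3 arrays are visual observations, the rest are
    # concatenated into a single vector observation.
    vis_obs: List[np.ndarray] = []
    vec_obs: List[np.ndarray] = []
    for observation in obs:
        if len(observation.shape) == 3:
            vis_obs.append(observation)
        else:
            vec_obs.append(observation)
    vector = np.concatenate(vec_obs) if vec_obs else np.array([], dtype=np.float32)
    return SplitObservations(vector_observations=vector, visual_observations=vis_obs)
```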
