
Remove BootstrapExperience

Branch: /develop-newnormalization
Ervin Teng, 5 years ago
Commit 97d66e71
9 changed files with 36 additions and 43 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (23 changes)
  2. ml-agents/mlagents/trainers/ppo/policy.py (6 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (5 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (4 changes)
  5. ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)
  6. ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)
  7. ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)
  8. ml-agents/mlagents/trainers/trainer_controller.py (4 changes)
  9. ml-agents/mlagents/trainers/trajectory.py (22 changes)

ml-agents/mlagents/trainers/agent_processor.py (23 changes)


  import numpy as np
  from mlagents.trainers.trainer import Trainer
- from mlagents.trainers.trajectory import (
-     Trajectory,
-     AgentExperience,
-     BootstrapExperience,
- )
+ from mlagents.trainers.trajectory import Trajectory, AgentExperience
  from mlagents.envs.brain import BrainInfo
  from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.envs.action_info import ActionInfoOutputs

      One AgentProcessor should be created per agent group.
      """
-     def __init__(self, trainer: Trainer, policy: TFPolicy, time_horizon: int):
+     def __init__(self, trainer: Trainer, policy: TFPolicy, max_trajectory_length: int):
-         :param time_horizon: Maximum length of a trajectory before it is added to the trainer.
+         :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
          """
          self.experience_buffers: Dict[str, List] = defaultdict(list)
          self.last_brain_info: Dict[str, BrainInfo] = {}

          # that really should be gathered by the env-manager.
          self.policy = policy
          self.episode_steps: Dict[str, int] = {}
-         self.time_horizon = time_horizon
+         self.max_trajectory_length = max_trajectory_length
          self.trainer = trainer

      def __str__(self):

          if (
              next_info.local_done[next_idx]
-             or len(self.experience_buffers[agent_id]) >= self.time_horizon
+             or len(self.experience_buffers[agent_id])
+             >= self.max_trajectory_length
          ) and len(self.experience_buffers[agent_id]) > 0:
              # Make next AgentExperience
              next_obs = []

              next_obs.append(next_info.vector_observations[next_idx])
-             bootstrap_step = BootstrapExperience(
-                 obs=next_obs, agent_id=agent_id
-             )
-             trajectory = Trajectory(
-                 steps=self.experience_buffers[agent_id],
-                 bootstrap_step=bootstrap_step,
-             )
+             trajectory = Trajectory(steps=self.experience_buffers[agent_id])
              # This will eventually be replaced with a queue
              self.trainer.process_trajectory(trajectory)
              self.experience_buffers[agent_id] = []

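To see the shape of this change outside the ml-agents codebase, here is a minimal, self-contained sketch of the cutting logic the new AgentProcessor keeps: a per-agent experience buffer is flushed into a Trajectory as soon as the episode ends or the buffer reaches max_trajectory_length, with no separate bootstrap step attached. SimpleExperience, SimpleTrajectory, and SimpleProcessor are stand-ins invented for this example, not classes from the repository.

from collections import defaultdict
from typing import Dict, List, NamedTuple


class SimpleExperience(NamedTuple):
    obs: float
    done: bool


class SimpleTrajectory(NamedTuple):
    steps: List[SimpleExperience]


class SimpleProcessor:
    def __init__(self, max_trajectory_length: int):
        self.max_trajectory_length = max_trajectory_length
        self.experience_buffers: Dict[str, List[SimpleExperience]] = defaultdict(list)
        self.trajectories: List[SimpleTrajectory] = []

    def add_experience(self, agent_id: str, exp: SimpleExperience) -> None:
        # Buffer the new experience, then cut a trajectory when the episode
        # ends or the buffer reaches max_trajectory_length.
        buf = self.experience_buffers[agent_id]
        buf.append(exp)
        if (exp.done or len(buf) >= self.max_trajectory_length) and len(buf) > 0:
            self.trajectories.append(SimpleTrajectory(steps=list(buf)))
            self.experience_buffers[agent_id] = []


processor = SimpleProcessor(max_trajectory_length=3)
for i in range(7):
    processor.add_experience("agent_0", SimpleExperience(obs=float(i), done=(i == 6)))
print([len(t.steps) for t in processor.trajectories])  # -> [3, 3, 1]
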
ml-agents/mlagents/trainers/ppo/policy.py (6 changes)


  from mlagents.envs.timers import timed
  from mlagents.envs.brain import BrainParameters
  from mlagents.trainers.models import EncoderType, LearningRateSchedule
- from mlagents.trainers.trajectory import BootstrapExperience, split_obs
+ from mlagents.trainers.trajectory import split_obs, AgentExperience
  from mlagents.trainers.ppo.models import PPOModel
  from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.trainers.buffer import AgentBuffer

          return value_estimates

      def get_value_estimates(
-         self, experience: BootstrapExperience, done: bool
+         self, experience: AgentExperience, done: bool
-         :param experience: BootstrapExperience to be used for bootstrapping.
+         :param experience: AgentExperience to be used for bootstrapping.
          :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
          :return: The value estimate dictionary with key being the name of the reward signal and the value the
              corresponding value estimate.

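The signature change above means the bootstrap value is now computed from an ordinary AgentExperience rather than a dedicated BootstrapExperience. Below is a hedged sketch of that convention, with a toy value function standing in for the policy network; toy_value_estimate and the "extrinsic" reward-signal name are made up for illustration and are not the library's API.

from typing import Dict, List


def toy_value_estimate(obs: List[float]) -> Dict[str, float]:
    # Stand-in for a forward pass of the value heads: one entry per reward signal.
    return {"extrinsic": 0.1 * sum(obs)}


def get_value_estimates(obs: List[float], done: bool) -> Dict[str, float]:
    value_estimates = toy_value_estimate(obs)
    if done:
        # The episode truly ended (not a max_step interruption), so there is
        # nothing to bootstrap from and the estimate is forced to zero.
        value_estimates = {name: 0.0 for name in value_estimates}
    return value_estimates


print(get_value_estimates([1.0, 3.0], done=False))  # toy estimate from the observation
print(get_value_estimates([1.0, 3.0], done=True))   # every reward signal bootstraps with 0.0
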
ml-agents/mlagents/trainers/ppo/trainer.py (5 changes)


          agent_id = trajectory.steps[
              -1
          ].agent_id  # All the agents should have the same ID
+         # Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
+         # for bootstrapping purposes.
          agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
          # Update the normalization
          if self.is_training:

              agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
          value_next = self.policy.get_value_estimates(
-             trajectory.bootstrap_step,
+             trajectory.steps[-1],
              trajectory.steps[-1].done and not trajectory.steps[-1].max_step,
          )

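The value_next returned here feeds generalized advantage estimation. As a reference for what that bootstrap value does, below is a minimal GAE sketch (scalar rewards, a single reward signal, hypothetical numbers); it is not the trainer's implementation, only an illustration of where value_next enters the recursion and why it should be zero when the episode genuinely terminated rather than being cut off at max_step.

from typing import List


def gae_advantages(
    rewards: List[float],
    value_estimates: List[float],
    value_next: float,
    gamma: float = 0.99,
    lambd: float = 0.95,
) -> List[float]:
    # Standard GAE recursion: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
    # A_t = delta_t + gamma * lambd * A_{t+1}, with V(s_T) = value_next.
    values = value_estimates + [value_next]
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lambd * gae
        advantages[t] = gae
    return advantages


rewards = [0.0, 0.0, 1.0]
value_estimates = [0.5, 0.6, 0.7]
# Interrupted at max_step: bootstrap from the last observation's value estimate.
print(gae_advantages(rewards, value_estimates, value_next=0.8))
# Truly terminal episode: the bootstrap value is zero instead.
print(gae_advantages(rewards, value_estimates, value_next=0.0))
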
ml-agents/mlagents/trainers/sac/trainer.py (4 changes)


"""
last_step = trajectory.steps[-1]
agent_id = last_step.agent_id # All the agents should have the same ID
# Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
# for bootstrapping purposes.
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

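update_normalization consumes the vector observations of the freshly converted buffer. The real normalizer lives inside the TensorFlow graph; the NumPy sketch below only illustrates the kind of streaming mean/variance update this implies, using a Welford-style accumulator (RunningNormalizer is a name invented for this example).

import numpy as np


class RunningNormalizer:
    """Welford-style running mean/variance over vector observations."""

    def __init__(self, size: int):
        self.steps = 0
        self.mean = np.zeros(size, dtype=np.float64)
        self.m2 = np.zeros(size, dtype=np.float64)  # sum of squared deviations

    def update(self, vector_obs: np.ndarray) -> None:
        # vector_obs has shape (batch, size); fold each row into the running stats.
        for obs in vector_obs:
            self.steps += 1
            delta = obs - self.mean
            self.mean += delta / self.steps
            self.m2 += delta * (obs - self.mean)

    def normalize(self, obs: np.ndarray) -> np.ndarray:
        variance = self.m2 / max(self.steps - 1, 1)
        return (obs - self.mean) / np.sqrt(variance + 1e-8)


normalizer = RunningNormalizer(size=2)
normalizer.update(np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]))
print(normalizer.normalize(np.array([2.0, 3.0])))  # roughly zero: it sits at the running mean
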
ml-agents/mlagents/trainers/tests/test_agent_processor.py (2 changes)


  def test_agentprocessor(num_vis_obs):
      policy = create_mock_policy()
      trainer = mock.Mock()
-     processor = AgentProcessor(trainer, policy, time_horizon=5)
+     processor = AgentProcessor(trainer, policy, max_trajectory_length=5)
      fake_action_outputs = {
          "action": [0.1, 0.1],
          "value_heads": {},

ml-agents/mlagents/trainers/tests/test_ppo.py (9 changes)


dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
time_horizon = 15
length=15, max_step_complete=True, vec_obs_size=1, num_vis_obs=0, action_space=2
length=time_horizon + 1,
max_step_complete=True,
vec_obs_size=1,
num_vis_obs=0,
action_space=2,
)
trainer.process_trajectory(trajectory)

# Add a terminal trajectory
trajectory = make_fake_trajectory(
length=15,
length=time_horizon + 1,
max_step_complete=False,
vec_obs_size=1,
num_vis_obs=0,

ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)


  from mlagents.trainers.trajectory import (
      AgentExperience,
-     BootstrapExperience,
      Trajectory,
      split_obs,
      trajectory_to_agentbuffer,

          agent_id=agent_id,
      )
      steps_list.append(last_experience)
-     bootstrap_experience = BootstrapExperience(obs=obs, agent_id=agent_id)
-     return Trajectory(steps=steps_list, bootstrap_step=bootstrap_experience)
+     return Trajectory(steps=steps_list)

  @pytest.mark.parametrize("num_visual_obs", [0, 1, 2])

ml-agents/mlagents/trainers/trainer_controller.py (4 changes)


              processor=AgentProcessor(
                  trainer,
                  trainer.policy,
-                 trainer.parameters["time_horizon"],
+                 trainer.parameters["time_horizon"] + 1,
+                 # Note: for a trajectory to be useful for bootstrapping, we need the next state.
+                 # Hence we have the processor give us 1 more than the time horizon.
              )
          self.managers[name] = agent_manager
      last_brain_names = external_brains

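The "+ 1" here is the other half of the trajectory_to_agentbuffer change: the processor hands the trainer one extra experience whose only job is to supply the next observation. A toy check of the arithmetic, with hypothetical numbers:

# With time_horizon = 64, the processor cuts trajectories of 65 experiences;
# trajectory_to_agentbuffer drops the final experience (it only supplies the
# next observation), leaving exactly time_horizon usable entries per trajectory.
time_horizon = 64
max_trajectory_length = time_horizon + 1           # what the AgentProcessor collects
usable_buffer_entries = max_trajectory_length - 1  # what the trainer actually trains on
assert usable_buffer_entries == time_horizon
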
ml-agents/mlagents/trainers/trajectory.py (22 changes)


      agent_id: str

- class BootstrapExperience(NamedTuple):
-     """
-     A partial AgentExperience needed to bootstrap GAE.
-     """
-     obs: List[np.ndarray]
-     agent_id: str

  class SplitObservations(NamedTuple):
      vector_observations: np.ndarray
      visual_observations: List[np.ndarray]

      steps: List[AgentExperience]
-     bootstrap_step: BootstrapExperience  # The next step after the trajectory. Used for GAE.

  class AgentProcessorException(UnityException):

      """
      Converts a Trajectory to an AgentBuffer
      :param trajectory: A Trajectory
-     :returns: AgentBuffer
+     :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
+     less than the trajectory, as the next observation need to be populated from the last
+     step of the trajectory.
-     for step, exp in enumerate(trajectory.steps):
+     for step, exp in enumerate(trajectory.steps[:-1]):
-         if step < len(trajectory.steps) - 1:
-             next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
-         else:
-             next_vec_vis_obs = split_obs(trajectory.bootstrap_step.obs)
+         next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
          for i, _ in enumerate(vec_vis_obs.visual_observations):
              agent_buffer_trajectory["visual_obs%d" % i].append(
                  vec_vis_obs.visual_observations[i]

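Putting the trajectory.py changes together: the conversion now walks every step except the last and reads each step's next observation from its successor, so the resulting buffer is one entry shorter than the trajectory. The condensed sketch below mirrors that convention with a plain dict-of-lists standing in for AgentBuffer and a stripped-down Exp standing in for AgentExperience (both invented for this example, not the library's classes).

from collections import defaultdict
from typing import Dict, List, NamedTuple

import numpy as np


class Exp(NamedTuple):
    obs: np.ndarray
    reward: float
    done: bool


def to_buffer(steps: List[Exp]) -> Dict[str, List]:
    # Walk every step except the last; the next observation always comes from
    # the following step, so the last step exists only to supply next_obs.
    buffer: Dict[str, List] = defaultdict(list)
    for step, exp in enumerate(steps[:-1]):
        buffer["vector_obs"].append(exp.obs)
        buffer["next_vector_obs"].append(steps[step + 1].obs)
        buffer["rewards"].append(exp.reward)
        buffer["done"].append(exp.done)
    return buffer


steps = [Exp(obs=np.array([float(i)]), reward=0.0, done=(i == 4)) for i in range(5)]
buffer = to_buffer(steps)
print(len(buffer["vector_obs"]))  # 4: one entry fewer than the 5-step trajectory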