
Make conversion methods part of NamedTuples

/develop-newnormalization
Ervin Teng, 5 years ago
Current commit
fdf9aea7
7 changed files with 81 additions and 86 deletions
  1. ml-agents/mlagents/trainers/bc/trainer.py (4 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)
  3. ml-agents/mlagents/trainers/sac/trainer.py (10 changes)
  4. ml-agents/mlagents/trainers/tests/test_ppo.py (3 changes)
  5. ml-agents/mlagents/trainers/tests/test_trajectory.py (11 changes)
  6. ml-agents/mlagents/trainers/tf_policy.py (4 changes)
  7. ml-agents/mlagents/trainers/trajectory.py (131 changes)
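
In short, the two module-level helpers in trajectory.py become methods on the NamedTuples they operate on, so every call site changes from a free function call to a method call. A minimal before/after sketch of the call sites (trajectory and last_step are illustrative placeholders for the trainer-side objects that appear in the diffs below):

# Before: conversion helpers are free functions in trajectory.py
from mlagents.trainers.trajectory import split_obs, trajectory_to_agentbuffer

vec_vis_obs = split_obs(last_step.obs)
agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)

# After: the same conversions are exposed on the NamedTuples themselves
from mlagents.trainers.trajectory import SplitObservations, Trajectory

vec_vis_obs = SplitObservations.from_observations(last_step.obs)
agent_buffer_trajectory = trajectory.to_agentbuffer()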

ml-agents/mlagents/trainers/bc/trainer.py (4 changes)


from mlagents.trainers.bc.policy import BCPolicy
from mlagents.trainers.buffer import AgentBuffer
- from mlagents.trainers.trajectory import Trajectory, trajectory_to_agentbuffer
+ from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.trainer import Trainer
logger = logging.getLogger("mlagents.trainers")

Processing involves calculating value and advantage targets for model updating step.
"""
agent_id = trajectory.agent_id # All the experiences should have the same ID
- agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
+ agent_buffer_trajectory = trajectory.to_agentbuffer()
# Evaluate all reward functions
self.cumulative_rewards[agent_id] += np.sum(

ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)


from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.trainers.rl_trainer import RLTrainer
- from mlagents.trainers.trajectory import Trajectory, trajectory_to_agentbuffer
+ from mlagents.trainers.trajectory import Trajectory
logger = logging.getLogger("mlagents.trainers")

# Add to episode_steps
self.episode_steps[agent_id] += len(trajectory.steps)
- agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
+ agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

ml-agents/mlagents/trainers/sac/trainer.py (10 changes)


from mlagents.envs.timers import timed
from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.rl_trainer import RLTrainer
- from mlagents.trainers.trajectory import (
-     Trajectory,
-     trajectory_to_agentbuffer,
-     split_obs,
- )
+ from mlagents.trainers.trajectory import Trajectory, SplitObservations
LOGGER = logging.getLogger("mlagents.trainers")

# Add to episode_steps
self.episode_steps[agent_id] += len(trajectory.steps)
- agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
+ agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.max_step:
- vec_vis_obs = split_obs(last_step.obs)
+ vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:

ml-agents/mlagents/trainers/tests/test_ppo.py (3 changes)


from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.policy import PPOPolicy
- from mlagents.trainers.trajectory import trajectory_to_agentbuffer
from mlagents.trainers.brain import BrainParameters
from mlagents.envs.environment import UnityEnvironment
from mlagents.envs.mock_communicator import MockCommunicator

assert type(key) is str
assert val != 0.0
- agentbuffer = trajectory_to_agentbuffer(trajectory)
+ agentbuffer = trajectory.to_agentbuffer()
batched_values = policy.get_batched_value_estimates(agentbuffer)
for values in batched_values.values():
assert len(values) == 15

ml-agents/mlagents/trainers/tests/test_trajectory.py (11 changes)


import numpy as np
import pytest
- from mlagents.trainers.trajectory import (
-     AgentExperience,
-     Trajectory,
-     split_obs,
-     trajectory_to_agentbuffer,
- )
+ from mlagents.trainers.trajectory import AgentExperience, Trajectory, SplitObservations
VEC_OBS_SIZE = 6
ACTION_SIZE = 4

obs.append(np.ones((84, 84, 3), dtype=np.float32))
for i in range(num_vec_obs):
obs.append(np.ones(VEC_OBS_SIZE, dtype=np.float32))
- split_observations = split_obs(obs)
+ split_observations = SplitObservations.from_observations(obs)
if num_vec_obs == 1:
assert len(split_observations.vector_observations) == VEC_OBS_SIZE

]
wanted_keys = set(wanted_keys)
trajectory = make_fake_trajectory(length=length)
- agentbuffer = trajectory_to_agentbuffer(trajectory)
+ agentbuffer = trajectory.to_agentbuffer()
seen_keys = set()
for key, field in agentbuffer.items():
assert len(field) == length
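
For reference, the conversion can be exercised end to end the way this test does. A minimal sketch, assuming the make_fake_trajectory helper and the constants defined earlier in this test module:

# Build a fake trajectory and convert it with the new NamedTuple method.
length = 15
trajectory = make_fake_trajectory(length=length)
agentbuffer = trajectory.to_agentbuffer()

# As asserted in the test above, every field in the resulting AgentBuffer
# (e.g. "vector_obs", "actions", "done") ends up with `length` entries.
for key, field in agentbuffer.items():
    assert len(field) == length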

ml-agents/mlagents/trainers/tf_policy.py (4 changes)


from tensorflow.python.platform import gfile
from tensorflow.python.framework import graph_util
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
- from mlagents.trainers.trajectory import split_obs
+ from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.brain import BrainInfo

self.model.batch_size: 1,
self.model.sequence_length: 1,
}
- vec_vis_obs = split_obs(next_obs)
+ vec_vis_obs = SplitObservations.from_observations(next_obs)
for i in range(len(vec_vis_obs.visual_observations)):
feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]

ml-agents/mlagents/trainers/trajectory.py (131 changes)


vector_observations: np.ndarray
visual_observations: List[np.ndarray]
+ @staticmethod
+ def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
+     vis_obs_indices = []
+     vec_obs_indices = []
+     for index, observation in enumerate(obs):
+         if len(observation.shape) == 1:
+             vec_obs_indices.append(index)
+         if len(observation.shape) == 3:
+             vis_obs_indices.append(index)
+     vec_obs = (
+         np.concatenate([obs[i] for i in vec_obs_indices], axis=0)
+         if len(vec_obs_indices) > 0
+         else np.array([], dtype=np.float32)
+     )
+     vis_obs = [obs[i] for i in vis_obs_indices]
+     return SplitObservations(
+         vector_observations=vec_obs, visual_observations=vis_obs
+     )
class Trajectory(NamedTuple):
steps: List[AgentExperience]

agent_id: str
- def split_obs(obs: List[np.ndarray]) -> SplitObservations:
-     vis_obs_indices = []
-     vec_obs_indices = []
-     for index, observation in enumerate(obs):
-         if len(observation.shape) == 1:
-             vec_obs_indices.append(index)
-         if len(observation.shape) == 3:
-             vis_obs_indices.append(index)
-     vec_obs = (
-         np.concatenate([obs[i] for i in vec_obs_indices], axis=0)
-         if len(vec_obs_indices) > 0
-         else np.array([], dtype=np.float32)
-     )
-     vis_obs = [obs[i] for i in vis_obs_indices]
-     return SplitObservations(vector_observations=vec_obs, visual_observations=vis_obs)
+ def to_agentbuffer(self) -> AgentBuffer:
+     """
+     Converts a Trajectory to an AgentBuffer
+     :param trajectory: A Trajectory
+     :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
+     less than the trajectory, as the next observation need to be populated from the last
+     step of the trajectory.
+     """
+     agent_buffer_trajectory = AgentBuffer()
+     for step, exp in enumerate(self.steps):
+         vec_vis_obs = SplitObservations.from_observations(exp.obs)
+         if step < len(self.steps) - 1:
+             next_vec_vis_obs = SplitObservations.from_observations(
+                 self.steps[step + 1].obs
+             )
+         else:
+             next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
- def trajectory_to_agentbuffer(trajectory: Trajectory) -> AgentBuffer:
-     """
-     Converts a Trajectory to an AgentBuffer
-     :param trajectory: A Trajectory
-     :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
-     less than the trajectory, as the next observation need to be populated from the last
-     step of the trajectory.
-     """
-     agent_buffer_trajectory = AgentBuffer()
-     for step, exp in enumerate(trajectory.steps):
-         vec_vis_obs = split_obs(exp.obs)
-         if step < len(trajectory.steps) - 1:
-             next_vec_vis_obs = split_obs(trajectory.steps[step + 1].obs)
-         else:
-             next_vec_vis_obs = split_obs(trajectory.next_obs)
    for i, _ in enumerate(vec_vis_obs.visual_observations):
        agent_buffer_trajectory["visual_obs%d" % i].append(
            vec_vis_obs.visual_observations[i]
        )
        agent_buffer_trajectory["next_visual_obs%d" % i].append(
            next_vec_vis_obs.visual_observations[i]
        )
    agent_buffer_trajectory["vector_obs"].append(vec_vis_obs.vector_observations)
    agent_buffer_trajectory["next_vector_in"].append(
        next_vec_vis_obs.vector_observations
    )

    if exp.memory is not None:
        agent_buffer_trajectory["memory"].append(exp.memory)

    agent_buffer_trajectory["masks"].append(1.0)
    agent_buffer_trajectory["done"].append(exp.done)
    # Add the outputs of the last eval
    if exp.action_pre is not None:
        actions_pre = exp.action_pre
        agent_buffer_trajectory["actions_pre"].append(actions_pre)

    # value is a dictionary from name of reward to value estimate of the value head
    agent_buffer_trajectory["actions"].append(exp.action)
    agent_buffer_trajectory["action_probs"].append(exp.action_probs)

    # Store action masks if necessary. Eventually these will be
    # None for continuous actions
    if exp.action_mask is not None:
        agent_buffer_trajectory["action_mask"].append(
            exp.action_mask, padding_value=1
        )

    agent_buffer_trajectory["prev_action"].append(exp.prev_action)

    # Add the value outputs if needed
    agent_buffer_trajectory["environment_rewards"].append(exp.reward)
return agent_buffer_trajectory
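
As a usage sketch of the new static method (assuming the ml-agents package at this commit is importable), from_observations partitions a mixed observation list by rank: 1-D arrays are concatenated into vector_observations, and 3-D arrays are collected into the visual_observations list.

import numpy as np

from mlagents.trainers.trajectory import SplitObservations

# One 84x84x3 visual observation plus two 6-element vector observations,
# the same shapes used in test_trajectory.py above.
obs = [
    np.ones((84, 84, 3), dtype=np.float32),
    np.ones(6, dtype=np.float32),
    np.ones(6, dtype=np.float32),
]
split = SplitObservations.from_observations(obs)

# The two 1-D observations are concatenated along axis 0 ...
assert split.vector_observations.shape == (12,)
# ... and the single 3-D observation is kept as a list entry.
assert len(split.visual_observations) == 1
assert split.visual_observations[0].shape == (84, 84, 3)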