
use sensor types to differentiate obs (#4749)

/goal-conditioning
GitHub, 4 years ago
Current commit 22658a40
9 files changed, with 85 insertions and 42 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (12 changes)
  2. ml-agents/mlagents/trainers/policy/policy.py (8 changes)
  3. ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)
  6. ml-agents/mlagents/trainers/sac/trainer.py (6 changes)
  7. ml-agents/mlagents/trainers/torch/model_serialization.py (10 changes)
  8. ml-agents/mlagents/trainers/torch/networks.py (23 changes)
  9. ml-agents/mlagents/trainers/trajectory.py (52 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (12 changes)


def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
- vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, 1:]]
- goals = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, :1]]
+ vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
+ goals = [ModelUtils.list_to_tensor(batch["goals"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(

memory = torch.zeros([1, 1, self.policy.m_size])
- vec_vis_obs = SplitObservations.from_observations(next_obs)
+ vec_vis_obs = SplitObservations.from_observations(
+     next_obs, self.policy.behavior_spec
+ )
- ModelUtils.list_to_tensor(vec_vis_obs.vector_observations[1:]).unsqueeze(0)
+ ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)

# goals don't change, but otherwise broken
- next_goals = [torch.as_tensor(vec_vis_obs.vector_observations[:1])]
+ next_goals = [torch.as_tensor(vec_vis_obs.goals)]
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
vector_obs, visual_obs, goals, memory, sequence_length=batch.num_experiences
)
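For context on what this hunk changes: goals used to be carved out of the first column of the flattened vector observation, and are now read from their own "goals" buffer key while the vector observation is passed through whole. A minimal sketch of the two conventions, using a plain dict as a stand-in for AgentBuffer (the shapes below are illustrative, not taken from this commit):

```python
import numpy as np
import torch

# Stand-in for an AgentBuffer batch: 5 steps, 8 vector features, 2 goal values.
batch = {
    "vector_obs": np.random.rand(5, 8).astype(np.float32),
    "goals": np.random.rand(5, 2).astype(np.float32),
}

# Old convention: the goal was assumed to occupy column 0 of vector_obs.
old_goals = torch.as_tensor(batch["vector_obs"])[:, :1]
old_vector_obs = torch.as_tensor(batch["vector_obs"])[:, 1:]

# New convention: goals live under a dedicated buffer key; vector_obs stays whole.
new_goals = torch.as_tensor(batch["goals"])
new_vector_obs = torch.as_tensor(batch["vector_obs"])

print(old_vector_obs.shape, new_vector_obs.shape)  # torch.Size([5, 7]) torch.Size([5, 8])
```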

ml-agents/mlagents/trainers/policy/policy.py (8 changes)


from typing import Dict, List, Optional
import numpy as np
- from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
+ from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps, SensorType
from mlagents_envs.exception import UnityException
from mlagents.trainers.action_info import ActionInfo

else [self.behavior_spec.action_spec.continuous_size]
)
self.vec_obs_size = sum(
- shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
+ shape[0]
+ for shape, obs_type in zip(
+     behavior_spec.observation_shapes, behavior_spec.sensor_types
+ )
+ if len(shape) == 1 and obs_type == SensorType.OBSERVATION
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
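A hedged sketch of the new vec_obs_size computation in isolation. The filtering logic is taken from the hunk above; SensorType here is a local stand-in for the enum this branch adds to mlagents_envs.base_env, and the example shapes are made up:

```python
from enum import Enum
from typing import List, Tuple

class SensorType(Enum):  # stand-in for the branch's mlagents_envs addition
    OBSERVATION = 0
    PARAMETERIZATION = 1  # goal / parameterization sensors

def vec_obs_size(observation_shapes: List[Tuple[int, ...]],
                 sensor_types: List[SensorType]) -> int:
    # Only 1-D shapes coming from plain observation sensors count toward the
    # vector observation size; goal sensors are handled separately.
    return sum(
        shape[0]
        for shape, obs_type in zip(observation_shapes, sensor_types)
        if len(shape) == 1 and obs_type == SensorType.OBSERVATION
    )

print(vec_obs_size(
    [(8,), (2,), (84, 84, 3)],
    [SensorType.OBSERVATION, SensorType.PARAMETERIZATION, SensorType.OBSERVATION],
))  # -> 8
```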

ml-agents/mlagents/trainers/policy/torch_policy.py (10 changes)


def _split_decision_step(
self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
- vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
+ vec_vis_obs = SplitObservations.from_observations(
+     decision_requests.obs, self.behavior_spec
+ )
mask = None
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.
"""
- vector_obs = [torch.as_tensor(vector_obs)[:, 1:]]
+ vector_obs = [torch.as_tensor(vector_obs)]
if self.use_vec_obs and self.normalize:
self.actor_critic.update_normalization(vector_obs)

:return: Outputs from network as defined by self.inference_dict.
"""
vec_vis_obs, masks = self._split_decision_step(decision_requests)
- vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations[:, 1:])]
+ vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
- goals = [torch.as_tensor(vec_vis_obs.vector_observations[:, :1])]
+ goals = [torch.as_tensor(vec_vis_obs.goals)]
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
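With update_normalization now receiving the unsliced vector observation, the running normalization statistics cover every feature rather than dropping the column that used to double as a goal. A small illustrative running-mean update under that assumption (this is not ML-Agents' normalizer implementation):

```python
import torch

# A batch of full vector observations (8 features); with the change above,
# normalization sees all 8 columns instead of columns 1..7 only.
vector_obs = torch.rand(32, 8)
running_mean = torch.zeros(8)
steps = 0

batch_mean = vector_obs.mean(dim=0)
new_steps = steps + vector_obs.shape[0]
# Incremental running-mean update over every feature.
running_mean = running_mean + (batch_mean - running_mean) * (vector_obs.shape[0] / new_steps)
steps = new_steps
print(running_mean.shape)  # torch.Size([8])
```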

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)


)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
- vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, 1:]]
- goals = [ModelUtils.list_to_tensor(batch["vector_obs"])[:, :1]]
+ vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
+ goals = [ModelUtils.list_to_tensor(batch["goals"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)

ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)


super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
- agent_buffer_trajectory = trajectory.to_agentbuffer()
+ agent_buffer_trajectory = trajectory.to_agentbuffer(self.policy.behavior_spec)
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

ml-agents/mlagents/trainers/sac/trainer.py (6 changes)


last_step = trajectory.steps[-1]
agent_id = trajectory.agent_id # All the agents should have the same ID
- agent_buffer_trajectory = trajectory.to_agentbuffer()
+ agent_buffer_trajectory = trajectory.to_agentbuffer(self.policy.behavior_spec)
# Update the normalization
if self.is_training:

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.interrupted:
- vec_vis_obs = SplitObservations.from_observations(last_step.obs)
+ vec_vis_obs = SplitObservations.from_observations(
+     last_step.obs, self.policy.behavior_spec
+ )
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:

ml-agents/mlagents/trainers/torch/model_serialization.py (10 changes)


for shape in self.policy.behavior_spec.observation_shapes
if len(shape) == 3
]
+ dummy_goals = [torch.zeros(batch_dim + [1])]
dummy_masks = torch.ones(
batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)]
)

- self.dummy_input = (dummy_vec_obs, dummy_vis_obs, dummy_masks, dummy_memories)
+ self.dummy_input = (
+     dummy_vec_obs,
+     dummy_vis_obs,
+     dummy_goals,
+     dummy_masks,
+     dummy_memories,
+ )
+ ["goals"]
+ [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)]
+ ["action_masks", "memories"]
)
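The exported model's dummy inputs and input names gain a goals entry. A small sketch of the resulting input-name list; vis_obs_size = 2 is an illustrative value, and the leading "vector_observation" name is an assumption about ML-Agents' usual serialization names, while "goals", the visual names, "action_masks", and "memories" come from the hunk above:

```python
# Assumed input-name layout after the change; only "vector_observation" is guessed.
vis_obs_size = 2
input_names = (
    ["vector_observation"]
    + ["goals"]
    + [f"visual_observation_{i}" for i in range(vis_obs_size)]
    + ["action_masks", "memories"]
)
print(input_names)
# ['vector_observation', 'goals', 'visual_observation_0',
#  'visual_observation_1', 'action_masks', 'memories']
```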

ml-agents/mlagents/trainers/torch/networks.py (23 changes)


self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
- goal: List[torch.tensor],
+ goals: List[torch.tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

)
- output = self.value_heads(encoding, goal)
+ output = self.value_heads(encoding, goals)
return output, memories

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
- goal: List[torch.Tensor],
+ goals: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
+ goals: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[Union[int, torch.Tensor], ...]:

At this moment, torch.onnx.export() doesn't accept None as tensor to be exported,
so the size of return tuple varies with action spec.
"""
- vec_inputs = [vec_inputs[0][:, 1:]]
- goal = [vec_inputs[0][:, :1]]
encoding, memories_out = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=1
)

disc_action_out,
action_out_deprecated,
- ) = self.action_model.get_action_out(encoding, masks, goal)
+ ) = self.action_model.get_action_out(encoding, masks, goals)
export_out = [
self.version_number,
torch.Tensor([self.network_body.memory_size]),

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
- goal: List[torch.Tensor],
+ goals: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
+ goals: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(
- encoding, masks, actions, goal
+ encoding, masks, actions, goals
)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs

vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
+ goals: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
- action, log_probs, entropies = self.action_model(encoding, masks, goal)
+ action, log_probs, entropies = self.action_model(encoding, masks, goals)
value_outputs = self.value_heads(encoding)
return action, log_probs, entropies, value_outputs, memories

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
- goal: List[torch.Tensor],
+ goals: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

value_outputs, critic_mem_out = self.critic(
vec_inputs,
vis_inputs,
- goal,
+ goals,
memories=critic_mem,
sequence_length=sequence_length,
)
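Across networks.py, the goal tensor is now threaded through as an explicit goals argument (value heads, action model, critic) instead of being sliced off vec_inputs inside the exporter's forward. A minimal sketch of a value head conditioned on a separate goal tensor; ConditionedValueHead and its concatenation strategy are a stand-in for illustration, not the class used in this branch:

```python
import torch
import torch.nn as nn

class ConditionedValueHead(nn.Module):
    """Illustrative value head that consumes an encoding plus a goal tensor."""

    def __init__(self, encoding_size: int, goal_size: int):
        super().__init__()
        self.value = nn.Linear(encoding_size + goal_size, 1)

    def forward(self, encoding: torch.Tensor, goals: torch.Tensor) -> torch.Tensor:
        # Concatenate the body encoding with the goal before the value projection.
        return self.value(torch.cat([encoding, goals], dim=-1))

head = ConditionedValueHead(encoding_size=16, goal_size=2)
out = head(torch.zeros(4, 16), torch.zeros(4, 2))
print(out.shape)  # torch.Size([4, 1])
```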

ml-agents/mlagents/trainers/trajectory.py (52 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
- from mlagents_envs.base_env import ActionTuple
+ from mlagents_envs.base_env import ActionTuple, BehaviorSpec, SensorType
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

class SplitObservations(NamedTuple):
vector_observations: np.ndarray
visual_observations: List[np.ndarray]
+ goals: np.ndarray
- def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
+ def from_observations(obs: List[np.ndarray], behavior_spec) -> "SplitObservations":
"""
Divides a List of numpy arrays into a SplitObservations NamedTuple.
This allows you to access the vector and visual observations directly,

"""
vis_obs_list: List[np.ndarray] = []
vec_obs_list: List[np.ndarray] = []
+ goal_list: List[np.ndarray] = []
- for observation in obs:
-     # Obs could be batched or single
-     if len(observation.shape) == 1 or len(observation.shape) == 2:
-         vec_obs_list.append(observation)
-     if len(observation.shape) == 3 or len(observation.shape) == 4:
-         vis_obs_list.append(observation)
-     last_obs = observation
+ for observation, sensor_type in zip(obs, behavior_spec.sensor_types):
+     if sensor_type == SensorType.PARAMETERIZATION:
+         goal_list.append(observation)
+     elif sensor_type == SensorType.OBSERVATION:
+         # Obs could be batched or single
+         if len(observation.shape) == 1 or len(observation.shape) == 2:
+             vec_obs_list.append(observation)
+         if len(observation.shape) == 3 or len(observation.shape) == 4:
+             vis_obs_list.append(observation)
+         last_obs = observation
if last_obs is not None:
is_batched = len(last_obs.shape) == 2 or len(last_obs.shape) == 4
if is_batched:

else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
)
+ goals = (
+     np.concatenate(goal_list, axis=1)
+     if len(goal_list) > 0
+     else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
+ )
else:
vec_obs = (
np.concatenate(vec_obs_list, axis=0)

+ goals = (
+     np.concatenate(goal_list, axis=0)
+     if len(goal_list) > 0
+     else np.array([], dtype=np.float32)
+ )
- vector_observations=vec_obs, visual_observations=vis_obs_list
+ vector_observations=vec_obs, visual_observations=vis_obs_list, goals=goals
)

agent_id: str
behavior_id: str
- def to_agentbuffer(self) -> AgentBuffer:
+ def to_agentbuffer(self, behavior_spec: BehaviorSpec) -> AgentBuffer:
"""
Converts a Trajectory to an AgentBuffer
:param trajectory: A Trajectory

"""
agent_buffer_trajectory = AgentBuffer()
- vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
+ vec_vis_obs = SplitObservations.from_observations(
+     self.steps[0].obs, behavior_spec
+ )
- self.steps[step + 1].obs
+ self.steps[step + 1].obs, behavior_spec
- next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
+ next_vec_vis_obs = SplitObservations.from_observations(
+     self.next_obs, behavior_spec
+ )
for i, _ in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["visual_obs%d" % i].append(

agent_buffer_trajectory["next_vector_in"].append(
next_vec_vis_obs.vector_observations
)
agent_buffer_trajectory["goals"].append(vec_vis_obs.goals)
# this shouldnt be necessary in an optimized implementation since the goal does not change
agent_buffer_trajectory["next_goals"].append(next_vec_vis_obs.goals)
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)
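A hedged usage sketch of the reworked split: observations are routed by sensor type, with PARAMETERIZATION sensors collected into goals and everything else into vector or visual observations, which to_agentbuffer then writes under the new "goals" and "next_goals" keys. SensorType, Split, and split_by_sensor_type below are local stand-ins mirroring the diff's logic (batched case only); the shapes are made up:

```python
from enum import Enum
from typing import List, NamedTuple
import numpy as np

class SensorType(Enum):  # stand-in for the enum added to mlagents_envs.base_env
    OBSERVATION = 0
    PARAMETERIZATION = 1

class Split(NamedTuple):  # mirrors SplitObservations' new field layout
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]
    goals: np.ndarray

def split_by_sensor_type(obs: List[np.ndarray], sensor_types: List[SensorType]) -> Split:
    vec, vis, goals = [], [], []
    for observation, sensor_type in zip(obs, sensor_types):
        if sensor_type == SensorType.PARAMETERIZATION:
            goals.append(observation)          # goal / parameterization sensor
        elif observation.ndim in (1, 2):
            vec.append(observation)            # vector obs, single or batched
        elif observation.ndim in (3, 4):
            vis.append(observation)            # visual obs, single or batched
    # Batched case only, for brevity: concatenate along the feature axis.
    return Split(np.concatenate(vec, axis=1), vis, np.concatenate(goals, axis=1))

obs = [
    np.zeros((5, 8), np.float32),
    np.zeros((5, 2), np.float32),
    np.zeros((5, 84, 84, 3), np.float32),
]
types = [SensorType.OBSERVATION, SensorType.PARAMETERIZATION, SensorType.OBSERVATION]
split = split_by_sensor_type(obs, types)
print(split.vector_observations.shape, split.goals.shape)  # (5, 8) (5, 2)
```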
