
[WIP] Refactor trainers to use list of obs rather than vec and vis obs

/develop/rm-rf-new-models
vincentpierre, 4 years ago
Current commit
735fcd52
13 changed files with 207 additions and 259 deletions
  1. ml-agents/mlagents/trainers/demo_loader.py (8 changed lines)
  2. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (31 changed lines)
  3. ml-agents/mlagents/trainers/policy/policy.py (3 changed lines)
  4. ml-agents/mlagents/trainers/policy/torch_policy.py (35 changed lines)
  5. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (19 changed lines)
  6. ml-agents/mlagents/trainers/ppo/trainer.py (3 changed lines)
  7. ml-agents/mlagents/trainers/sac/optimizer_torch.py (58 changed lines)
  8. ml-agents/mlagents/trainers/sac/trainer.py (14 changed lines)
  9. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (19 changed lines)
  10. ml-agents/mlagents/trainers/torch/encoders.py (9 changed lines)
  11. ml-agents/mlagents/trainers/torch/networks.py (111 changed lines)
  12. ml-agents/mlagents/trainers/torch/utils.py (59 changed lines)
  13. ml-agents/mlagents/trainers/trajectory.py (97 changed lines)

ml-agents/mlagents/trainers/demo_loader.py (8 changed lines)


from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.trajectory import ObsUtil
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
split_obs = SplitObservations.from_observations(current_obs)
for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
for i, obs in enumerate(current_obs):
demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:

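The demo loader now keys every observation by its index instead of splitting the list into one "vector_obs" entry plus numbered "visual_obs%d" entries. A minimal standalone sketch of the naming scheme (a plain dict of lists stands in for AgentBuffer; the helper mirrors ObsUtil.get_name_at from this commit):

import numpy as np

def get_name_at(index: int) -> str:
    # Mirrors ObsUtil.get_name_at: one buffer key per observation index.
    return "obs_%d" % index

demo_raw_buffer = {}  # stand-in for AgentBuffer
current_obs = [
    np.zeros(8, dtype=np.float32),            # rank-1 (vector) observation
    np.zeros((84, 84, 3), dtype=np.float32),  # rank-3 (visual) observation
]
for i, obs in enumerate(current_obs):
    # Every observation lands under its own key, regardless of rank.
    demo_raw_buffer.setdefault(get_name_at(i), []).append(obs)

assert sorted(demo_raw_buffer) == ["obs_0", "obs_1"]
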
ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (31 changed lines)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
current_obs, memory, sequence_length=batch.num_experiences
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():

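Reading the batch back follows the same indexing: the optimizer asks how many observations the behavior spec declares, pulls obs_0 … obs_{n-1} out of the buffer, and converts each one to a tensor; the single terminal observation gets a batch dimension of 1 via unsqueeze(0). A standalone sketch of that flow (the dict again stands in for AgentBuffer, and from_buffer re-implements the ObsUtil helper):

import numpy as np
import torch

def from_buffer(batch, num_obs):
    # Re-implementation of ObsUtil.from_buffer: obs_i keys, in index order.
    return [batch["obs_%d" % i] for i in range(num_obs)]

batch = {
    "obs_0": [np.zeros(8, dtype=np.float32)] * 4,
    "obs_1": [np.zeros((84, 84, 3), dtype=np.float32)] * 4,
}
n_obs = 2  # len(behavior_spec.observation_shapes) in the real optimizer
current_obs = [torch.as_tensor(np.asarray(o)) for o in from_buffer(batch, n_obs)]

next_obs = [np.zeros(8, dtype=np.float32), np.zeros((84, 84, 3), dtype=np.float32)]
# The terminal observation is a single step, so add a batch dimension of 1.
next_obs = [torch.as_tensor(o).unsqueeze(0) for o in next_obs]

print([o.shape for o in current_obs])  # [torch.Size([4, 8]), torch.Size([4, 84, 84, 3])]
print([o.shape for o in next_obs])     # [torch.Size([1, 8]), torch.Size([1, 84, 84, 3])]
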
ml-agents/mlagents/trainers/policy/policy.py (3 changed lines)


from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.buffer import AgentBuffer
class UnityPolicyException(UnityException):

raise RuntimeError("NaN action detected.")
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
def update_normalization(self, buffer: AgentBuffer) -> None:
pass
@abstractmethod

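The abstract Policy interface changes accordingly: update_normalization now receives the whole AgentBuffer rather than a single vector_obs array. A sketch of the new shape of the base class, trimmed to the one method (the class name and the untyped buffer argument are illustrative assumptions):

from abc import ABC, abstractmethod

class PolicySketch(ABC):
    """Illustrative stand-in for the Policy base class."""

    @abstractmethod
    def update_normalization(self, buffer) -> None:
        # Subclasses decide which observations in the buffer feed the normalizers.
        ...
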
ml-agents/mlagents/trainers/policy/torch_policy.py (35 changed lines)


from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,

from mlagents.trainers.buffer import AgentBuffer
EPSILON = 1e-7 # Small value to avoid divide by zero

def _split_decision_step(
self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
) -> Tuple[List[np.array], np.ndarray]:
obs = decision_requests.obs
mask = None
if not self.use_continuous_act:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

)
return vec_vis_obs, mask
return obs, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:
def update_normalization(self, buffer: AgentBuffer) -> None:
vector_obs = [torch.as_tensor(vector_obs)]
self.actor_critic.update_normalization(vector_obs)
self.actor_critic.update_normalization(buffer)
vec_obs: List[torch.Tensor],
vis_obs: List[torch.Tensor],
obs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,

"""
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(

def evaluate_actions(
self,
vec_obs: torch.Tensor,
vis_obs: torch.Tensor,
obs: torch.Tensor,
actions: torch.Tensor,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)

:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
vec_vis_obs, masks = self._split_decision_step(decision_requests)
vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
vis_obs = [
torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
]
obs, masks = self._split_decision_step(decision_requests)
obs = [torch.as_tensor(np_ob) for np_ob in obs]
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)

action, clipped_action, log_probs, entropy, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
obs, masks=masks, memories=memories
)
run_out["pre_action"] = ModelUtils.to_numpy(action)

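With _split_decision_step no longer splitting by rank, evaluate() simply takes decision_requests.obs as an ordered list, wraps each array in a tensor, and hands the list to sample_actions together with the action mask and memories. A standalone sketch of that path (the fake decision-step arrays and sizes are assumptions for illustration):

import numpy as np
import torch

# Pretend DecisionSteps.obs: one array per observation, already batched over 3 agents.
decision_obs = [
    np.random.rand(3, 8).astype(np.float32),          # vector obs
    np.random.rand(3, 84, 84, 3).astype(np.float32),  # visual obs
]
obs = [torch.as_tensor(np_ob) for np_ob in decision_obs]
mask = torch.ones([3, 4])            # built only when the action space is discrete
memories = torch.zeros((1, 3, 16))   # retrieved memories, unsqueezed to (1, batch, size)

# The whole list is then passed straight through, e.g.
#   action, clipped_action, log_probs, entropy, memories = policy.sample_actions(
#       obs, masks=mask, memories=memories)
print([tuple(o.shape) for o in obs])  # [(3, 8), (3, 84, 84, 3)]
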
ml-agents/mlagents/trainers/ppo/optimizer_torch.py (19 changed lines)


from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil
class TorchPPOOptimizer(TorchOptimizer):

)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
else:
vis_obs = []
vec_obs,
vis_obs,
current_obs,
masks=act_masks,
actions=actions,
memories=memories,

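The PPO update no longer branches on use_vis_obs to assemble separate vec_obs and vis_obs lists; both ranks come out of the same per-index lookup. A before/after sketch of that assembly step (dicts stand in for AgentBuffer; the old branch is paraphrased from the removed code):

import numpy as np
import torch

batch = {
    "vector_obs": [np.zeros(8, dtype=np.float32)] * 2,
    "visual_obs0": [np.zeros((84, 84, 3), dtype=np.float32)] * 2,
    "obs_0": [np.zeros(8, dtype=np.float32)] * 2,
    "obs_1": [np.zeros((84, 84, 3), dtype=np.float32)] * 2,
}

# Old assembly: two lists, plus a branch on whether any visual encoders exist.
vec_obs = [torch.as_tensor(np.asarray(batch["vector_obs"]))]
vis_obs = [torch.as_tensor(np.asarray(batch["visual_obs0"]))]  # one per visual encoder

# New assembly: one list, one loop, no branch.
n_obs = 2
current_obs = [torch.as_tensor(np.asarray(batch["obs_%d" % i])) for i in range(n_obs)]
print(len(vec_obs) + len(vis_obs), "tensors before;", len(current_obs), "after")
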
ml-agents/mlagents/trainers/ppo/trainer.py (3 changed lines)


from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.buffer import AgentBuffer
logger = get_logger(__name__)

agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

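update_normalization now receives the whole trajectory buffer; inside the network body only the rank-1 observations (those that got a VectorInput encoder) actually feed a normalizer. A standalone sketch of that selection, assuming a toy running-mean normalizer in place of the real one:

import numpy as np
import torch

class RunningNorm:
    """Tiny running-mean tracker, standing in for VectorInput's normalizer."""
    def __init__(self, size):
        self.steps, self.mean = 0, torch.zeros(size)
    def update(self, x):
        self.steps += x.shape[0]
        self.mean += (x - self.mean).sum(dim=0) / self.steps

# encoders[i] is None where the i-th observation is visual (no normalization).
encoders = [RunningNorm(8), None]
buffer = {"obs_0": np.random.rand(5, 8).astype(np.float32),
          "obs_1": np.random.rand(5, 84, 84, 3).astype(np.float32)}

for i, enc in enumerate(encoders):
    if enc is not None:  # the real code checks isinstance(enc, VectorInput)
        enc.update(torch.as_tensor(buffer["obs_%d" % i]))
print(encoders[0].mean.shape)  # torch.Size([8])
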
ml-agents/mlagents/trainers/sac/optimizer_torch.py (58 changed lines)


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack
from mlagents.trainers.trajectory import ObsUtil
EPSILON = 1e-6 # Small value to avoid divide by zero

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
:param vec_inputs: List of vector observation tensors.
:param vis_input: List of visual observation tensors.
:param inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.

if not q1_grad:
stack.enter_context(torch.no_grad())
q1_out, _ = self.q1_network(
vec_inputs,
vis_inputs,
inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

stack.enter_context(torch.no_grad())
q2_out, _ = self.q2_network(
vec_inputs,
vis_inputs,
inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
n_obs = len(self.policy.behavior_spec.observation_shapes)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
next_obs = ObsUtil.from_buffer_next(batch, n_obs)
# Convert to tensors
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)

torch.zeros_like(next_memories) if next_memories is not None else None
)
vis_obs: List[torch.Tensor] = []
next_vis_obs: List[torch.Tensor] = []
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
next_vis_ob = ModelUtils.list_to_tensor(
batch["next_visual_obs%d" % idx]
)
next_vis_obs.append(next_vis_ob)
# Copy normalizers from policy
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body

self.policy.actor_critic.network_body
)
(sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
vec_obs,
vis_obs,
current_obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
current_obs, memories, sequence_length=self.policy.sequence_length
vec_obs,
vis_obs,
current_obs,
sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

vec_obs,
vis_obs,
current_obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
current_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,

vec_obs,
vis_obs,
current_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)

with torch.no_grad():
target_values, _ = self.target_network(
next_vec_obs,
next_vis_obs,
next_obs,
memories=next_memories,
sequence_length=self.policy.sequence_length,
)

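The Q1/Q2 value network takes the same single inputs list, and the q1_grad / q2_grad flags use contextlib.ExitStack to optionally wrap each sub-network's pass in torch.no_grad(). A standalone sketch of that gradient-gating pattern with two toy networks (the real networks are full NetworkBodies, not single linear layers):

from contextlib import ExitStack
import torch
from torch import nn

q1_network = nn.Linear(8, 1)
q2_network = nn.Linear(8, 1)

def value_forward(inputs, q1_grad=True, q2_grad=True):
    # inputs: list of observation tensors; only the vector one is used here.
    x = inputs[0]
    with ExitStack() as stack:
        if not q1_grad:
            stack.enter_context(torch.no_grad())
        q1_out = q1_network(x)
    with ExitStack() as stack:
        if not q2_grad:
            stack.enter_context(torch.no_grad())
        q2_out = q2_network(x)
    return q1_out, q2_out

obs_list = [torch.randn(4, 8)]
q1, q2 = value_forward(obs_list, q1_grad=False)  # e.g. no grads through Q1 for this pass
print(q1.requires_grad, q2.requires_grad)        # False True
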
ml-agents/mlagents/trainers/sac/trainer.py (14 changed lines)


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.trajectory import Trajectory, ObsUtil
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, SACSettings

# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(agent_buffer_trajectory)
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.interrupted:
vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:
agent_buffer_trajectory["next_vector_in"][
-1
] = vec_vis_obs.vector_observations
last_step_obs = last_step.obs
for i, obs in enumerate(last_step_obs):
agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer

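When an episode is cut off at max step, SAC still needs something to bootstrap from, so the trainer overwrites the last next-observation entries with the final step's observations and clears the done flag. A sketch of that patch-up on a plain dict buffer (get_name_at_next mirrors the ObsUtil helper):

import numpy as np

def get_name_at_next(index: int) -> str:
    return "next_obs_%d" % index

agent_buffer_trajectory = {
    "next_obs_0": [np.zeros(8, dtype=np.float32) for _ in range(3)],
    "done": [False, False, True],
}
last_step_obs = [np.ones(8, dtype=np.float32)]  # obs of the interrupted last step
interrupted = True

if interrupted:
    for i, obs in enumerate(last_step_obs):
        # Duplicate the last observation as its own "next" observation...
        agent_buffer_trajectory[get_name_at_next(i)][-1] = obs
    # ...and do not treat the cut-off as a terminal state.
    agent_buffer_trajectory["done"][-1] = False

print(agent_buffer_trajectory["done"])  # [False, False, False]
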
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (19 changed lines)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.trajectory import ObsUtil
def create_agent_buffer(

curr_observations = [
curr_obs = [
next_observations = [
next_obs = [
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)
for i, _ in enumerate(curr_split_obs.visual_observations):
buffer["visual_obs%d" % i].append(curr_split_obs.visual_observations[i])
buffer["next_visual_obs%d" % i].append(
next_split_obs.visual_observations[i]
)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
for i, obs in enumerate(curr_obs):
buffer[ObsUtil.get_name_at(i)].append(obs)
for i, obs in enumerate(next_obs):
buffer[ObsUtil.get_name_at_next(i)].append(obs)
buffer["actions"].append(action)
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))

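The reward-provider test helper builds the same key layout, so a round trip through the buffer should hand the observations back in index order. A small sketch of that check, using a plain dict and a re-implemented from_buffer rather than the actual test code:

import numpy as np

def get_name_at(i):
    return "obs_%d" % i

def get_name_at_next(i):
    return "next_obs_%d" % i

def from_buffer(buf, n):
    return [buf[get_name_at(i)] for i in range(n)]

buffer = {}
curr_obs = [np.zeros(8, dtype=np.float32), np.zeros((84, 84, 3), dtype=np.float32)]
next_obs = [np.ones(8, dtype=np.float32), np.ones((84, 84, 3), dtype=np.float32)]
for i, obs in enumerate(curr_obs):
    buffer.setdefault(get_name_at(i), []).append(obs)
for i, obs in enumerate(next_obs):
    buffer.setdefault(get_name_at_next(i), []).append(obs)

round_trip = from_buffer(buffer, len(curr_obs))
assert all(np.array_equal(a[0], b) for a, b in zip(round_trip, curr_obs))
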
ml-agents/mlagents/trainers/torch/encoders.py (9 changed lines)


from mlagents.trainers.torch.layers import linear_layer, Initialization, Swish
from mlagents.torch_utils import torch, nn
from mlagents.trainers.torch.model_serialization import exporting_to_onnx
class Normalizer(nn.Module):

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
if not exporting_to_onnx.is_exporting():
visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = torch.reshape(hidden, (-1, self.final_flat))
return self.dense(hidden)

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
if not exporting_to_onnx.is_exporting():
visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = torch.reshape(hidden, (-1, self.final_flat))
return self.dense(hidden)

)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
if not exporting_to_onnx.is_exporting():
visual_obs = visual_obs.permute([0, 3, 1, 2])
hidden = self.conv_layers(visual_obs)
hidden = hidden.view([-1, self.final_flat])
return self.dense(hidden)

self.sequential = nn.Sequential(*layers)
def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
if not exporting_to_onnx.is_exporting():
visual_obs = visual_obs.permute([0, 3, 1, 2])
batch_size = visual_obs.shape[0]
hidden = self.sequential(visual_obs)
before_out = hidden.view(batch_size, -1)

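Each visual encoder now handles the NHWC to NCHW permute itself, skipping it only during ONNX export, where Barracuda expects the original layout. A standalone sketch of that pattern with a minimal conv encoder; the boolean flag stands in for exporting_to_onnx.is_exporting() and the layer sizes are arbitrary:

import torch
from torch import nn

EXPORTING_TO_ONNX = False  # stand-in for exporting_to_onnx.is_exporting()

class TinyVisualEncoder(nn.Module):
    def __init__(self, height, width, channels, output_size):
        super().__init__()
        self.conv = nn.Conv2d(channels, 4, kernel_size=3, stride=2)
        flat = 4 * ((height - 3) // 2 + 1) * ((width - 3) // 2 + 1)
        self.dense = nn.Linear(flat, output_size)

    def forward(self, visual_obs):
        if not EXPORTING_TO_ONNX:
            # Trainers feed NHWC; convert to NCHW for the conv stack.
            visual_obs = visual_obs.permute([0, 3, 1, 2])
        hidden = self.conv(visual_obs)
        return self.dense(hidden.reshape(hidden.shape[0], -1))

enc = TinyVisualEncoder(84, 84, 3, 16)
print(enc(torch.zeros(2, 84, 84, 3)).shape)  # torch.Size([2, 16])
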
ml-agents/mlagents/trainers/torch/networks.py (111 changed lines)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads
from mlagents.trainers.torch.layers import LSTM, LinearEncoder
from mlagents.trainers.torch.model_serialization import exporting_to_onnx
from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import ObsUtil
ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
EncoderFunction = Callable[

else 0
)
self.visual_processors, self.vector_processors, encoder_input_size = ModelUtils.create_input_processors(
self.encoders, self.embedding_sizes = ModelUtils.create_input_processors(
total_enc_size = encoder_input_size + encoded_act_size
total_enc_size = sum(self.embedding_sizes) + encoded_act_size
self.linear_encoder = LinearEncoder(
total_enc_size, network_settings.num_layers, self.h_size
)

else:
self.lstm = None # type: ignore
def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None:
for vec_input, vec_enc in zip(vec_inputs, self.vector_processors):
vec_enc.update_normalization(vec_input)
def update_normalization(self, buffer: AgentBuffer) -> None:
obs = ObsUtil.from_buffer(buffer, len(self.encoders))
for vec_input, enc in zip(obs, self.encoders):
if isinstance(enc, VectorInput):
enc.update_normalization(torch.as_tensor(vec_input))
for n1, n2 in zip(self.vector_processors, other_network.vector_processors):
n1.copy_normalization(n2)
for n1, n2 in zip(self.encoders, other_network.encoders):
if isinstance(n1, VectorInput) and isinstance(n2, VectorInput):
n1.copy_normalization(n2)
@property
def memory_size(self) -> int:

self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
for idx, processor in enumerate(self.vector_processors):
vec_input = vec_inputs[idx]
processed_vec = processor(vec_input)
encodes.append(processed_vec)
for idx, processor in enumerate(self.visual_processors):
vis_input = vis_inputs[idx]
if not exporting_to_onnx.is_exporting():
vis_input = vis_input.permute([0, 3, 1, 2])
processed_vis = processor(vis_input)
encodes.append(processed_vis)
for idx, processor in enumerate(self.encoders):
obs_input = inputs[idx]
processed_obs = processor(obs_input)
encodes.append(processed_obs)
if len(encodes) == 0:
raise Exception("No valid inputs to network.")

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
vec_inputs, vis_inputs, actions, memories, sequence_length
inputs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories

@abc.abstractmethod
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
def update_normalization(self, buffer: AgentBuffer) -> None:
"""
Updates normalization of the Actor based on the provided AgentBuffer.
:param buffer: AgentBuffer containing the observations to update the normalizers with.

@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:

@abc.abstractmethod
def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

@abc.abstractmethod
def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

"""
Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
"""
dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
# This code converts the separate vec and vis inputs into a unified list of inputs
concatenated_vec_obs = vec_inputs[0]
inputs = []
start = 0
end = 0
vis_index = 0
for i, enc in enumerate(self.network_body.encoders):
if isinstance(enc, VectorInput):
# This is a vec_obs
vec_size = self.network_body.embedding_sizes[i]
end = start + vec_size
inputs.append(concatenated_vec_obs[:, start:end])
start = end
else:
inputs.append(vis_inputs[vis_index])
vis_index += 1
# End of code converting the separate vec and vis inputs into a unified list of inputs
dists, _ = self.get_dists(inputs, masks, memories, 1)
if self.action_spec.is_continuous():
action_list = self.sample_action(dists)
action_out = torch.stack(action_list, dim=-1)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
inputs, memories=memories, sequence_length=sequence_length
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
value_outputs, critic_mem_out = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
inputs, memories=critic_mem, sequence_length=sequence_length
)
if actor_mem is not None:
# Make memories with the actor mem unchanged

def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

critic_mem = None
actor_mem = None
dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
inputs,
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)

def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
def update_normalization(self, vector_obs: AgentBuffer) -> None:
super().update_normalization(vector_obs)
self.critic.network_body.update_normalization(vector_obs)

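The ONNX-facing forward() still receives one concatenated vector tensor plus separate visual tensors (that is what the exported Barracuda model consumes), so it has to rebuild the unified list: slice the concatenated vector obs according to each vector encoder's embedding size, and interleave the visual inputs in encoder order. A standalone sketch of that reconstruction, with a boolean list marking which encoder positions are vector inputs:

import torch

# Per-encoder layout, as the network body would know it:
#   True  -> VectorInput of the given size (slice it out of the concatenated tensor)
#   False -> visual encoder (take the next entry from vis_inputs)
encoder_is_vector = [True, False, True]
embedding_sizes = [8, 0, 4]  # only the vector entries' sizes matter in this sketch

vec_inputs = [torch.zeros(2, 12)]          # 8 + 4 concatenated vector obs
vis_inputs = [torch.zeros(2, 84, 84, 3)]   # one visual obs

concatenated_vec_obs = vec_inputs[0]
inputs, start, vis_index = [], 0, 0
for i, is_vec in enumerate(encoder_is_vector):
    if is_vec:
        end = start + embedding_sizes[i]
        inputs.append(concatenated_vec_obs[:, start:end])
        start = end
    else:
        inputs.append(vis_inputs[vis_index])
        vis_index += 1

print([tuple(x.shape) for x in inputs])
# [(2, 8), (2, 84, 84, 3), (2, 4)]
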
ml-agents/mlagents/trainers/torch/utils.py (59 changed lines)


)
@staticmethod
def get_encoder_for_obs(
shape: Tuple[int, ...],
normalize: bool,
h_size: int,
vis_encode_type: EncoderType
) -> Tuple[nn.Module, int]:
"""
Returns the encoder and the size of the generated embedding
"""
if len(shape) == 1:
# Case rank 1 tensor
return (VectorInput(shape[0], normalize), shape[0])
if len(shape) == 2:
raise UnityTrainerException(f"Unsupported shape of {shape} for observation")
if len(shape) == 3:
ModelUtils._check_resolution_for_encoder(
shape[0], shape[1], vis_encode_type
)
visual_encoder_class = ModelUtils.get_encoder_for_type(vis_encode_type)
return (visual_encoder_class(shape[0], shape[1], shape[2], h_size), h_size)
@staticmethod
def create_input_processors(
observation_shapes: List[Tuple[int, ...]],
h_size: int,

:param normalize: Normalize all vector inputs.
:return: Tuple of visual encoders and vector encoders each as a list.
"""
visual_encoders: List[nn.Module] = []
vector_encoders: List[nn.Module] = []
encoders: List[nn.Module] = []
embedding_sizes: List[int] = []
visual_encoder_class = ModelUtils.get_encoder_for_type(vis_encode_type)
vector_size = 0
visual_output_size = 0
if len(dimension) == 3:
ModelUtils._check_resolution_for_encoder(
dimension[0], dimension[1], vis_encode_type
)
visual_encoders.append(
visual_encoder_class(
dimension[0], dimension[1], dimension[2], h_size
)
)
visual_output_size += h_size
elif len(dimension) == 1:
vector_size += dimension[0]
else:
raise UnityTrainerException(
f"Unsupported shape of {dimension} for observation {i}"
)
if vector_size > 0:
vector_encoders.append(VectorInput(vector_size, normalize))
# Total output size for all inputs + CNNs
total_processed_size = vector_size + visual_output_size
return (
nn.ModuleList(visual_encoders),
nn.ModuleList(vector_encoders),
total_processed_size,
)
encoder, embedding_size = ModelUtils.get_encoder_for_obs(dimension, normalize, h_size, vis_encode_type)
encoders.append(encoder)
embedding_sizes.append(embedding_size)
return (encoders, embedding_sizes)
@staticmethod
def list_to_tensor(

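create_input_processors collapses into a loop over get_encoder_for_obs, which dispatches purely on the observation's rank and returns the encoder together with its embedding size; the total encoding width is just the sum of those sizes. A standalone sketch of that dispatch with placeholder encoder classes (the flatten-plus-linear "visual" encoder is an illustrative stand-in, not the real CNN):

from typing import Tuple
import torch
from torch import nn

class VectorInputSketch(nn.Module):
    """Stand-in for VectorInput: identity encoder whose embedding size is the obs size."""
    def __init__(self, size: int):
        super().__init__()
        self.size = size
    def forward(self, x):
        return x

def get_encoder_for_obs(shape: Tuple[int, ...], h_size: int) -> Tuple[nn.Module, int]:
    if len(shape) == 1:
        return VectorInputSketch(shape[0]), shape[0]
    if len(shape) == 3:
        # A real visual encoder would go here; flatten + linear keeps the sketch runnable.
        flat = shape[0] * shape[1] * shape[2]
        return nn.Sequential(nn.Flatten(), nn.Linear(flat, h_size)), h_size
    raise ValueError(f"Unsupported shape of {shape} for observation")

observation_shapes = [(8,), (84, 84, 3)]
encoders, embedding_sizes = [], []
for shape in observation_shapes:
    enc, size = get_encoder_for_obs(shape, h_size=128)
    encoders.append(enc)
    embedding_sizes.append(size)

total_enc_size = sum(embedding_sizes)  # feeds the LinearEncoder in the real network body
print(total_enc_size)  # 136
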
ml-agents/mlagents/trainers/trajectory.py (97 changed lines)


memory: np.ndarray
class SplitObservations(NamedTuple):
vector_observations: np.ndarray
visual_observations: List[np.ndarray]
class ObsUtil:
@staticmethod
def get_obs_with_rank(observations: List[np.array], rank: int) -> List[np.array]:
result = []
for obs in observations:
if len(obs.shape) == rank:
result += [obs]
return result
@staticmethod
def get_name_at(index: int) -> str:
return "obs_%d" % index
@staticmethod
def get_name_at_next(index: int) -> str:
return "next_obs_%d" % index
@staticmethod
def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]:
result = []
for i in range(num_obs):
result.append(batch[ObsUtil.get_name_at(i)])
return result
def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
"""
Divides a List of numpy arrays into a SplitObservations NamedTuple.
This allows you to access the vector and visual observations directly,
without enumerating the list over and over.
:param obs: List of numpy arrays (observation)
:returns: A SplitObservations object.
"""
vis_obs_list: List[np.ndarray] = []
vec_obs_list: List[np.ndarray] = []
last_obs = None
for observation in obs:
# Obs could be batched or single
if len(observation.shape) == 1 or len(observation.shape) == 2:
vec_obs_list.append(observation)
if len(observation.shape) == 3 or len(observation.shape) == 4:
vis_obs_list.append(observation)
last_obs = observation
if last_obs is not None:
is_batched = len(last_obs.shape) == 2 or len(last_obs.shape) == 4
if is_batched:
vec_obs = (
np.concatenate(vec_obs_list, axis=1)
if len(vec_obs_list) > 0
else np.zeros((last_obs.shape[0], 0), dtype=np.float32)
)
else:
vec_obs = (
np.concatenate(vec_obs_list, axis=0)
if len(vec_obs_list) > 0
else np.array([], dtype=np.float32)
)
else:
vec_obs = []
return SplitObservations(
vector_observations=vec_obs, visual_observations=vis_obs_list
)
def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]:
result = []
for i in range(num_obs):
result.append(batch[ObsUtil.get_name_at_next(i)])
return result
class Trajectory(NamedTuple):

step of the trajectory.
"""
agent_buffer_trajectory = AgentBuffer()
vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
obs = self.steps[0].obs
next_vec_vis_obs = SplitObservations.from_observations(
self.steps[step + 1].obs
)
next_obs = self.steps[step + 1].obs
next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
next_obs = self.next_obs
for i, _ in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["visual_obs%d" % i].append(
vec_vis_obs.visual_observations[i]
)
agent_buffer_trajectory["next_visual_obs%d" % i].append(
next_vec_vis_obs.visual_observations[i]
)
agent_buffer_trajectory["vector_obs"].append(
vec_vis_obs.vector_observations
)
agent_buffer_trajectory["next_vector_in"].append(
next_vec_vis_obs.vector_observations
)
num_obs = len(obs)
for i in range(num_obs):
agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i])
agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)

agent_buffer_trajectory["environment_rewards"].append(exp.reward)
# Store the next visual obs as the current
vec_vis_obs = next_vec_vis_obs
obs = next_obs
return agent_buffer_trajectory
@property

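to_agentbuffer now walks the steps once, writing every observation under obs_i and the following step's observation under next_obs_i, with the trajectory's stored next_obs closing out the last step. A standalone sketch of that pairing over a toy trajectory (plain lists and dicts in place of AgentExperience and AgentBuffer):

import numpy as np

steps_obs = [  # one obs list per step of the trajectory
    [np.full(8, 0.0, dtype=np.float32)],
    [np.full(8, 1.0, dtype=np.float32)],
]
trajectory_next_obs = [np.full(8, 2.0, dtype=np.float32)]  # obs after the last step

agent_buffer = {}
obs = steps_obs[0]
for step in range(len(steps_obs)):
    next_obs = steps_obs[step + 1] if step + 1 < len(steps_obs) else trajectory_next_obs
    for i in range(len(obs)):
        agent_buffer.setdefault("obs_%d" % i, []).append(obs[i])
        agent_buffer.setdefault("next_obs_%d" % i, []).append(next_obs[i])
    obs = next_obs  # store the next obs as the current for the following step

print([o[0] for o in agent_buffer["next_obs_0"]])  # [1.0, 2.0]
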