
Merge branch 'develop-unified-obs' into develop-centralizedcritic

/comms-grad
Ervin Teng, 4 years ago
Commit fdaa8c3d
15 changed files with 174 additions and 291 deletions
  1. ml-agents/mlagents/trainers/buffer.py (10 changed lines)
  2. ml-agents/mlagents/trainers/demo_loader.py (6 changed lines)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (28 changed lines)
  4. ml-agents/mlagents/trainers/policy/torch_policy.py (33 changed lines)
  5. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (16 changed lines)
  6. ml-agents/mlagents/trainers/ppo/trainer.py (6 changed lines)
  7. ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changed lines)
  8. ml-agents/mlagents/trainers/sac/trainer.py (16 changed lines)
  9. ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changed lines)
  10. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (29 changed lines)
  11. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changed lines)
  12. ml-agents/mlagents/trainers/torch/encoders.py (25 changed lines)
  13. ml-agents/mlagents/trainers/torch/networks.py (93 changed lines)
  14. ml-agents/mlagents/trainers/torch/utils.py (38 changed lines)
  15. ml-agents/mlagents/trainers/trajectory.py (29 changed lines)

ml-agents/mlagents/trainers/buffer.py (10 changed lines)


return len(next(iter(self.values())))
else:
return 0
@staticmethod
def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
"""
Converts a List of obs (each obs itself consisting of a List of np.ndarray, one per sensor)
to a List of np.ndarray, with the observations batched per sensor.
"""
# Transpose and convert List of Lists
new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
return new_list
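
To make the transposition concrete, here is a minimal standalone sketch of what obs_list_to_obs_batch does; the toy shapes (a 4-dim vector obs and a 2x2x1 visual obs over three steps) are illustrative only, not taken from the repository.

import numpy as np

# Three steps, each carrying two observations: a vector obs (4,) and a visual obs (2, 2, 1).
obs_list = [
    [np.zeros(4), np.zeros((2, 2, 1))],
    [np.ones(4), np.ones((2, 2, 1))],
    [np.full(4, 2.0), np.full((2, 2, 1), 2.0)],
]

# Same transpose-and-stack as obs_list_to_obs_batch: per-step lists become per-sensor batches.
batched = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
print(batched[0].shape)  # (3, 4)        vector obs batched over steps
print(batched[1].shape)  # (3, 2, 2, 1)  visual obs batched over steps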

ml-agents/mlagents/trainers/demo_loader.py (6 changed lines)


from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
from mlagents.trainers.trajectory import SplitObservations
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
split_obs = SplitObservations.from_observations(current_obs)
for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["obs"].append(current_obs)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (28 changed lines)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
next_obs = ModelUtils.list_to_tensor_list(next_obs)
vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
obs, memory, sequence_length=batch.num_experiences
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():
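
For reference, a condensed restatement of the change in this hunk using only the calls shown above (next_obs_tensors is a hypothetical local name introduced here for clarity):

# Before (split): one branch per modality, keyed fields "vector_obs" / "visual_obs%d".
# After (unified): a single list of per-sensor tensors built from the "obs" field.
obs = ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(batch["obs"]))
next_obs_tensors = ModelUtils.list_to_tensor_list(next_obs)
# Both lists can then feed the same critic call regardless of which sensors the agent has.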

ml-agents/mlagents/trainers/policy/torch_policy.py (33 changed lines)


def _split_decision_step(
self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
obs = ModelUtils.list_to_tensor_list(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

)
return vec_vis_obs, mask
return obs, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:
def update_normalization(self, vector_obs: List[np.ndarray]) -> None:
vector_obs = [torch.as_tensor(vector_obs)]
all_obs = ModelUtils.list_to_tensor_list(vector_obs)
self.actor_critic.update_normalization(vector_obs)
self.actor_critic.update_normalization(all_obs)
vec_obs: List[torch.Tensor],
vis_obs: List[torch.Tensor],
obs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,

entropies, and output memories, all as Torch Tensors.
"""
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
)
dists, memories = self.actor_critic.get_dists(obs, masks, memories, seq_len)
vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(

def evaluate_actions(
self,
vec_obs: torch.Tensor,
vis_obs: torch.Tensor,
obs: List[torch.Tensor],
actions: torch.Tensor,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)

:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
vec_vis_obs, masks = self._split_decision_step(decision_requests)
vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
vis_obs = [
torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
]
obs, masks = self._split_decision_step(decision_requests)
vec_obs, vis_obs, masks=masks, memories=memories
obs, masks=masks, memories=memories
)
run_out["pre_action"] = ModelUtils.to_numpy(action)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (16 changed lines)


)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
else:
vis_obs = []
vec_obs,
vis_obs,
obs,
masks=act_masks,
actions=actions,
memories=memories,

ml-agents/mlagents/trainers/ppo/trainer.py (6 changed lines)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (

agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
agent_buffer_trajectory["obs"]
)
self.policy.update_normalization(obs_to_normalize)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changed lines)


def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
:param vec_inputs: List of vector observation tensors.
:param vis_input: List of visual observation tensors.
:param net_inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.

if not q1_grad:
stack.enter_context(torch.no_grad())
q1_out, _ = self.q1_network(
vec_inputs,
vis_inputs,
net_inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

stack.enter_context(torch.no_grad())
q2_out, _ = self.q2_network(
vec_inputs,
vis_inputs,
net_inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
next_obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["next_obs"])
)
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)

torch.zeros_like(next_memories) if next_memories is not None else None
)
vis_obs: List[torch.Tensor] = []
next_vis_obs: List[torch.Tensor] = []
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
next_vis_ob = ModelUtils.list_to_tensor(
batch["next_visual_obs%d" % idx]
)
next_vis_obs.append(next_vis_ob)
# Copy normalizers from policy
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body

self.policy.actor_critic.network_body
)
(sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
vec_obs,
vis_obs,
obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
obs, memories, sequence_length=self.policy.sequence_length
vec_obs,
vis_obs,
obs,
sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

vec_obs,
vis_obs,
obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,

vec_obs,
vis_obs,
obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)

with torch.no_grad():
target_values, _ = self.target_network(
next_vec_obs,
next_vis_obs,
next_obs,
memories=next_memories,
sequence_length=self.policy.sequence_length,
)
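
The q1_grad/q2_grad handling above uses stack.enter_context(torch.no_grad()) to optionally disable gradient tracking per Q head. A self-contained sketch of that exit-stack pattern, assuming contextlib.ExitStack and hypothetical placeholder networks:

from contextlib import ExitStack
import torch
from torch import nn

def q_forward(q1_net: nn.Module, q2_net: nn.Module, x: torch.Tensor,
              q1_grad: bool = True, q2_grad: bool = True):
    # Enter torch.no_grad() only when gradients for that Q head are not needed,
    # e.g. when evaluating a Q head that should stay frozen for this update.
    with ExitStack() as stack:
        if not q1_grad:
            stack.enter_context(torch.no_grad())
        q1_out = q1_net(x)
    with ExitStack() as stack:
        if not q2_grad:
            stack.enter_context(torch.no_grad())
        q2_out = q2_net(x)
    return q1_out, q2_out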

ml-agents/mlagents/trainers/sac/trainer.py (16 changed lines)


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.buffer import AgentBuffer
from mlagents import tf_utils
if tf_utils.is_available():

# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
agent_buffer_trajectory["obs"]
)
self.policy.update_normalization(obs_to_normalize)
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.interrupted:
vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:
agent_buffer_trajectory["next_vector_in"][
-1
] = vec_vis_obs.vector_observations
agent_buffer_trajectory["next_obs"][-1] = last_step.obs
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer

ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changed lines)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils

"""
Helper function for update_batch.
"""
vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch_demo["obs"]), dtype=torch.float
)
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])

if self.policy.use_recurrent:
memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(
mini_batch_demo["visual_obs%d" % idx]
)
vis_obs.append(vis_ob)
else:
vis_obs = []
(
selected_actions,
clipped_actions,

) = self.policy.sample_actions(
vec_obs,
vis_obs,
obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (29 changed lines)


"""
Extracts the current state embedding from a mini_batch.
"""
n_vis = len(self._state_encoder.visual_processors)
vec_inputs=[
ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)
],
vis_inputs=[
ModelUtils.list_to_tensor(
mini_batch["visual_obs%d" % i], dtype=torch.float
)
for i in range(n_vis)
],
net_inputs=ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["obs"]), dtype=torch.float
)
)
return hidden

"""
n_vis = len(self._state_encoder.visual_processors)
vec_inputs=[
ModelUtils.list_to_tensor(
mini_batch["next_vector_in"], dtype=torch.float
)
],
vis_inputs=[
ModelUtils.list_to_tensor(
mini_batch["next_visual_obs%d" % i], dtype=torch.float
)
for i in range(n_vis)
],
net_inputs=ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["next_obs"]),
dtype=torch.float,
)
)
return hidden

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changed lines)


"""
Creates the observation input.
"""
n_vis = len(self.encoder.visual_processors)
n_vec = len(self.encoder.vector_processors)
vec_inputs = (
[ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)]
if n_vec > 0
else []
net_inputs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["obs"]), dtype=torch.float
vis_inputs = [
ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
for i in range(n_vis)
]
return vec_inputs, vis_inputs
return net_inputs
def compute_estimate(
self, mini_batch: AgentBuffer, use_vail_noise: bool = False

:param use_vail_noise: Only when using VAIL : If true, will sample the code, if
false, will return the mean of the code.
"""
vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
net_inputs = self.get_state_inputs(mini_batch)
hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
hidden, _ = self.encoder(net_inputs, action_inputs)
hidden, _ = self.encoder(vec_inputs, vis_inputs)
hidden, _ = self.encoder(net_inputs)
z_mu: Optional[torch.Tensor] = None
if self._settings.use_vail:
z_mu = self._z_mu_layer(hidden)

Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability, especially
for off-policy training. Computes gradients w.r.t. randomly interpolated inputs.
"""
policy_vec_inputs, policy_vis_inputs = self.get_state_inputs(policy_batch)
expert_vec_inputs, expert_vis_inputs = self.get_state_inputs(expert_batch)
interp_vec_inputs = []
for policy_vec_input, expert_vec_input in zip(
policy_vec_inputs, expert_vec_inputs
):
obs_epsilon = torch.rand(policy_vec_input.shape)
interp_vec_input = (
obs_epsilon * policy_vec_input + (1 - obs_epsilon) * expert_vec_input
)
interp_vec_input.requires_grad = True # For gradient calculation
interp_vec_inputs.append(interp_vec_input)
interp_vis_inputs = []
for policy_vis_input, expert_vis_input in zip(
policy_vis_inputs, expert_vis_inputs
):
obs_epsilon = torch.rand(policy_vis_input.shape)
interp_vis_input = (
obs_epsilon * policy_vis_input + (1 - obs_epsilon) * expert_vis_input
)
interp_vis_input.requires_grad = True # For gradient calculation
interp_vis_inputs.append(interp_vis_input)
policy_inputs = self.get_state_inputs(policy_batch)
expert_inputs = self.get_state_inputs(expert_batch)
interp_inputs = []
for policy_input, expert_input in zip(policy_inputs, expert_inputs):
obs_epsilon = torch.rand(policy_input.shape)
interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
interp_input.requires_grad = True # For gradient calculation
interp_inputs.append(interp_input)
if self._settings.use_actions:
policy_action = self.get_action_input(policy_batch)
expert_action = self.get_action_input(expert_batch)

dim=1,
)
action_inputs.requires_grad = True
hidden, _ = self.encoder(
interp_vec_inputs, interp_vis_inputs, action_inputs
)
encoder_input = tuple(
interp_vec_inputs + interp_vis_inputs + [action_inputs]
)
hidden, _ = self.encoder(interp_inputs, action_inputs)
encoder_input = tuple(interp_inputs + [action_inputs])
hidden, _ = self.encoder(interp_vec_inputs, interp_vis_inputs)
encoder_input = tuple(interp_vec_inputs + interp_vis_inputs)
hidden, _ = self.encoder(interp_inputs)
encoder_input = tuple(interp_inputs)
if self._settings.use_vail:
use_vail_noise = True
z_mu = self._z_mu_layer(hidden)
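
The interpolation loop above is the standard WGAN-GP construction from the cited paper: mix policy and expert observations with a random epsilon and enable gradients on the mixed inputs so the penalty can differentiate through them. A minimal standalone sketch, with hypothetical toy tensors for usage:

import torch

def interpolate_inputs(policy_inputs, expert_inputs):
    # One interpolated tensor per observation, gradients enabled for the penalty term.
    interp_inputs = []
    for policy_input, expert_input in zip(policy_inputs, expert_inputs):
        eps = torch.rand(policy_input.shape)
        interp = eps * policy_input + (1 - eps) * expert_input
        interp.requires_grad = True  # needed later by torch.autograd.grad on the discriminator output
        interp_inputs.append(interp)
    return interp_inputs

# Toy usage: one vector obs and one visual obs, batch of 5.
policy_obs = [torch.zeros(5, 8), torch.zeros(5, 84, 84, 3)]
expert_obs = [torch.ones(5, 8), torch.ones(5, 84, 84, 3)]
interp_obs = interpolate_inputs(policy_obs, expert_obs)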

ml-agents/mlagents/trainers/torch/encoders.py (25 changed lines)


return height, width
class VectorInput(nn.Module):
class InputProcessor:
def copy_normalization(self, other_input: "InputProcessor") -> None:
pass
def update_normalization(self, inputs: torch.Tensor) -> None:
pass
class VectorInput(nn.Module, InputProcessor):
def __init__(self, input_size: int, normalize: bool = False):
super().__init__()
self.normalizer: Optional[Normalizer] = None

inputs = self.normalizer(inputs)
return inputs
def copy_normalization(self, other_input: "VectorInput") -> None:
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def copy_normalization(self, other_input: "InputProcessor") -> None:
if isinstance(other_input, VectorInput):
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def update_normalization(self, inputs: torch.Tensor) -> None:
if self.normalizer is not None:

class SmallVisualEncoder(nn.Module):
class SmallVisualEncoder(nn.Module, InputProcessor):
"""
CNN architecture used by King in their Candy Crush predictor
https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning

return self.dense(hidden)
class SimpleVisualEncoder(nn.Module):
class SimpleVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return self.dense(hidden)
class NatureVisualEncoder(nn.Module):
class NatureVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return input_tensor + self.layers(input_tensor)
class ResNetVisualEncoder(nn.Module):
class ResNetVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):
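
The new InputProcessor base above gives every encoder, vector or visual, the same normalization interface, so the unified processors list in networks.py can call the same methods on all of them. A minimal sketch of that pattern with a hypothetical stand-in encoder (ToyVisualEncoder is not part of the repository):

import torch
from torch import nn

class InputProcessor:
    # Default no-ops; only encoders that actually normalize (e.g. VectorInput) override these.
    def copy_normalization(self, other_input: "InputProcessor") -> None:
        pass
    def update_normalization(self, inputs: torch.Tensor) -> None:
        pass

class ToyVisualEncoder(nn.Module, InputProcessor):  # hypothetical stand-in
    def __init__(self, in_features: int, output_size: int):
        super().__init__()
        self.dense = nn.Linear(in_features, output_size)
    def forward(self, visual_obs: torch.Tensor) -> torch.Tensor:
        return self.dense(visual_obs.flatten(start_dim=1))

# All processors can be treated uniformly, whether or not they normalize.
processors = [ToyVisualEncoder(4 * 4 * 3, 16)]
for proc in processors:
    proc.update_normalization(torch.zeros(1, 4, 4, 3))  # inherited no-op for this encoder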

ml-agents/mlagents/trainers/torch/networks.py (93 changed lines)


else 0
)
self.visual_processors, self.vector_processors, encoder_input_size = ModelUtils.create_input_processors(
self.processors, encoder_input_size = ModelUtils.create_input_processors(
observation_shapes,
self.h_size,
network_settings.vis_encode_type,

else:
self.lstm = None # type: ignore
def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None:
for vec_input, vec_enc in zip(vec_inputs, self.vector_processors):
vec_enc.update_normalization(vec_input)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
for _in, enc in zip(net_inputs, self.processors):
enc.update_normalization(_in)
for n1, n2 in zip(self.vector_processors, other_network.vector_processors):
for n1, n2 in zip(self.processors, other_network.processors):
n1.copy_normalization(n2)
@property

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
for idx, processor in enumerate(self.vector_processors):
vec_input = vec_inputs[idx]
processed_vec = processor(vec_input)
for idx, processor in enumerate(self.processors):
net_input = net_inputs[idx]
if not exporting_to_onnx.is_exporting() and len(net_input.shape) > 3:
net_input = net_input.permute([0, 3, 1, 2])
processed_vec = processor(net_input)
for idx, processor in enumerate(self.visual_processors):
vis_input = vis_inputs[idx]
if not exporting_to_onnx.is_exporting():
vis_input = vis_input.permute([0, 3, 1, 2])
processed_vis = processor(vis_input)
encodes.append(processed_vis)
if len(encodes) == 0:
raise Exception("No valid inputs to network.")

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, actions, memories, sequence_length
net_inputs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories

@abc.abstractmethod
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
"""
Updates normalization of Actor based on the provided List of observations.
:param net_inputs: A List of observation tensors.

@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:

@abc.abstractmethod
def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

@abc.abstractmethod
def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

def memory_size(self) -> int:
return self.network_body.memory_size
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(vector_obs)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(net_inputs)
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
actions = []

def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:

dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
dists, _ = self.get_dists(net_inputs, masks, memories, 1)
if self.action_spec.is_continuous():
action_list = self.sample_action(dists)
action_out = torch.stack(action_list, dim=-1)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
value_outputs, critic_mem_out = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
net_inputs, memories=critic_mem, sequence_length=sequence_length
)
if actor_mem is not None:
# Make memories with the actor mem unchanged

def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

critic_mem = None
actor_mem = None
dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
memories=actor_mem,
sequence_length=sequence_length,
masks=masks,
net_inputs, memories=actor_mem, sequence_length=sequence_length, masks=masks
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
net_inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)

def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
super().update_normalization(vector_obs)
self.critic.network_body.update_normalization(vector_obs)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
super().update_normalization(net_inputs)
self.critic.network_body.update_normalization(net_inputs)
class GlobalSteps(nn.Module):
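
In the forward hunk above, the single processors list replaces the separate vector and visual loops; the rank check decides whether an input is visual and needs the NHWC-to-NCHW permute. A condensed sketch of that loop, assuming self.processors and net_inputs line up one-to-one and ignoring the ONNX-export special case:

encodes = []
for processor, net_input in zip(self.processors, net_inputs):
    if len(net_input.shape) > 3:                     # visual obs arrive as [N, H, W, C]
        net_input = net_input.permute([0, 3, 1, 2])  # convert to [N, C, H, W] for the CNN
    encodes.append(processor(net_input))
if len(encodes) == 0:
    raise Exception("No valid inputs to network.")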

ml-agents/mlagents/trainers/torch/utils.py (38 changed lines)


h_size: int,
vis_encode_type: EncoderType,
normalize: bool = False,
) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
) -> Tuple[nn.ModuleList, int]:
"""
Creates visual and vector encoders, along with their normalizers.
:param observation_shapes: List of Tuples that represent the observation dimensions.

:param normalize: Normalize all vector inputs.
:return: Tuple of a ModuleList of all input encoders and the total encoded size.
"""
visual_encoders: List[nn.Module] = []
vector_encoders: List[nn.Module] = []
encoders: List[nn.Module] = []
vector_size = 0
visual_output_size = 0
total_encoded_size = 0
visual_encoders.append(
encoders.append(
visual_output_size += h_size
total_encoded_size += h_size
vector_size += dimension[0]
vector_size = dimension[0]
encoders.append(VectorInput(vector_size, normalize))
total_encoded_size += vector_size
if vector_size > 0:
vector_encoders.append(VectorInput(vector_size, normalize))
total_processed_size = vector_size + visual_output_size
return (
nn.ModuleList(visual_encoders),
nn.ModuleList(vector_encoders),
total_processed_size,
)
return (nn.ModuleList(encoders), total_encoded_size)
@staticmethod
def list_to_tensor(

calling as_tensor on the list directly.
"""
return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
@staticmethod
def list_to_tensor_list(
ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
) -> List[torch.Tensor]:
"""
Converts a list of numpy arrays into a list of tensors. MUCH faster than
calling as_tensor on the list directly.
"""
return [
torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list
]
@staticmethod
def to_numpy(tensor: torch.Tensor) -> np.ndarray:
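
The difference between list_to_tensor and the new list_to_tensor_list above is only in what gets stacked; a quick sketch with hypothetical shapes:

import numpy as np
import torch

arrays = [np.zeros((84, 84, 3), dtype=np.float32) for _ in range(8)]

# list_to_tensor-style: stack everything into one tensor (shapes must match across the list).
stacked = torch.as_tensor(np.asanyarray(arrays), dtype=torch.float32)  # shape (8, 84, 84, 3)

# list_to_tensor_list-style: keep one tensor per array (shapes may differ, e.g. one per sensor).
per_item = [torch.as_tensor(np.asanyarray(a), dtype=torch.float32) for a in arrays]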

ml-agents/mlagents/trainers/trajectory.py (29 changed lines)


step of the trajectory.
"""
agent_buffer_trajectory = AgentBuffer()
vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
curr_obs = self.steps[0].obs
next_vec_vis_obs = SplitObservations.from_observations(
self.steps[step + 1].obs
)
next_obs = self.steps[step + 1].obs
next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
for i, _ in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["visual_obs%d" % i].append(
vec_vis_obs.visual_observations[i]
)
agent_buffer_trajectory["next_visual_obs%d" % i].append(
next_vec_vis_obs.visual_observations[i]
)
next_obs = self.next_obs
agent_buffer_trajectory["obs"].append(curr_obs)
agent_buffer_trajectory["next_obs"].append(next_obs)
agent_buffer_trajectory["vector_obs"].append(
vec_vis_obs.vector_observations
)
agent_buffer_trajectory["next_vector_in"].append(
next_vec_vis_obs.vector_observations
)
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)

agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
# Store the next visual obs as the current
vec_vis_obs = next_vec_vis_obs
# Store the next obs as the current
curr_obs = next_obs
return agent_buffer_trajectory
@property
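
The trajectory conversion above now stores the whole per-step obs list under "obs"/"next_obs" instead of pre-splitting it into vector and visual fields. A condensed sketch of that flow, assuming self.steps, self.next_obs, and agent_buffer_trajectory as in the hunk:

curr_obs = self.steps[0].obs
for step, exp in enumerate(self.steps):
    # Next obs comes from the following step, or from the trajectory's stored next_obs at the end.
    if step < len(self.steps) - 1:
        next_obs = self.steps[step + 1].obs
    else:
        next_obs = self.next_obs
    agent_buffer_trajectory["obs"].append(curr_obs)
    agent_buffer_trajectory["next_obs"].append(next_obs)
    # Store the next obs as the current for the following iteration.
    curr_obs = next_obs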
