Comparing commits

...
This merge request contains changes that conflict with the target branch.
/ml-agents/mlagents/trainers/demo_loader.py
/ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
/ml-agents/mlagents/trainers/policy/torch_policy.py
/ml-agents/mlagents/trainers/ppo/optimizer_torch.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/sac/trainer.py
/ml-agents/mlagents/trainers/sac/optimizer_torch.py
/ml-agents/mlagents/trainers/tests/test_buffer.py
/ml-agents/mlagents/trainers/buffer.py
/ml-agents/mlagents/trainers/torch/components/bc/module.py
/ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py
/ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py
/ml-agents/mlagents/trainers/torch/utils.py
/ml-agents/mlagents/trainers/torch/networks.py
/ml-agents/mlagents/trainers/torch/encoders.py
/ml-agents/mlagents/trainers/trajectory.py

10 commits

16 files changed: 212 insertions and 331 deletions
  1. ml-agents/mlagents/trainers/trajectory.py (28 changes)
  2. ml-agents/mlagents/trainers/buffer.py (54 changes)
  3. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (28 changes)
  4. ml-agents/mlagents/trainers/policy/torch_policy.py (33 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (12 changes)
  6. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (16 changes)
  7. ml-agents/mlagents/trainers/tests/test_buffer.py (29 changes)
  8. ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changes)
  9. ml-agents/mlagents/trainers/sac/trainer.py (16 changes)
  10. ml-agents/mlagents/trainers/demo_loader.py (6 changes)
  11. ml-agents/mlagents/trainers/torch/utils.py (38 changes)
  12. ml-agents/mlagents/trainers/torch/networks.py (93 changes)
  13. ml-agents/mlagents/trainers/torch/encoders.py (25 changes)
  14. ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)
  15. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (29 changes)
  16. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changes)

ml-agents/mlagents/trainers/trajectory.py (28 changes)


step of the trajectory.
"""
agent_buffer_trajectory = AgentBuffer()
vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
curr_obs = self.steps[0].obs
next_vec_vis_obs = SplitObservations.from_observations(
self.steps[step + 1].obs
)
next_obs = self.steps[step + 1].obs
next_vec_vis_obs = SplitObservations.from_observations(self.next_obs)
for i, _ in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["visual_obs%d" % i].append(
vec_vis_obs.visual_observations[i]
)
agent_buffer_trajectory["next_visual_obs%d" % i].append(
next_vec_vis_obs.visual_observations[i]
)
agent_buffer_trajectory["vector_obs"].append(
vec_vis_obs.vector_observations
)
agent_buffer_trajectory["next_vector_in"].append(
next_vec_vis_obs.vector_observations
)
next_obs = self.next_obs
agent_buffer_trajectory["obs"].append(curr_obs)
agent_buffer_trajectory["next_obs"].append(next_obs)
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)

agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
# Store the next visual obs as the current
vec_vis_obs = next_vec_vis_obs
# Store the next obs as the current
curr_obs = next_obs
return agent_buffer_trajectory
@property
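The hunk above replaces the per-sensor "vector_obs"/"visual_obs%d" keys with whole observation lists stored under "obs" and "next_obs". A minimal standalone sketch of that bookkeeping, using a plain dict and illustrative shapes rather than the real AgentBuffer:

```python
from collections import defaultdict
import numpy as np

# Three steps, each observing one 3-dim vector and one 84x84x3 image
# (shapes are illustrative). final_obs stands in for self.next_obs.
steps_obs = [
    [np.full(3, i, dtype=np.float32), np.full((84, 84, 3), i, dtype=np.float32)]
    for i in range(3)
]
final_obs = [np.full(3, 3, dtype=np.float32), np.full((84, 84, 3), 3, dtype=np.float32)]

buffer = defaultdict(list)
curr_obs = steps_obs[0]
for step in range(len(steps_obs)):
    # The next step's observations, or the trajectory's final obs for the last step.
    next_obs = steps_obs[step + 1] if step + 1 < len(steps_obs) else final_obs
    buffer["obs"].append(curr_obs)
    buffer["next_obs"].append(next_obs)
    curr_obs = next_obs  # store the next obs as the current, as in the diff

print(len(buffer["obs"]), len(buffer["obs"][0]))  # 3 steps, 2 sensors per step
```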

ml-agents/mlagents/trainers/buffer.py (54 changes)


import numpy as np
import h5py
from typing import List, BinaryIO
from typing import List, BinaryIO, Any
import itertools
from mlagents_envs.exception import UnityException

class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
AgentBufferField with the append method.
AgentBufferField is a list of data, usually numpy arrays. When an agent collects a field,
you can add it to its AgentBufferField with the append method.
"""
def __init__(self):

def __str__(self):
return str(np.array(self).shape)
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
def append(self, element: Any, padding_value: Any = 0.0) -> None:
Adds an element to this list. Also lets you change the padding
Adds an element to this AgentBuffer. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.

self.padding_value = padding_value
def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
def set(self, data: List[Any]) -> None:
self += list(np.array(data, dtype=np.float32))
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
Sets the AgentBuffer to the provided list
:param data: The list to be set.
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = list(np.array(data, dtype=dtype))
self[:] = data
def get_batch(
self,

) -> np.ndarray:
) -> List[Any]:
from the list of np.array
from the AgentBuffer.
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If

)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
)
return [padding] * (training_length - leftover) + self[:]
return np.array(
self[len(self) - batch_size * training_length :],
dtype=np.float32,
)
return self[len(self) - batch_size * training_length :]
else:
# The sequences will have overlapping elements
if batch_size is None:

tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
return tmp_list
def reset_field(self) -> None:
"""

return len(next(iter(self.values())))
else:
return 0
@staticmethod
def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
"""
Converts a List of obs (each obs itself consisting of a List of np.ndarray) to
a List of np.ndarray, with each sensor's observations batched together.
"""
# Transpose and convert List of Lists
new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
return new_list
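A self-contained sketch of the transpose that the new obs_list_to_obs_batch static method performs: a list of per-step observation lists becomes a list of per-sensor batches (the shapes here are illustrative):

```python
import numpy as np
from typing import List

def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
    # zip(*obs_list) groups the i-th observation of every step together;
    # np.asanyarray stacks each group along a new leading batch dimension.
    return [np.asanyarray(list(sensor_obs)) for sensor_obs in zip(*obs_list)]

steps = [
    [np.zeros(3, dtype=np.float32), np.zeros((84, 84, 3), dtype=np.float32)],
    [np.ones(3, dtype=np.float32), np.ones((84, 84, 3), dtype=np.float32)],
]
batched = obs_list_to_obs_batch(steps)
print([b.shape for b in batched])  # [(2, 3), (2, 84, 84, 3)]
```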

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (28 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
visual_obs.append(visual_ob)
else:
visual_obs = []
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
next_obs = ModelUtils.list_to_tensor_list(next_obs)
vec_vis_obs = SplitObservations.from_observations(next_obs)
next_vec_obs = [
ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
]
next_vis_obs = [
ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
for _vis_ob in vec_vis_obs.visual_observations
]
vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
obs, memory, sequence_length=batch.num_experiences
next_vec_obs, next_vis_obs, next_memory, sequence_length=1
next_obs, next_memory, sequence_length=1
)
for name, estimate in value_estimates.items():

ml-agents/mlagents/trainers/policy/torch_policy.py (33 changes)


def _split_decision_step(
self, decision_requests: DecisionSteps
) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
obs = ModelUtils.list_to_tensor_list(decision_requests.obs)
mask = None
if not self.use_continuous_act:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

)
return vec_vis_obs, mask
return obs, mask
def update_normalization(self, vector_obs: np.ndarray) -> None:
def update_normalization(self, vector_obs: List[np.ndarray]) -> None:
vector_obs = [torch.as_tensor(vector_obs)]
all_obs = ModelUtils.list_to_tensor_list(vector_obs)
self.actor_critic.update_normalization(vector_obs)
self.actor_critic.update_normalization(all_obs)
vec_obs: List[torch.Tensor],
vis_obs: List[torch.Tensor],
obs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,

entropies, and output memories, all as Torch Tensors.
"""
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
)
dists, memories = self.actor_critic.get_dists(obs, masks, memories, seq_len)
vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(

def evaluate_actions(
self,
vec_obs: torch.Tensor,
vis_obs: torch.Tensor,
obs: List[torch.Tensor],
actions: torch.Tensor,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,

vec_obs, vis_obs, masks, memories, seq_len
obs, masks, memories, seq_len
)
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)

:param decision_requests: DecisionStep object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
vec_vis_obs, masks = self._split_decision_step(decision_requests)
vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
vis_obs = [
torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
]
obs, masks = self._split_decision_step(decision_requests)
vec_obs, vis_obs, masks=masks, memories=memories
obs, masks=masks, memories=memories
)
run_out["pre_action"] = ModelUtils.to_numpy(action)

ml-agents/mlagents/trainers/ppo/trainer.py (12 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (

agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
agent_buffer_trajectory["obs"]
)
self.policy.update_normalization(obs_to_normalize)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
# Normalize advantages
advantages = np.array(self.update_buffer["advantages"].get_batch())
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
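Because AgentBufferField.get_batch() now returns a plain Python list rather than an np.ndarray, the advantage normalization above converts explicitly and writes a list back; a small sketch with made-up values:

```python
import numpy as np

advantages = np.array([0.5, -1.0, 2.0, 0.25], dtype=np.float32)  # illustrative values
normalized = list((advantages - advantages.mean()) / (advantages.std() + 1e-10))
```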

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (16 changes)


)
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
else:
vis_obs = []
vec_obs,
vis_obs,
obs,
masks=act_masks,
actions=actions,
memories=memories,

ml-agents/mlagents/trainers/tests/test_buffer.py (29 changes)


b = AgentBuffer()
for step in range(9):
b["vector_observation"].append(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
]
np.array(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
],
dtype=np.float32,
)
[100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]
np.array(
[
100 * fake_agent_id + 10 * step + 4,
100 * fake_agent_id + 10 * step + 5,
],
dtype=np.float32,
)
)
return b

a = agent_1_buffer["vector_observation"].get_batch(
batch_size=2, training_length=1, sequential=True
)
assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))
assert len(a) == 2
assert_array(
np.array(a), np.array([[171, 172, 173], [181, 182, 183]], dtype=np.float32)
)
a = agent_2_buffer["vector_observation"].get_batch(
batch_size=2, training_length=3, sequential=True
)

[261, 262, 263],
[271, 272, 273],
[281, 282, 283],
]
],
dtype=np.float32,
),
)
a = agent_2_buffer["vector_observation"].get_batch(

ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changes)


def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
actions: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
:param vec_inputs: List of vector observation tensors.
:param vis_input: List of visual observation tensors.
:param net_inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.

if not q1_grad:
stack.enter_context(torch.no_grad())
q1_out, _ = self.q1_network(
vec_inputs,
vis_inputs,
net_inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

stack.enter_context(torch.no_grad())
q2_out, _ = self.q2_network(
vec_inputs,
vis_inputs,
net_inputs,
actions=actions,
memories=memories,
sequence_length=sequence_length,

for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["obs"])
)
next_obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(batch["next_obs"])
)
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)

torch.zeros_like(next_memories) if next_memories is not None else None
)
vis_obs: List[torch.Tensor] = []
next_vis_obs: List[torch.Tensor] = []
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
vis_obs.append(vis_ob)
next_vis_ob = ModelUtils.list_to_tensor(
batch["next_visual_obs%d" % idx]
)
next_vis_obs.append(next_vis_ob)
# Copy normalizers from policy
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body

self.policy.actor_critic.network_body
)
(sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
vec_obs,
vis_obs,
obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
obs, memories, sequence_length=self.policy.sequence_length
vec_obs,
vis_obs,
obs,
sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

vec_obs,
vis_obs,
obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,

# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,

vec_obs,
vis_obs,
obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)

with torch.no_grad():
target_values, _ = self.target_network(
next_vec_obs,
next_vis_obs,
next_obs,
memories=next_memories,
sequence_length=self.policy.sequence_length,
)
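The q1_grad/q2_grad flags above rely on contextlib.ExitStack to optionally disable gradients for one of the two Q networks; a minimal sketch of the same pattern with stand-in linear networks (not the real Q networks):

```python
import torch
from contextlib import ExitStack

q1_network = torch.nn.Linear(4, 1)  # stand-ins for the real Q1/Q2 networks
q2_network = torch.nn.Linear(4, 1)

def q_forward(net_inputs: torch.Tensor, q1_grad: bool = True, q2_grad: bool = True):
    with ExitStack() as stack:
        if not q1_grad:
            stack.enter_context(torch.no_grad())  # no graph is built for Q1
        q1_out = q1_network(net_inputs)
    with ExitStack() as stack:
        if not q2_grad:
            stack.enter_context(torch.no_grad())
        q2_out = q2_network(net_inputs)
    return q1_out, q2_out

q1p_out, q2p_out = q_forward(torch.rand(8, 4), q1_grad=False)
print(q1p_out.requires_grad, q2p_out.requires_grad)  # False True
```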

ml-agents/mlagents/trainers/sac/trainer.py (16 changes)


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.buffer import AgentBuffer
from mlagents import tf_utils
if tf_utils.is_available():

# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
agent_buffer_trajectory["obs"]
)
self.policy.update_normalization(obs_to_normalize)
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.interrupted:
vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:
agent_buffer_trajectory["next_vector_in"][
-1
] = vec_vis_obs.vector_observations
agent_buffer_trajectory["next_obs"][-1] = last_step.obs
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer

ml-agents/mlagents/trainers/demo_loader.py (6 changes)


from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
from mlagents.trainers.trajectory import SplitObservations
from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
split_obs = SplitObservations.from_observations(current_obs)
for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["obs"].append(current_obs)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:

ml-agents/mlagents/trainers/torch/utils.py (38 changes)


h_size: int,
vis_encode_type: EncoderType,
normalize: bool = False,
) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
) -> Tuple[nn.ModuleList, int]:
"""
Creates visual and vector encoders, along with their normalizers.
:param observation_shapes: List of Tuples that represent the action dimensions.

:param normalize: Normalize all vector inputs.
:return: Tuple of visual encoders and vector encoders each as a list.
"""
visual_encoders: List[nn.Module] = []
vector_encoders: List[nn.Module] = []
encoders: List[nn.Module] = []
vector_size = 0
visual_output_size = 0
total_encoded_size = 0
visual_encoders.append(
encoders.append(
visual_output_size += h_size
total_encoded_size += h_size
vector_size += dimension[0]
vector_size = dimension[0]
encoders.append(VectorInput(vector_size, normalize))
total_encoded_size += vector_size
if vector_size > 0:
vector_encoders.append(VectorInput(vector_size, normalize))
total_processed_size = vector_size + visual_output_size
return (
nn.ModuleList(visual_encoders),
nn.ModuleList(vector_encoders),
total_processed_size,
)
return (nn.ModuleList(encoders), total_encoded_size)
@staticmethod
def list_to_tensor(

calling as_tensor on the list directly.
"""
return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
@staticmethod
def list_to_tensor_list(
ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
) -> List[torch.Tensor]:
"""
Converts a list of numpy arrays into a list of tensors. MUCH faster than
calling as_tensor on the list directly.
"""
return [
torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list
]
@staticmethod
def to_numpy(tensor: torch.Tensor) -> np.ndarray:
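A standalone sketch of list_to_tensor_list as added above: each already-batched observation array becomes its own tensor, going through np.asanyarray to avoid torch's slow conversion of nested Python lists (shapes illustrative):

```python
import numpy as np
import torch
from typing import List, Optional

def list_to_tensor_list(
    ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
) -> List[torch.Tensor]:
    return [torch.as_tensor(np.asanyarray(arr), dtype=dtype) for arr in ndarray_list]

obs_batch = [np.zeros((32, 3), dtype=np.float32), np.zeros((32, 84, 84, 3), dtype=np.float32)]
tensors = list_to_tensor_list(obs_batch)
print([tuple(t.shape) for t in tensors])  # [(32, 3), (32, 84, 84, 3)]
```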

ml-agents/mlagents/trainers/torch/networks.py (93 changes)


else 0
)
self.visual_processors, self.vector_processors, encoder_input_size = ModelUtils.create_input_processors(
self.processors, encoder_input_size = ModelUtils.create_input_processors(
observation_shapes,
self.h_size,
network_settings.vis_encode_type,

else:
self.lstm = None # type: ignore
def update_normalization(self, vec_inputs: List[torch.Tensor]) -> None:
for vec_input, vec_enc in zip(vec_inputs, self.vector_processors):
vec_enc.update_normalization(vec_input)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
for _in, enc in zip(net_inputs, self.processors):
enc.update_normalization(_in)
for n1, n2 in zip(self.vector_processors, other_network.vector_processors):
for n1, n2 in zip(self.processors, other_network.processors):
n1.copy_normalization(n2)
@property

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
for idx, processor in enumerate(self.vector_processors):
vec_input = vec_inputs[idx]
processed_vec = processor(vec_input)
for idx, processor in enumerate(self.processors):
net_input = net_inputs[idx]
if not exporting_to_onnx.is_exporting() and len(net_input.shape) > 3:
net_input = net_input.permute([0, 3, 1, 2])
processed_vec = processor(net_input)
for idx, processor in enumerate(self.visual_processors):
vis_input = vis_inputs[idx]
if not exporting_to_onnx.is_exporting():
vis_input = vis_input.permute([0, 3, 1, 2])
processed_vis = processor(vis_input)
encodes.append(processed_vis)
if len(encodes) == 0:
raise Exception("No valid inputs to network.")

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, actions, memories, sequence_length
net_inputs, actions, memories, sequence_length
)
output = self.value_heads(encoding)
return output, memories

@abc.abstractmethod
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
"""
Updates normalization of Actor based on the provided List of vector obs.
:param vector_obs: A List of vector obs as tensors.

@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:

@abc.abstractmethod
def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

@abc.abstractmethod
def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

def memory_size(self) -> int:
return self.network_body.memory_size
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(vector_obs)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
self.network_body.update_normalization(net_inputs)
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
actions = []

def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:

dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
dists, _ = self.get_dists(net_inputs, masks, memories, 1)
if self.action_spec.is_continuous():
action_list = self.sample_action(dists)
action_out = torch.stack(action_list, dim=-1)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
net_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)

def critic_pass(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
value_outputs, critic_mem_out = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
net_inputs, memories=critic_mem, sequence_length=sequence_length
)
if actor_mem is not None:
# Make memories with the actor mem unchanged

def get_dist_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
net_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,

critic_mem = None
actor_mem = None
dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
memories=actor_mem,
sequence_length=sequence_length,
masks=masks,
net_inputs, memories=actor_mem, sequence_length=sequence_length, masks=masks
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
net_inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)

def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
super().update_normalization(vector_obs)
self.critic.network_body.update_normalization(vector_obs)
def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
super().update_normalization(net_inputs)
self.critic.network_body.update_normalization(net_inputs)
class GlobalSteps(nn.Module):
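The unified forward pass above iterates a single processor list over a single observation list, permuting 4-D (NHWC) visual inputs to NCHW before encoding; a toy sketch of that loop with stand-in encoders (ToyEncoder and the shapes are assumptions, not ML-Agents classes):

```python
import torch
from torch import nn

class ToyEncoder(nn.Module):
    # Stand-in for VectorInput / SimpleVisualEncoder: flattens and projects.
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.dense = nn.Linear(in_features, out_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dense(x.flatten(start_dim=1))

processors = nn.ModuleList([ToyEncoder(6, 8), ToyEncoder(3 * 16 * 16, 8)])
net_inputs = [torch.zeros(2, 6), torch.zeros(2, 16, 16, 3)]  # vector + NHWC visual

encodes = []
for processor, net_input in zip(processors, net_inputs):
    if len(net_input.shape) > 3:
        net_input = net_input.permute([0, 3, 1, 2])  # NHWC -> NCHW
    encodes.append(processor(net_input))
encoding = torch.cat(encodes, dim=-1)  # shape (2, 16)
```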

ml-agents/mlagents/trainers/torch/encoders.py (25 changes)


return height, width
class VectorInput(nn.Module):
class InputProcessor:
def copy_normalization(self, other_input: "InputProcessor") -> None:
pass
def update_normalization(self, inputs: torch.Tensor) -> None:
pass
class VectorInput(nn.Module, InputProcessor):
def __init__(self, input_size: int, normalize: bool = False):
super().__init__()
self.normalizer: Optional[Normalizer] = None

inputs = self.normalizer(inputs)
return inputs
def copy_normalization(self, other_input: "VectorInput") -> None:
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def copy_normalization(self, other_input: "InputProcessor") -> None:
if isinstance(other_input, VectorInput):
if self.normalizer is not None and other_input.normalizer is not None:
self.normalizer.copy_from(other_input.normalizer)
def update_normalization(self, inputs: torch.Tensor) -> None:
if self.normalizer is not None:

class SmallVisualEncoder(nn.Module):
class SmallVisualEncoder(nn.Module, InputProcessor):
"""
CNN architecture used by King in their Candy Crush predictor
https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning

return self.dense(hidden)
class SimpleVisualEncoder(nn.Module):
class SimpleVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return self.dense(hidden)
class NatureVisualEncoder(nn.Module):
class NatureVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):

return input_tensor + self.layers(input_tensor)
class ResNetVisualEncoder(nn.Module):
class ResNetVisualEncoder(nn.Module, InputProcessor):
def __init__(
self, height: int, width: int, initial_channels: int, output_size: int
):
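A toy illustration of the InputProcessor mixin introduced above: encoders without normalization state inherit the no-op hooks, so callers can invoke update_normalization/copy_normalization uniformly across mixed processor lists. The Toy* classes below are assumptions, not the real encoders:

```python
import torch
from torch import nn

class InputProcessor:
    # Default no-op normalization interface shared by all observation encoders.
    def update_normalization(self, inputs: torch.Tensor) -> None:
        pass

    def copy_normalization(self, other_input: "InputProcessor") -> None:
        pass

class ToyVectorInput(nn.Module, InputProcessor):
    def __init__(self, size: int):
        super().__init__()
        self.register_buffer("running_mean", torch.zeros(size))
        self.count = 0

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x - self.running_mean

    def update_normalization(self, inputs: torch.Tensor) -> None:
        # Incremental mean over all observations seen so far.
        self.count += inputs.shape[0]
        self.running_mean += (inputs.mean(dim=0) - self.running_mean) * inputs.shape[0] / self.count

class ToyVisualEncoder(nn.Module, InputProcessor):
    # No normalization state: inherits the no-op hooks unchanged.
    def __init__(self, channels: int, out_size: int):
        super().__init__()
        self.conv = nn.Conv2d(channels, out_size, kernel_size=3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.conv(x).mean(dim=[2, 3])

processors = nn.ModuleList([ToyVectorInput(4), ToyVisualEncoder(3, 8)])
obs = [torch.rand(2, 4), torch.rand(2, 3, 36, 36)]
for proc, ob in zip(processors, obs):
    proc.update_normalization(ob)  # uniform call; the visual encoder does nothing
```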

ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils

"""
Helper function for update_batch.
"""
vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch_demo["obs"]), dtype=torch.float
)
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])

if self.policy.use_recurrent:
memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
if self.policy.use_vis_obs:
vis_obs = []
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_processors
):
vis_ob = ModelUtils.list_to_tensor(
mini_batch_demo["visual_obs%d" % idx]
)
vis_obs.append(vis_ob)
else:
vis_obs = []
(
selected_actions,
clipped_actions,

) = self.policy.sample_actions(
vec_obs,
vis_obs,
obs,
masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (29 changes)


"""
Extracts the current state embedding from a mini_batch.
"""
n_vis = len(self._state_encoder.visual_processors)
vec_inputs=[
ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)
],
vis_inputs=[
ModelUtils.list_to_tensor(
mini_batch["visual_obs%d" % i], dtype=torch.float
)
for i in range(n_vis)
],
net_inputs=ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["obs"]), dtype=torch.float
)
)
return hidden

"""
n_vis = len(self._state_encoder.visual_processors)
vec_inputs=[
ModelUtils.list_to_tensor(
mini_batch["next_vector_in"], dtype=torch.float
)
],
vis_inputs=[
ModelUtils.list_to_tensor(
mini_batch["next_visual_obs%d" % i], dtype=torch.float
)
for i in range(n_vis)
],
net_inputs=ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["next_obs"]),
dtype=torch.float,
)
)
return hidden

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changes)


"""
Creates the observation input.
"""
n_vis = len(self.encoder.visual_processors)
n_vec = len(self.encoder.vector_processors)
vec_inputs = (
[ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)]
if n_vec > 0
else []
net_inputs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["obs"]), dtype=torch.float
vis_inputs = [
ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
for i in range(n_vis)
]
return vec_inputs, vis_inputs
return net_inputs
def compute_estimate(
self, mini_batch: AgentBuffer, use_vail_noise: bool = False

:param use_vail_noise: Only when using VAIL : If true, will sample the code, if
false, will return the mean of the code.
"""
vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
net_inputs = self.get_state_inputs(mini_batch)
hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
hidden, _ = self.encoder(net_inputs, action_inputs)
hidden, _ = self.encoder(vec_inputs, vis_inputs)
hidden, _ = self.encoder(net_inputs)
z_mu: Optional[torch.Tensor] = None
if self._settings.use_vail:
z_mu = self._z_mu_layer(hidden)

Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp.
for off-policy. Compute gradients w.r.t randomly interpolated input.
"""
policy_vec_inputs, policy_vis_inputs = self.get_state_inputs(policy_batch)
expert_vec_inputs, expert_vis_inputs = self.get_state_inputs(expert_batch)
interp_vec_inputs = []
for policy_vec_input, expert_vec_input in zip(
policy_vec_inputs, expert_vec_inputs
):
obs_epsilon = torch.rand(policy_vec_input.shape)
interp_vec_input = (
obs_epsilon * policy_vec_input + (1 - obs_epsilon) * expert_vec_input
)
interp_vec_input.requires_grad = True # For gradient calculation
interp_vec_inputs.append(interp_vec_input)
interp_vis_inputs = []
for policy_vis_input, expert_vis_input in zip(
policy_vis_inputs, expert_vis_inputs
):
obs_epsilon = torch.rand(policy_vis_input.shape)
interp_vis_input = (
obs_epsilon * policy_vis_input + (1 - obs_epsilon) * expert_vis_input
)
interp_vis_input.requires_grad = True # For gradient calculation
interp_vis_inputs.append(interp_vis_input)
policy_inputs = self.get_state_inputs(policy_batch)
expert_inputs = self.get_state_inputs(expert_batch)
interp_inputs = []
for policy_input, expert_input in zip(policy_inputs, expert_inputs):
obs_epsilon = torch.rand(policy_input.shape)
interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
interp_input.requires_grad = True # For gradient calculation
interp_inputs.append(interp_input)
if self._settings.use_actions:
policy_action = self.get_action_input(policy_batch)
expert_action = self.get_action_input(expert_batch)

dim=1,
)
action_inputs.requires_grad = True
hidden, _ = self.encoder(
interp_vec_inputs, interp_vis_inputs, action_inputs
)
encoder_input = tuple(
interp_vec_inputs + interp_vis_inputs + [action_inputs]
)
hidden, _ = self.encoder(interp_inputs, action_inputs)
encoder_input = tuple(interp_inputs + [action_inputs])
hidden, _ = self.encoder(interp_vec_inputs, interp_vis_inputs)
encoder_input = tuple(interp_vec_inputs + interp_vis_inputs)
hidden, _ = self.encoder(interp_inputs)
encoder_input = tuple(interp_inputs)
if self._settings.use_vail:
use_vail_noise = True
z_mu = self._z_mu_layer(hidden)
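With a single observation list per batch, the interpolation for the gradient penalty collapses into one loop; a sketch that carries it through to the penalty term from the WGAN-GP paper cited above, using random stand-in inputs and a toy discriminator output in place of self.encoder:

```python
import torch

policy_inputs = [torch.rand(4, 6), torch.rand(4, 16, 16, 3)]  # illustrative shapes
expert_inputs = [torch.rand(4, 6), torch.rand(4, 16, 16, 3)]

interp_inputs = []
for policy_input, expert_input in zip(policy_inputs, expert_inputs):
    obs_epsilon = torch.rand(policy_input.shape)
    interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
    interp_input.requires_grad = True  # needed to differentiate w.r.t. the inputs
    interp_inputs.append(interp_input)

# Toy discriminator output standing in for the real encoder's hidden state.
hidden = torch.cat([i.flatten(start_dim=1) for i in interp_inputs], dim=1).sum(dim=1)
gradients = torch.autograd.grad(hidden.sum(), tuple(interp_inputs), create_graph=True)
grad_norm = torch.cat([g.flatten(start_dim=1) for g in gradients], dim=1).norm(dim=1)
gradient_penalty = ((grad_norm - 1) ** 2).mean()
```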
