
Buffer key enums (#4907)

/bullet-hell-barracuda-test-1.3.1
GitHub · 4 years ago
Current commit 64fc7f43
25 files changed, with 570 insertions and 285 deletions
1. .github/workflows/pytest.yml (2 lines changed)
2. ml-agents/mlagents/trainers/buffer.py (392 lines changed)
3. ml-agents/mlagents/trainers/demo_loader.py (16 lines changed)
4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (22 lines changed)
5. ml-agents/mlagents/trainers/ppo/trainer.py (33 lines changed)
6. ml-agents/mlagents/trainers/sac/optimizer_torch.py (24 lines changed)
7. ml-agents/mlagents/trainers/sac/trainer.py (7 lines changed)
8. ml-agents/mlagents/trainers/tests/__init__.py (6 lines changed)
9. ml-agents/mlagents/trainers/tests/mock_brain.py (9 lines changed)
10. ml-agents/mlagents/trainers/tests/test_buffer.py (60 lines changed)
11. ml-agents/mlagents/trainers/tests/test_demo_loader.py (9 lines changed)
12. ml-agents/mlagents/trainers/tests/test_trajectory.py (29 lines changed)
13. ml-agents/mlagents/trainers/tests/torch/test_ghost.py (23 lines changed)
14. ml-agents/mlagents/trainers/tests/torch/test_policy.py (15 lines changed)
15. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (68 lines changed)
16. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (3 lines changed)
17. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (18 lines changed)
18. ml-agents/mlagents/trainers/tests/torch/test_sac.py (13 lines changed)
19. ml-agents/mlagents/trainers/torch/action_log_probs.py (15 lines changed)
20. ml-agents/mlagents/trainers/torch/agent_action.py (14 lines changed)
21. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 lines changed)
22. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (16 lines changed)
23. ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (4 lines changed)
24. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (12 lines changed)
25. ml-agents/mlagents/trainers/trajectory.py (43 lines changed)

.github/workflows/pytest.yml (2 lines changed)


jobs:
pytest:
runs-on: ubuntu-latest
env:
TEST_ENFORCE_BUFFER_KEY_TYPES: 1
strategy:
matrix:
python-version: [3.6.x, 3.7.x, 3.8.x]

ml-agents/mlagents/trainers/buffer.py (392 lines changed)


from collections import defaultdict
from collections.abc import MutableMapping
import enum
import itertools
from typing import BinaryIO, DefaultDict, List, Tuple, Union, Optional
from typing import List, BinaryIO
import itertools
from mlagents_envs.exception import UnityException

pass
class AgentBuffer(dict):
class BufferKey(enum.Enum):
ACTION_MASK = "action_mask"
CONTINUOUS_ACTION = "continuous_action"
CONTINUOUS_LOG_PROBS = "continuous_log_probs"
DISCRETE_ACTION = "discrete_action"
DISCRETE_LOG_PROBS = "discrete_log_probs"
DONE = "done"
ENVIRONMENT_REWARDS = "environment_rewards"
MASKS = "masks"
MEMORY = "memory"
PREV_ACTION = "prev_action"
ADVANTAGES = "advantages"
DISCOUNTED_RETURNS = "discounted_returns"
class ObservationKeyPrefix(enum.Enum):
OBSERVATION = "obs"
NEXT_OBSERVATION = "next_obs"
class RewardSignalKeyPrefix(enum.Enum):
# Reward signals
REWARDS = "rewards"
VALUE_ESTIMATES = "value_estimates"
RETURNS = "returns"
ADVANTAGE = "advantage"
AgentBufferKey = Union[
BufferKey, Tuple[ObservationKeyPrefix, int], Tuple[RewardSignalKeyPrefix, str]
]
class RewardSignalUtil:
@staticmethod
def rewards_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.REWARDS, name
@staticmethod
def value_estimates_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.VALUE_ESTIMATES, name
@staticmethod
def returns_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.RETURNS, name
@staticmethod
def advantage_key(name: str) -> AgentBufferKey:
return RewardSignalKeyPrefix.ADVANTAGE, name
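For orientation, a brief usage sketch of the key types defined above (illustrative values; AgentBuffer is the rewritten class shown later in this file):
from mlagents.trainers.buffer import (
    AgentBuffer,
    BufferKey,
    ObservationKeyPrefix,
    RewardSignalKeyPrefix,
    RewardSignalUtil,
)

buffer = AgentBuffer()
# Flat fields are addressed with BufferKey members instead of raw strings.
buffer[BufferKey.DONE].append(False)
buffer[BufferKey.ENVIRONMENT_REWARDS].append(1.0)
# Observation fields are (prefix, index) tuples.
buffer[(ObservationKeyPrefix.OBSERVATION, 0)].append([0.1, 0.2])
# Per-reward-signal fields are (prefix, name) tuples built by RewardSignalUtil.
gail_rewards_key = RewardSignalUtil.rewards_key("gail")
assert gail_rewards_key == (RewardSignalKeyPrefix.REWARDS, "gail")
buffer[gail_rewards_key].append(0.5)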
class AgentBufferField(list):
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has its own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
class AgentBufferField(list):
def __init__(self):
self.padding_value = 0
super().__init__()
def __str__(self):
return str(np.array(self).shape)
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
super().append(element)
self.padding_value = padding_value
def __init__(self):
self.padding_value = 0
super().__init__()
def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data, dtype=np.float32))
def __str__(self):
return str(np.array(self).shape)
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
def append(self, element: np.ndarray, padding_value: float = 0.0) -> None:
"""
Adds an element to this list. Also lets you change the padding
type, so that it can be set on append (e.g. action_masks should
be padded with 1.)
:param element: The element to append to the list.
:param padding_value: The value used to pad when get_batch is called.
"""
super().append(element)
self.padding_value = padding_value
def get_batch(
self,
batch_size: int = None,
training_length: Optional[int] = 1,
sequential: bool = True,
) -> np.ndarray:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if training_length is None:
training_length = 1
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:], dtype=np.float32
)
else:
return np.array(
self[len(self) - batch_size * training_length :], dtype=np.float32
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
def extend(self, data: np.ndarray) -> None:
"""
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
self += list(np.array(data, dtype=np.float32))
def reset_field(self) -> None:
"""
Resets the AgentBufferField
"""
self[:] = []
def set(self, data):
"""
Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
# Make sure we convert incoming data to float32 if it's a float
dtype = None
if data is not None and len(data) and isinstance(data[0], float):
dtype = np.float32
self[:] = []
self[:] = list(np.array(data, dtype=dtype))
def get_batch(
self,
batch_size: int = None,
training_length: int = 1,
sequential: bool = True,
) -> np.ndarray:
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:
All elements will be retrieved.
:param training_length: The length of the sequence to be retrieved. If
None: only takes one element.
:param sequential: If true and training_length is not None: the elements
will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and
sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1], dtype=np.float32) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
)
else:
return np.array(
self[len(self) - batch_size * training_length :],
dtype=np.float32,
)
else:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
tmp_list: List[np.ndarray] = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list, dtype=np.float32)
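A small runnable sketch of the two batching modes described in the docstring above (values are arbitrary; note that get_batch returns the selected elements flattened, with the first sequence front-padded in the sequential case):
from mlagents.trainers.buffer import AgentBufferField

field = AgentBufferField()
field.set([1.0, 2.0, 3.0, 4.0, 5.0])  # [a, b, c, d, e]

# Sequential: 3 non-overlapping length-2 sequences, the first padded with padding_value (0 here).
print(field.get_batch(batch_size=3, training_length=2, sequential=True))
# [0. 1. 2. 3. 4. 5.]  -> sequences [0, a], [b, c], [d, e]

# Non-sequential: overlapping windows ending at each of the last 4 positions.
print(field.get_batch(batch_size=4, training_length=2, sequential=False))
# [1. 2. 2. 3. 3. 4. 4. 5.]  -> windows [a, b], [b, c], [c, d], [d, e]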
class AgentBuffer(MutableMapping):
"""
AgentBuffer contains a dictionary of AgentBufferFields. Each agent has its own AgentBuffer.
The keys correspond to the name of the field. Example: state, action
"""
def reset_field(self) -> None:
"""
Resets the AgentBufferField
"""
self[:] = []
# Whether or not to validate the types of keys at runtime
# This should be off for training, but enabled for testing
CHECK_KEY_TYPES_AT_RUNTIME = False
super().__init__()
self._fields: DefaultDict[AgentBufferKey, AgentBufferField] = defaultdict(
AgentBufferField
)
return ", ".join(["'{}' : {}".format(k, str(self[k])) for k in self.keys()])
return ", ".join(
["'{}' : {}".format(k, str(self[k])) for k in self._fields.keys()]
)
for k in self.keys():
self[k].reset_field()
for f in self._fields.values():
f.reset_field()
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBufferField()
return super().__getitem__(key)
@staticmethod
def _check_key(key):
if isinstance(key, BufferKey):
return
if isinstance(key, tuple):
key0, key1 = key
if isinstance(key0, ObservationKeyPrefix):
if isinstance(key1, int):
return
raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})")
if isinstance(key0, RewardSignalKeyPrefix):
if isinstance(key1, str):
return
raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})")
raise KeyError(f"{key} is a {type(key)}")
def check_length(self, key_list: List[str]) -> bool:
@staticmethod
def _encode_key(key: AgentBufferKey) -> str:
"""
Convert the key to a string representation so that it can be used for serialization.
"""
if isinstance(key, BufferKey):
return key.value
prefix, suffix = key
return f"{prefix.value}:{suffix}"
@staticmethod
def _decode_key(encoded_key: str) -> AgentBufferKey:
"""
Convert the string representation back to a key after serialization.
"""
# Simple case: convert the string directly to a BufferKey
try:
return BufferKey(encoded_key)
except ValueError:
pass
# Not a simple key, so split into two parts
prefix_str, _, suffix_str = encoded_key.partition(":")
# See if it's an ObservationKeyPrefix first
try:
return ObservationKeyPrefix(prefix_str), int(suffix_str)
except ValueError:
pass
# If not, it had better be a RewardSignalKeyPrefix
try:
return RewardSignalKeyPrefix(prefix_str), suffix_str
except ValueError:
raise ValueError(f"Unable to convert {encoded_key} to an AgentBufferKey")
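A short round-trip sketch of the serialization helpers above; the string forms are exactly what _encode_key produces for HDF5 dataset names:
from mlagents.trainers.buffer import AgentBuffer, BufferKey, ObservationKeyPrefix

assert AgentBuffer._encode_key(BufferKey.DONE) == "done"
assert AgentBuffer._encode_key((ObservationKeyPrefix.NEXT_OBSERVATION, 2)) == "next_obs:2"
# _decode_key inverts the mapping, so datasets written by save_to_file map back to typed keys.
assert AgentBuffer._decode_key("next_obs:2") == (ObservationKeyPrefix.NEXT_OBSERVATION, 2)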
def __getitem__(self, key: AgentBufferKey) -> AgentBufferField:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
return self._fields[key]
def __setitem__(self, key: AgentBufferKey, value: AgentBufferField) -> None:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
self._fields[key] = value
def __delitem__(self, key: AgentBufferKey) -> None:
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
self._fields.__delitem__(key)
def __iter__(self):
return self._fields.__iter__()
def __len__(self) -> int:
return self._fields.__len__()
def __contains__(self, key):
if self.CHECK_KEY_TYPES_AT_RUNTIME:
self._check_key(key)
return self._fields.__contains__(key)
def check_length(self, key_list: List[AgentBufferKey]) -> bool:
"""
Some methods will require that some fields have the same length.
check_length will return true if the fields in key_list

if self.CHECK_KEY_TYPES_AT_RUNTIME:
for k in key_list:
self._check_key(k)
if key not in self.keys():
if key not in self._fields:
return False
if (length is not None) and (length != len(self[key])):
return False

def shuffle(self, sequence_length: int, key_list: List[str] = None) -> None:
def shuffle(
self, sequence_length: int, key_list: List[AgentBufferKey] = None
) -> None:
"""
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.

key_list = list(self.keys())
key_list = list(self._fields.keys())
if not self.check_length(key_list):
raise BufferException(
"Unable to shuffle if the fields are not of same length"

:return: Dict of mini batch.
"""
mini_batch = AgentBuffer()
for key in self:
mini_batch[key] = self[key][start:end]
for key, field in self._fields.items():
# slicing AgentBufferField returns a List[Any]
mini_batch[key] = field[start:end] # type: ignore
return mini_batch
def sample_mini_batch(

"""
with h5py.File(file_object, "w") as write_file:
for key, data in self.items():
write_file.create_dataset(key, data=data, dtype="f", compression="gzip")
write_file.create_dataset(
self._encode_key(key), data=data, dtype="f", compression="gzip"
)
def load_from_file(self, file_object: BinaryIO) -> None:
"""

for key in list(read_file.keys()):
self[key] = AgentBuffer.AgentBufferField()
decoded_key = self._decode_key(key)
self[decoded_key] = AgentBufferField()
self[key].extend(read_file[key][()])
self[decoded_key].extend(read_file[key][()])
def truncate(self, max_length: int, sequence_length: int = 1) -> None:
"""

def resequence_and_append(
self,
target_buffer: "AgentBuffer",
key_list: List[str] = None,
key_list: List[AgentBufferKey] = None,
batch_size: int = None,
training_length: int = None,
) -> None:

ml-agents/mlagents/trainers/demo_loader.py (16 lines changed)


import os
from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)

else:
current_obs = list(current_decision_step.values())[0].obs
demo_raw_buffer["done"].append(next_done)
demo_raw_buffer["rewards"].append(next_reward)
demo_raw_buffer[BufferKey.DONE].append(next_done)
demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward)
for i, obs in enumerate(current_obs):
demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
if (

if behavior_spec.action_spec.continuous_size > 0:
demo_raw_buffer["continuous_action"].append(
demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
demo_raw_buffer["discrete_action"].append(
demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
demo_raw_buffer["continuous_action"].append(
demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(
demo_raw_buffer["discrete_action"].append(
demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(
demo_raw_buffer["prev_action"].append(previous_action)
demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (22 lines changed)


from typing import Dict, cast
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy

old_values = {}
for name in self.reward_signals:
old_values[name] = ModelUtils.list_to_tensor(
batch[f"{name}_value_estimates"]
batch[RewardSignalUtil.value_estimates_key(name)]
returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
returns[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.returns_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
actions = AgentAction.from_buffer(batch)
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

memories=memories,
seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
ModelUtils.list_to_tensor(batch["advantages"]),
ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
log_probs,
old_log_probs,
loss_masks,

ml-agents/mlagents/trainers/ppo/trainer.py (33 lines changed)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.torch_policy import TorchPolicy

)
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(
v
)
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
np.mean(v),

self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
evaluate_result
)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
local_rewards = agent_buffer_trajectory[
RewardSignalUtil.rewards_key(name)
].get_batch()
f"{name}_value_estimates"
RewardSignalUtil.value_estimates_key(name)
].get_batch()
local_advantage = get_gae(

)
local_return = local_advantage + local_value_estimates
# This is later use as target for the different value estimates
agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(
local_return
)
agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
local_advantage
)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)

)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length

int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
advantages = self.update_buffer[BufferKey.ADVANTAGES].get_batch()
self.update_buffer[BufferKey.ADVANTAGES].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
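Isolated as a small sketch (illustrative numbers), the advantage-normalization step above now reads and writes the buffer through BufferKey.ADVANTAGES rather than the "advantages" string:
from mlagents.trainers.buffer import AgentBuffer, BufferKey

update_buffer = AgentBuffer()
update_buffer[BufferKey.ADVANTAGES].set([1.0, 2.0, 3.0, 4.0])

advantages = update_buffer[BufferKey.ADVANTAGES].get_batch()
# Same normalization the trainer applies before running the PPO update epochs.
update_buffer[BufferKey.ADVANTAGES].set(
    (advantages - advantages.mean()) / (advantages.std() + 1e-10)
)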

ml-agents/mlagents/trainers/sac/optimizer_torch.py (24 lines changed)


from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents_envs.timers import timed
from mlagents_envs.base_env import ActionSpec, ObservationSpec
from mlagents.trainers.exception import UnityTrainerException

"""
rewards = {}
for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
rewards[name] = ModelUtils.list_to_tensor(
batch[RewardSignalUtil.rewards_key(name)]
)
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)

# Convert to tensors
next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.from_dict(batch)
act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
actions = AgentAction.from_buffer(batch)
ModelUtils.list_to_tensor(batch["memory"][i])
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
batch["memory"][i][self.policy.m_size // 2 :]
batch[BufferKey.MEMORY][i][self.policy.m_size // 2 :]
for i in range(offset, len(batch["memory"]), self.policy.sequence_length)
for i in range(
offset, len(batch[BufferKey.MEMORY]), self.policy.sequence_length
)
]
if len(memories_list) > 0:

memories=next_memories,
sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
dones = ModelUtils.list_to_tensor(batch["done"])
masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE])
q1_loss, q2_loss = self.sac_q_loss(
q1_stream, q2_stream, target_values, dones, rewards, masks

ml-agents/mlagents/trainers/sac/trainer.py (7 lines changed)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.policy import Policy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.policy.torch_policy import TorchPolicy

# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = (

last_step_obs = last_step.obs
for i, obs in enumerate(last_step_obs):
agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
agent_buffer_trajectory["done"][-1] = False
agent_buffer_trajectory[BufferKey.DONE][-1] = False
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(

)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
sampled_minibatch[f"{name}_rewards"] = (
sampled_minibatch[RewardSignalUtil.rewards_key(name)] = (
signal.evaluate(sampled_minibatch) * signal.strength
)

ml-agents/mlagents/trainers/tests/__init__.py (6 lines changed)


np.array = np_array_no_float64
np.zeros = np_zeros_no_float64
np.ones = np_ones_no_float64
if os.getenv("TEST_ENFORCE_BUFFER_KEY_TYPES"):
from mlagents.trainers.buffer import AgentBuffer
AgentBuffer.CHECK_KEY_TYPES_AT_RUNTIME = True
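A minimal sketch of what this flag buys in the test suite: with CHECK_KEY_TYPES_AT_RUNTIME enabled (the env var above flips it for CI), legacy string keys are rejected by AgentBuffer._check_key:
from mlagents.trainers.buffer import AgentBuffer, BufferKey

AgentBuffer.CHECK_KEY_TYPES_AT_RUNTIME = True  # what TEST_ENFORCE_BUFFER_KEY_TYPES enables
buffer = AgentBuffer()
buffer[BufferKey.DONE].append(True)   # typed key: accepted
try:
    buffer["done"].append(True)       # legacy string key: rejected
except KeyError as err:
    print("rejected:", err)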

ml-agents/mlagents/trainers/tests/mock_brain.py (9 lines changed)


from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (

return Trajectory(
steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
)
def copy_buffer_fields(
buffer: AgentBuffer, src_key: AgentBufferKey, dst_keys: List[AgentBufferKey]
) -> None:
for dst_key in dst_keys:
buffer[dst_key] = buffer[src_key]
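A small usage sketch of the copy_buffer_fields helper added here; the destination fields alias the source field, which is exactly how the updated tests mock out reward-signal outputs:
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil
from mlagents.trainers.tests.mock_brain import copy_buffer_fields

buffer = AgentBuffer()
buffer[BufferKey.ENVIRONMENT_REWARDS].extend([1.0, 0.0, 1.0])
copy_buffer_fields(
    buffer,
    src_key=BufferKey.ENVIRONMENT_REWARDS,
    dst_keys=[BufferKey.ADVANTAGES, RewardSignalUtil.rewards_key("extrinsic")],
)
assert buffer[BufferKey.ADVANTAGES] is buffer[BufferKey.ENVIRONMENT_REWARDS]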
def simulate_rollout(

ml-agents/mlagents/trainers/tests/test_buffer.py (60 lines changed)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import (
AgentBuffer,
AgentBufferField,
BufferKey,
ObservationKeyPrefix,
RewardSignalKeyPrefix,
)
from mlagents.trainers.trajectory import ObsUtil
def assert_array(a, b):

def construct_fake_buffer(fake_agent_id):
b = AgentBuffer()
for step in range(9):
b["vector_observation"].append(
b[ObsUtil.get_name_at(0)].append(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,

b["action"].append(
b[BufferKey.CONTINUOUS_ACTION].append(
[100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]
)
return b

agent_1_buffer = construct_fake_buffer(1)
agent_2_buffer = construct_fake_buffer(2)
agent_3_buffer = construct_fake_buffer(3)
a = agent_1_buffer["vector_observation"].get_batch(
a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(
a = agent_2_buffer["vector_observation"].get_batch(
a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
batch_size=2, training_length=3, sequential=True
)
assert_array(

]
),
)
a = agent_2_buffer["vector_observation"].get_batch(
a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
batch_size=2, training_length=3, sequential=False
)
assert_array(

agent_3_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
assert len(update_buffer["action"]) == 20
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
assert np.array(update_buffer["action"]).shape == (20, 2)
assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)
assert np.array(c["action"]).shape == (1, 2)
assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
def fakerandint(values):

# Test non-LSTM
mb = update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
assert mb.keys() == update_buffer.keys()
assert np.array(mb["action"]).shape == (4, 2)
assert np.array(mb[BufferKey.CONTINUOUS_ACTION]).shape == (4, 2)
# Test LSTM
# We need to check if we ever get a breaking start - this will maximize the probability

assert np.array(mb["action"]).shape == (19, 2)
assert np.array(mb[BufferKey.CONTINUOUS_ACTION]).shape == (19, 2)
def test_num_experiences():

assert len(update_buffer["action"]) == 0
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 0
assert update_buffer.num_experiences == 0
agent_1_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2

)
assert len(update_buffer["action"]) == 20
assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
assert update_buffer.num_experiences == 20

update_buffer.truncate(4, sequence_length=3)
assert update_buffer.num_experiences == 3
for buffer_field in update_buffer.values():
assert isinstance(buffer_field, AgentBuffer.AgentBufferField)
assert isinstance(buffer_field, AgentBufferField)
def test_key_encode_decode():
keys = (
list(BufferKey)
+ [(k, 42) for k in ObservationKeyPrefix]
+ [(k, "gail") for k in RewardSignalKeyPrefix]
)
for k in keys:
assert k == AgentBuffer._decode_key(AgentBuffer._encode_key(k))
def test_buffer_save_load():
original = construct_fake_buffer(3)
import io
write_buffer = io.BytesIO()
original.save_to_file(write_buffer)
loaded = AgentBuffer()
loaded.load_from_file(write_buffer)
assert len(original) == len(loaded)
for k in original.keys():
assert np.allclose(original[k], loaded[k])

ml-agents/mlagents/trainers/tests/test_demo_loader.py (9 lines changed)


get_demo_files,
write_delimited,
)
from mlagents.trainers.buffer import BufferKey
BEHAVIOR_SPEC = create_mock_3dball_behavior_specs()

_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
or len(demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1
)

_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
or len(demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1
)

ml-agents/mlagents/trainers/tests/test_trajectory.py (29 lines changed)


from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, ObservationKeyPrefix
VEC_OBS_SIZE = 6
ACTION_SIZE = 4

length = 15
wanted_keys = [
"next_obs_0",
"next_obs_1",
"obs_0",
"obs_1",
"memory",
"masks",
"done",
"continuous_action",
"discrete_action",
"continuous_log_probs",
"discrete_log_probs",
"action_mask",
"prev_action",
"environment_rewards",
(ObservationKeyPrefix.OBSERVATION, 0),
(ObservationKeyPrefix.OBSERVATION, 1),
(ObservationKeyPrefix.NEXT_OBSERVATION, 0),
(ObservationKeyPrefix.NEXT_OBSERVATION, 1),
BufferKey.MEMORY,
BufferKey.MASKS,
BufferKey.DONE,
BufferKey.CONTINUOUS_ACTION,
BufferKey.DISCRETE_ACTION,
BufferKey.CONTINUOUS_LOG_PROBS,
BufferKey.DISCRETE_LOG_PROBS,
BufferKey.ACTION_MASK,
BufferKey.PREV_ACTION,
BufferKey.ENVIRONMENT_REWARDS,
]
wanted_keys = set(wanted_keys)
trajectory = make_fake_trajectory(

ml-agents/mlagents/trainers/tests/torch/test_ghost.py (23 lines changed)


from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
# Mock out reward signal eval
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
buffer["extrinsic_returns"] = buffer["environment_rewards"]
buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
buffer["curiosity_rewards"] = buffer["environment_rewards"]
buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
copy_buffer_fields(
buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.rewards_key("extrinsic"),
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.rewards_key("curiosity"),
RewardSignalUtil.returns_key("curiosity"),
RewardSignalUtil.value_estimates_key("curiosity"),
],
)
trainer.trainer.update_buffer = buffer
# when ghost trainer advance and wrapped trainer buffers full

ml-agents/mlagents/trainers/tests/torch/test_policy.py (15 lines changed)


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.buffer import BufferKey
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
agent_action = AgentAction.from_dict(buffer)
act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
agent_action = AgentAction.from_buffer(buffer)
ModelUtils.list_to_tensor(buffer["memory"][i])
for i in range(0, len(buffer["memory"]), policy.sequence_length)
ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
ModelUtils.list_to_tensor(buffer["memory"][i])
for i in range(0, len(buffer["memory"]), policy.sequence_length)
ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
]
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (68 lines changed)


from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import copy_buffer_fields
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ( # noqa: F401

)
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
@pytest.fixture

memory_size=optimizer.policy.m_size,
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
BufferKey.ENVIRONMENT_REWARDS,
[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
],
)
return_stats = optimizer.update(
update_buffer,

memory_size=optimizer.policy.m_size,
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("curiosity"),
RewardSignalUtil.value_estimates_key("curiosity"),
],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
],
)
update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
update_buffer[BufferKey.CONTINUOUS_ACTION]
)
optimizer.update(
update_buffer,

# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
copy_buffer_fields(
update_buffer,
src_key=BufferKey.ENVIRONMENT_REWARDS,
dst_keys=[
BufferKey.ADVANTAGES,
RewardSignalUtil.returns_key("extrinsic"),
RewardSignalUtil.value_estimates_key("extrinsic"),
RewardSignalUtil.returns_key("gail"),
RewardSignalUtil.value_estimates_key("gail"),
],
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (3 lines changed)


import numpy as np
import pytest
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import BufferKey
from mlagents.trainers.torch.components.reward_providers import (
CuriosityRewardProvider,
create_reward_provider,

for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["continuous_action"][0])
target = torch.tensor(buffer[BufferKey.CONTINUOUS_ACTION][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (18 lines changed)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.trajectory import ObsUtil

action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if behavior_spec.action_spec.continuous_size > 0:
action["continuous_action"] = action_buffer.continuous
action[BufferKey.CONTINUOUS_ACTION] = action_buffer.continuous
action["discrete_action"] = action_buffer.discrete
action[BufferKey.DISCRETE_ACTION] = action_buffer.discrete
for _ in range(number):
for i, obs in enumerate(curr_obs):

buffer["actions"].append(action)
# TODO
# buffer[AgentBufferKey.ACTIONS].append(action)
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)
# TODO was "rewards"
buffer[BufferKey.ENVIRONMENT_REWARDS].append(
np.ones(1, dtype=np.float32) * reward
)
buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
return buffer

ml-agents/mlagents/trainers/tests/torch/test_sac.py (13 lines changed)


import pytest
from mlagents.torch_utils import torch
from mlagents.trainers.buffer import BufferKey, RewardSignalUtil
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb

BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=24
)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
update_buffer["curiosity_rewards"] = update_buffer["environment_rewards"]
update_buffer[RewardSignalUtil.rewards_key("extrinsic")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
update_buffer[RewardSignalUtil.rewards_key("curiosity")] = update_buffer[
BufferKey.ENVIRONMENT_REWARDS
]
return_stats = optimizer.update_reward_signals(
{"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
)

ml-agents/mlagents/trainers/torch/action_log_probs.py (15 lines changed)


from typing import List, Optional, NamedTuple, Dict
from typing import List, Optional, NamedTuple
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents_envs.base_env import _ActionTupleBase

return torch.cat(self._to_tensor_list(), dim=1)
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
def from_buffer(buff: AgentBuffer) -> "ActionLogProbs":
"""
A static method that accesses continuous and discrete log probs fields in an AgentBuffer
and constructs the corresponding ActionLogProbs from the retrieved np arrays.

if "continuous_log_probs" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
if BufferKey.CONTINUOUS_LOG_PROBS in buff:
continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_LOG_PROBS])
if BufferKey.DISCRETE_LOG_PROBS in buff:
discrete_tensor = ModelUtils.list_to_tensor(
buff[BufferKey.DISCRETE_LOG_PROBS]
)
# This will keep discrete_list = None which enables flatten()
if discrete_tensor.shape[1] > 0:
discrete = [

ml-agents/mlagents/trainers/torch/agent_action.py (14 lines changed)


from typing import List, Optional, NamedTuple, Dict
from typing import List, Optional, NamedTuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.utils import ModelUtils
from mlagents_envs.base_env import ActionTuple

return action_tuple
@staticmethod
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
def from_buffer(buff: AgentBuffer) -> "AgentAction":
"""
A static method that accesses continuous and discrete action fields in an AgentBuffer
and constructs the corresponding AgentAction from the retrieved np arrays.

if "continuous_action" in buff:
continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
if "discrete_action" in buff:
if BufferKey.CONTINUOUS_ACTION in buff:
continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_ACTION])
if BufferKey.DISCRETE_ACTION in buff:
buff["discrete_action"], dtype=torch.long
buff[BufferKey.DISCRETE_ACTION], dtype=torch.long
)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])

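A minimal sketch of the renamed constructor (ActionLogProbs.from_buffer in the previous file behaves the same way); the continuous_tensor attribute name comes from the AgentAction NamedTuple, and only a continuous action field is assumed present:
import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.agent_action import AgentAction

buffer = AgentBuffer()
buffer[BufferKey.CONTINUOUS_ACTION].extend(np.zeros((3, 2), dtype=np.float32))
actions = AgentAction.from_buffer(buffer)  # typed lookup instead of buff["continuous_action"]
print(actions.continuous_tensor.shape)     # torch.Size([3, 2])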
ml-agents/mlagents/trainers/torch/components/bc/module.py (2 lines changed)


# Convert to tensors
tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
act_masks = None
expert_actions = AgentAction.from_dict(mini_batch_demo)
expert_actions = AgentAction.from_buffer(mini_batch_demo)
if self.policy.behavior_spec.action_spec.discrete_size > 0:
act_masks = ModelUtils.list_to_tensor(

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (16 lines changed)


from typing import Dict, NamedTuple
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

Uses the current state embedding and the action of the mini_batch to predict
the next state embedding.
"""
actions = AgentAction.from_dict(mini_batch)
actions = AgentAction.from_buffer(mini_batch)
flattened_action = self._action_flattener.forward(actions)
forward_model_input = torch.cat(
(self.get_current_state(mini_batch), flattened_action), dim=1

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
actions = AgentAction.from_dict(mini_batch)
actions = AgentAction.from_buffer(mini_batch)
_inverse_loss = 0
if self._action_spec.continuous_size > 0:
sq_difference = (

_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
sq_difference,
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
ModelUtils.list_to_tensor(
mini_batch[BufferKey.MASKS], dtype=torch.float
),
2,
)[1]
)

ModelUtils.dynamic_partition(
cross_entropy,
ModelUtils.list_to_tensor(
mini_batch["masks"], dtype=torch.float
mini_batch[BufferKey.MASKS], dtype=torch.float
), # use masks not action_masks
2,
)[1]

return torch.mean(
ModelUtils.dynamic_partition(
self.compute_reward(mini_batch),
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
ModelUtils.list_to_tensor(
mini_batch[BufferKey.MASKS], dtype=torch.float
),
2,
)[1]
)
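The dynamic_partition calls above compute a masked mean over BufferKey.MASKS (the inline comment stresses these are loss masks, not action masks). In plain torch the same reduction looks roughly like this:
import torch

per_step_loss = torch.tensor([0.5, 0.7, 0.9, 0.3])
masks = torch.tensor([1.0, 1.0, 0.0, 1.0])  # BufferKey.MASKS
# dynamic_partition(per_step_loss, masks, 2)[1] keeps the entries whose mask is 1;
# averaging them gives the masked loss used above.
masked_loss = per_step_loss[masks.bool()].mean()
print(masked_loss)  # tensor(0.5000)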

ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py (4 lines changed)


import numpy as np
from typing import Dict
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
return np.array(mini_batch["environment_rewards"], dtype=np.float32)
return np.array(mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
return {}

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (12 lines changed)


import numpy as np
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
BaseRewardProvider,
)

Creates the action Tensor. In continuous case, corresponds to the action. In
the discrete case, corresponds to the concatenation of one hot action Tensors.
"""
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
return self._action_flattener.forward(AgentAction.from_buffer(mini_batch))
def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
"""

inputs = self.get_state_inputs(mini_batch)
if self._settings.use_actions:
actions = self.get_action_input(mini_batch)
dones = torch.as_tensor(mini_batch["done"], dtype=torch.float).unsqueeze(1)
dones = torch.as_tensor(
mini_batch[BufferKey.DONE], dtype=torch.float
).unsqueeze(1)
action_inputs = torch.cat([actions, dones], dim=1)
hidden, _ = self.encoder(inputs, action_inputs)
else:

expert_action = self.get_action_input(expert_batch)
action_epsilon = torch.rand(policy_action.shape)
policy_dones = torch.as_tensor(
policy_batch["done"], dtype=torch.float
policy_batch[BufferKey.DONE], dtype=torch.float
expert_batch["done"], dtype=torch.float
expert_batch[BufferKey.DONE], dtype=torch.float
).unsqueeze(1)
dones_epsilon = torch.rand(policy_dones.shape)
action_inputs = torch.cat(

ml-agents/mlagents/trainers/trajectory.py (43 lines changed)


from typing import List, NamedTuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import (
AgentBuffer,
ObservationKeyPrefix,
AgentBufferKey,
BufferKey,
)
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple

class ObsUtil:
@staticmethod
def get_name_at(index: int) -> str:
def get_name_at(index: int) -> AgentBufferKey:
return f"obs_{index}"
return ObservationKeyPrefix.OBSERVATION, index
def get_name_at_next(index: int) -> str:
def get_name_at_next(index: int) -> AgentBufferKey:
return f"next_obs_{index}"
return ObservationKeyPrefix.NEXT_OBSERVATION, index
@staticmethod
def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]:

agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i])
if exp.memory is not None:
agent_buffer_trajectory["memory"].append(exp.memory)
agent_buffer_trajectory[BufferKey.MEMORY].append(exp.memory)
agent_buffer_trajectory["masks"].append(1.0)
agent_buffer_trajectory["done"].append(exp.done)
agent_buffer_trajectory[BufferKey.MASKS].append(1.0)
agent_buffer_trajectory[BufferKey.DONE].append(exp.done)
agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
agent_buffer_trajectory["continuous_log_probs"].append(
agent_buffer_trajectory[BufferKey.CONTINUOUS_ACTION].append(
exp.action.continuous
)
agent_buffer_trajectory[BufferKey.DISCRETE_ACTION].append(
exp.action.discrete
)
agent_buffer_trajectory[BufferKey.CONTINUOUS_LOG_PROBS].append(
agent_buffer_trajectory["discrete_log_probs"].append(
agent_buffer_trajectory[BufferKey.DISCRETE_LOG_PROBS].append(
exp.action_probs.discrete
)

mask = 1 - np.concatenate(exp.action_mask)
agent_buffer_trajectory["action_mask"].append(mask, padding_value=1)
agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
mask, padding_value=1
)
agent_buffer_trajectory["action_mask"].append(
agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
agent_buffer_trajectory[BufferKey.PREV_ACTION].append(exp.prev_action)
agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS].append(exp.reward)
# Store the next visual obs as the current
obs = next_obs
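With this change, observation fields are addressed by (prefix, index) tuples rather than formatted strings; a quick sketch of the new ObsUtil behavior:
from mlagents.trainers.buffer import ObservationKeyPrefix
from mlagents.trainers.trajectory import ObsUtil

assert ObsUtil.get_name_at(0) == (ObservationKeyPrefix.OBSERVATION, 0)            # was "obs_0"
assert ObsUtil.get_name_at_next(1) == (ObservationKeyPrefix.NEXT_OBSERVATION, 1)  # was "next_obs_1"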
