
Move the Critic into the Optimizer (#4939)

Co-authored-by: Ervin Teng <ervin@unity3d.com>
/develop/gail-srl-hack
GitHub, 3 years ago
Current commit
338af2ec
20 files changed, with 421 insertions and 331 deletions
  1. ml-agents/mlagents/trainers/action_info.py (12 changed lines)
  2. ml-agents/mlagents/trainers/agent_processor.py (2 changed lines)
  3. ml-agents/mlagents/trainers/buffer.py (1 changed line)
  4. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (140 changed lines)
  5. ml-agents/mlagents/trainers/policy/policy.py (15 changed lines)
  6. ml-agents/mlagents/trainers/policy/torch_policy.py (62 changed lines)
  7. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (39 changed lines)
  8. ml-agents/mlagents/trainers/ppo/trainer.py (4 changed lines)
  9. ml-agents/mlagents/trainers/sac/optimizer_torch.py (96 changed lines)
  10. ml-agents/mlagents/trainers/sac/trainer.py (9 changed lines)
  11. ml-agents/mlagents/trainers/tests/test_agent_processor.py (33 changed lines)
  12. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (4 changed lines)
  13. ml-agents/mlagents/trainers/tests/torch/test_networks.py (23 changed lines)
  14. ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changed lines)
  15. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (12 changed lines)
  16. ml-agents/mlagents/trainers/tests/torch/test_sac.py (4 changed lines)
  17. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (6 changed lines)
  18. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changed lines)
  19. ml-agents/mlagents/trainers/torch/model_serialization.py (2 changed lines)
  20. ml-agents/mlagents/trainers/torch/networks.py (282 changed lines)

ml-agents/mlagents/trainers/action_info.py (12 changed lines)


class ActionInfo(NamedTuple):
"""
A NamedTuple containing actions and quantities related to the policy forward
pass. Additionally contains the agent ids in the corresponding DecisionStep.
:param action: The action output of the policy
:param env_action: The possibly clipped action to be executed in the environment
:param outputs: Dict of all quantities associated with the policy forward pass
:param agent_ids: List of int agent ids in DecisionStep
"""
value: Any
return ActionInfo([], [], [], {}, [])
return ActionInfo([], [], {}, [])
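For reference, a minimal self-contained sketch of the slimmed-down NamedTuple this hunk implies (field types assumed): with the value field gone, the empty ActionInfo carries four entries instead of five.

from typing import Any, Dict, List, NamedTuple

class ActionInfo(NamedTuple):
    action: Any
    env_action: Any
    outputs: Dict[str, Any]
    agent_ids: List[Any]

    @staticmethod
    def empty() -> "ActionInfo":
        # Four fields now that the critic's value estimate is no longer attached here.
        return ActionInfo([], [], {}, [])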

ml-agents/mlagents/trainers/agent_processor.py (2 changed lines)


if stored_decision_step is not None and stored_take_action_outputs is not None:
obs = stored_decision_step.obs
if self.policy.use_recurrent:
memory = self.policy.retrieve_memories([global_id])[0, :]
memory = self.policy.retrieve_previous_memories([global_id])[0, :]
else:
memory = None
done = terminated # Since this is an ongoing step

ml-agents/mlagents/trainers/buffer.py (1 changed line)


ENVIRONMENT_REWARDS = "environment_rewards"
MASKS = "masks"
MEMORY = "memory"
CRITIC_MEMORY = "critic_memory"
PREV_ACTION = "prev_action"
ADVANTAGES = "advantages"
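Illustrative sketch (toy sizes, assuming the AgentBuffer/BufferKey API used elsewhere in this diff) of how a trainer fills the new CRITIC_MEMORY field alongside MEMORY, one entry per step:

import numpy as np
from mlagents.trainers.buffer import AgentBuffer, BufferKey

buffer = AgentBuffer()
for _ in range(3):
    buffer[BufferKey.MEMORY].append(np.zeros(8, dtype=np.float32))         # actor LSTM state
    buffer[BufferKey.CRITIC_MEMORY].append(np.zeros(8, dtype=np.float32))  # critic LSTM state
assert len(buffer[BufferKey.CRITIC_MEMORY]) == 3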

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (140 changed lines)


from typing import Dict, Optional, Tuple, List
from mlagents.torch_utils import torch
import numpy as np
import math
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.buffer import AgentBuffer, AgentBufferField
from mlagents.trainers.trajectory import ObsUtil
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

self.global_step = torch.tensor(0)
self.bc_module: Optional[BCModule] = None
self.create_reward_signals(trainer_settings.reward_signals)
self.critic_memory_dict: Dict[str, torch.Tensor] = {}
if trainer_settings.behavioral_cloning is not None:
self.bc_module = BCModule(
self.policy,

default_num_epoch=3,
)
@property
def critic(self):
raise NotImplementedError
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
pass

reward_signal, self.policy.behavior_spec, settings
)
def _evaluate_by_sequence(
self, tensor_obs: List[torch.Tensor], initial_memory: np.ndarray
) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]:
"""
Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
intermediate memories for the critic.
:param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
observations for this trajectory.
:param initial_memory: The memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
what is returned as the output of a MemoryModule.
:return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial
memories to be used during value function update, and the final memory at the end of the trajectory.
"""
num_experiences = tensor_obs[0].shape[0]
all_next_memories = AgentBufferField()
# In the buffer, the first sequence is the one that is padded. So if seq_len = 3 and
# the trajectory is of length 10, the first sequence is [pad, pad, obs].
# Compute the number of elements in this padded sequence.
leftover = num_experiences % self.policy.sequence_length
# Compute values for the potentially truncated initial sequence
seq_obs = []
first_seq_len = self.policy.sequence_length
for _obs in tensor_obs:
if leftover > 0:
first_seq_len = leftover
first_seq_obs = _obs[0:first_seq_len]
seq_obs.append(first_seq_obs)
# For the first sequence, the initial memory should be the one at the
# beginning of this trajectory.
for _ in range(first_seq_len):
all_next_memories.append(initial_memory.squeeze().detach().numpy())
init_values, _mem = self.critic.critic_pass(
seq_obs, initial_memory, sequence_length=first_seq_len
)
all_values = {
signal_name: [init_values[signal_name]]
for signal_name in init_values.keys()
}
# Evaluate the remaining sequences, carrying over _mem after each
# sequence
for seq_num in range(
1, math.ceil((num_experiences) / (self.policy.sequence_length))
):
seq_obs = []
for _ in range(self.policy.sequence_length):
all_next_memories.append(_mem.squeeze().detach().numpy())
for _obs in tensor_obs:
start = seq_num * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
end = (seq_num + 1) * self.policy.sequence_length - (
self.policy.sequence_length - leftover
)
seq_obs.append(_obs[start:end])
values, _mem = self.critic.critic_pass(
seq_obs, _mem, sequence_length=self.policy.sequence_length
)
for signal_name, _val in values.items():
all_values[signal_name].append(_val)
# Create one tensor per reward signal
all_value_tensors = {
signal_name: torch.cat(value_list, dim=0)
for signal_name, value_list in all_values.items()
}
next_mem = _mem
return all_value_tensors, all_next_memories, next_mem
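A worked example of the index arithmetic above (illustrative numbers, not taken from the diff): with sequence_length = 3 and a 10-step trajectory, leftover = 1, so the first front-padded sequence covers obs[0:1] and the remaining full sequences cover [1:4], [4:7], and [7:10], each one carrying the critic memory forward.

import math

seq_len, traj_len = 3, 10
leftover = traj_len % seq_len  # 1
bounds = [(0, leftover)] + [
    (n * seq_len - (seq_len - leftover), (n + 1) * seq_len - (seq_len - leftover))
    for n in range(1, math.ceil(traj_len / seq_len))
]
assert bounds == [(0, 1), (1, 4), (4, 7), (7, 10)]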
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
self,
batch: AgentBuffer,
next_obs: List[np.ndarray],
done: bool,
agent_id: str = "",
) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
"""
Get value estimates and memories for a trajectory, in batch form.
:param batch: An AgentBuffer that consists of a trajectory.
:param next_obs: the next observation (after the trajectory). Used for bootstrapping
if this is not a terminal trajectory.
:param done: Set true if this is a terminal trajectory.
:param agent_id: Agent ID of the agent that this trajectory belongs to.
:returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
the final value estimate as a Dict of [name, float], and optionally (if using memories)
an AgentBufferField of initial critic memories to be used during update.
"""
current_obs = ObsUtil.from_buffer(batch, n_obs)
if agent_id in self.critic_memory_dict:
memory = self.critic_memory_dict[agent_id]
else:
memory = (
torch.zeros((1, 1, self.critic.memory_size))
if self.policy.use_recurrent
else None
)
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
current_obs = [
ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
]
memory = torch.zeros([1, 1, self.policy.m_size])
next_obs = [obs.unsqueeze(0) for obs in next_obs]
next_obs = [obs.unsqueeze(0) for obs in next_obs]
# If we're using LSTM, we want to get all the intermediate memories.
all_next_memories: Optional[AgentBufferField] = None
if self.policy.use_recurrent:
(
value_estimates,
all_next_memories,
next_memory,
) = self._evaluate_by_sequence(current_obs, memory)
else:
value_estimates, next_memory = self.critic.critic_pass(
current_obs, memory, sequence_length=batch.num_experiences
)
value_estimates, next_memory = self.policy.actor_critic.critic_pass(
current_obs, memory, sequence_length=batch.num_experiences
)
# Store the memory for the next trajectory
self.critic_memory_dict[agent_id] = next_memory
next_value_estimate, _ = self.policy.actor_critic.critic_pass(
next_value_estimate, _ = self.critic.critic_pass(
next_obs, next_memory, sequence_length=1
)

for k in next_value_estimate:
if not self.reward_signals[k].ignore_done:
next_value_estimate[k] = 0.0
return value_estimates, next_value_estimate
if agent_id in self.critic_memory_dict:
self.critic_memory_dict.pop(agent_id)
return value_estimates, next_value_estimate, all_next_memories
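The per-agent critic memory handling above boils down to the following minimal sketch (hypothetical standalone helpers, shapes illustrative): reuse the stored memory for an agent we have seen before, start from zeros otherwise, and drop the entry once the trajectory is terminal so the next episode starts fresh.

from typing import Dict
import torch

critic_memory_dict: Dict[str, torch.Tensor] = {}

def initial_memory(agent_id: str, mem_size: int) -> torch.Tensor:
    # Stored memory if available, otherwise a zeroed (1, 1, mem_size) tensor.
    return critic_memory_dict.get(agent_id, torch.zeros((1, 1, mem_size)))

def store_or_clear(agent_id: str, next_memory: torch.Tensor, done: bool) -> None:
    critic_memory_dict[agent_id] = next_memory
    if done:
        critic_memory_dict.pop(agent_id, None)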

ml-agents/mlagents/trainers/policy/policy.py (15 changed lines)


self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
self.previous_action_dict: Dict[str, np.ndarray] = {}
self.previous_memory_dict: Dict[str, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

if memory_matrix is None:
return
# Pass old memories into previous_memory_dict
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.previous_memory_dict[agent_id] = self.memory_dict[agent_id]
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]

memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def retrieve_previous_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_memory_dict:
memory_matrix[index, :] = self.previous_memory_dict[agent_id]
return memory_matrix
if agent_id in self.previous_memory_dict:
self.previous_memory_dict.pop(agent_id)
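Usage sketch of the new previous-memory bookkeeping, re-created here with plain dictionaries (illustrative agent id and values): each save shifts the currently stored memory into the previous slot before overwriting, so the previous memories are always one step behind.

import numpy as np

memory_dict, previous_memory_dict = {}, {}

def save_memories(agent_ids, memory_matrix):
    for agent_id in agent_ids:
        if agent_id in memory_dict:
            previous_memory_dict[agent_id] = memory_dict[agent_id]
    for index, agent_id in enumerate(agent_ids):
        memory_dict[agent_id] = memory_matrix[index, :]

save_memories(["agent-0"], np.ones((1, 4), dtype=np.float32))       # step 1
save_memories(["agent-0"], np.full((1, 4), 2.0, dtype=np.float32))  # step 2
assert (previous_memory_dict["agent-0"] == 1.0).all()               # one step behind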
def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
"""

ml-agents/mlagents/trainers/policy/torch_policy.py (62 changed lines)


from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.torch.networks import (
SharedActorCritic,
SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.networks import SimpleActor, SharedActorCritic, GlobalSteps
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.buffer import AgentBuffer

) # could be much simpler if TorchPolicy is nn.Module
self.grads = None
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
ac_class = SeparateActorCritic
self.actor = SimpleActor(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
self.shared_critic = False
ac_class = SharedActorCritic
self.actor_critic = ac_class(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [
key.value for key, _ in reward_signal_configs.items()
]
self.actor = SharedActorCritic(
observation_specs=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
self.shared_critic = True
self.m_size = self.actor_critic.memory_size
self.m_size = self.actor.memory_size
self.actor_critic.to(default_device())
self.actor.to(default_device())
self._clip_action = not tanh_squash
@property

"""
if self.normalize:
self.actor_critic.update_normalization(buffer)
self.actor.update_normalization(buffer)
@timed
def sample_actions(

:param seq_len: Sequence length when using RNN.
:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
"""
actions, log_probs, entropies, memories = self.actor_critic.get_action_stats(
actions, log_probs, entropies, memories = self.actor.get_action_and_stats(
obs, masks, memories, seq_len
)
return (actions, log_probs, entropies, memories)

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
) -> Tuple[ActionLogProbs, torch.Tensor]:
log_probs, entropies = self.actor.get_stats(
return log_probs, entropies, value_heads
return log_probs, entropies
@timed
def evaluate(

return ActionInfo(
action=run_out.get("action"),
env_action=run_out.get("env_action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
)

return self.get_current_step()
def load_weights(self, values: List[np.ndarray]) -> None:
self.actor_critic.load_state_dict(values)
self.actor.load_state_dict(values)
return copy.deepcopy(self.actor_critic.state_dict())
return copy.deepcopy(self.actor.state_dict())
return {"Policy": self.actor_critic, "global_step": self.global_step}
return {"Policy": self.actor, "global_step": self.global_step}

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (39 changed lines)


from typing import Dict, cast
from mlagents.torch_utils import torch
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil

from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.networks import ValueNetwork
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils

# Create the graph here to give more granular control of the TF graph to the Optimizer.
super().__init__(policy, trainer_settings)
params = list(self.policy.actor_critic.parameters())
reward_signal_configs = trainer_settings.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
if policy.shared_critic:
self._critic = policy.actor
else:
self._critic = ValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
)
self._critic.to(default_device())
params = list(self.policy.actor.parameters()) + list(self._critic.parameters())
self.hyperparameters: PPOSettings = cast(
PPOSettings, trainer_settings.hyperparameters
)

self.stream_names = list(self.reward_signals.keys())
@property
def critic(self):
return self._critic
def ppo_value_loss(
self,
values: Dict[str, torch.Tensor],

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = self.policy.evaluate_actions(
# Get value memories
value_memories = [
ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
for i in range(
0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
)
]
if len(value_memories) > 0:
value_memories = torch.stack(value_memories).unsqueeze(0)
log_probs, entropy = self.policy.evaluate_actions(
)
values, _ = self.critic.critic_pass(
current_obs,
memories=value_memories,
sequence_length=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
log_probs = log_probs.flatten()
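A short sketch of the consequence of the parameter list above (toy modules, assumed optimizer setup): because the critic now lives in the optimizer, a single Adam instance updates actor and critic weights together.

import torch

actor = torch.nn.Linear(8, 2)   # stand-in for policy.actor
critic = torch.nn.Linear(8, 1)  # stand-in for the optimizer-owned ValueNetwork

params = list(actor.parameters()) + list(critic.parameters())
optimizer = torch.optim.Adam(params, lr=3e-4)  # lr would come from the PPO hyperparameters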

ml-agents/mlagents/trainers/ppo/trainer.py (4 changed lines)


self.policy.update_normalization(agent_buffer_trajectory)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
value_estimates, value_next, value_memories = self.optimizer.get_trajectory_value_estimates(
if value_memories is not None:
agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
for name, v in value_estimates.items():
agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(

ml-agents/mlagents/trainers/sac/optimizer_torch.py (96 changed lines)


def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
super().__init__(policy, trainer_params)
reward_signal_configs = trainer_params.reward_signals
reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]
if policy.shared_critic:
raise UnityTrainerException("SAC does not support SharedActorCritic")
self._critic = ValueNetwork(
reward_signal_names,
policy.behavior_spec.observation_specs,
policy.network_settings,
)
hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
self.tau = hyperparameters.tau
self.init_entcoef = hyperparameters.init_entcoef

}
self._action_spec = self.policy.behavior_spec.action_spec
self.value_network = TorchSACOptimizer.PolicyValueNetwork(
self.q_network = TorchSACOptimizer.PolicyValueNetwork(
self.stream_names,
self.policy.behavior_spec.observation_specs,
policy_network_settings,

self.policy.behavior_spec.observation_specs,
policy_network_settings,
)
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.target_network, 1.0
)
ModelUtils.soft_update(self._critic, self.target_network, 1.0)
# We create one entropy coefficient per action, whether discrete or continuous.
_disc_log_ent_coef = torch.nn.Parameter(

self.target_entropy = TorchSACOptimizer.TargetEntropy(
continuous=_cont_target, discrete=_disc_target
)
policy_params = list(self.policy.actor_critic.network_body.parameters()) + list(
self.policy.actor_critic.action_model.parameters()
)
value_params = list(self.value_network.parameters()) + list(
self.policy.actor_critic.critic.parameters()
policy_params = list(self.policy.actor.parameters())
value_params = list(self.q_network.parameters()) + list(
self._critic.parameters()
)
logger.debug("value_vars")

)
self._move_to_device(default_device())
@property
def critic(self):
return self._critic
self.value_network.to(device)
self._critic.to(device)
self.q_network.to(device)
def sac_q_loss(
self,

for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
]
# LSTM shouldn't have a sequence length < 1, but guard against indexing out of range if it does.
value_memories_list = [
ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
for i in range(
0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
)
]
next_memories_list = [
next_value_memories_list = [
batch[BufferKey.MEMORY][i][self.policy.m_size // 2 :]
batch[BufferKey.CRITIC_MEMORY][i]
offset, len(batch[BufferKey.MEMORY]), self.policy.sequence_length
offset, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
next_memories = torch.stack(next_memories_list).unsqueeze(0)
value_memories = torch.stack(value_memories_list).unsqueeze(0)
next_value_memories = torch.stack(next_value_memories_list).unsqueeze(0)
next_memories = None
# Q network memories are 0'ed out, since we don't have them during inference.
value_memories = None
next_value_memories = None
# Q and V network memories are 0'ed out, since we don't have them during inference.
torch.zeros_like(next_memories) if next_memories is not None else None
torch.zeros_like(next_value_memories)
if next_value_memories is not None
else None
self.value_network.q1_network.network_body.copy_normalization(
self.policy.actor_critic.network_body
self.q_network.q1_network.network_body.copy_normalization(
self.policy.actor.network_body
self.value_network.q2_network.network_body.copy_normalization(
self.policy.actor_critic.network_body
self.q_network.q2_network.network_body.copy_normalization(
self.policy.actor.network_body
self.policy.actor_critic.network_body
self.policy.actor.network_body
(
sampled_actions,
log_probs,
_,
value_estimates,
_,
) = self.policy.actor_critic.get_action_stats_and_value(
self._critic.network_body.copy_normalization(self.policy.actor.network_body)
sampled_actions, log_probs, _, _, = self.policy.actor.get_action_and_stats(
value_estimates, _ = self._critic.critic_pass(
current_obs, value_memories, sequence_length=self.policy.sequence_length
)
q1p_out, q2p_out = self.value_network(
q1p_out, q2p_out = self.q_network(
current_obs,
cont_sampled_actions,
memories=q_memories,

q1_out, q2_out = self.value_network(
q1_out, q2_out = self.q_network(
current_obs,
cont_actions,
memories=q_memories,

with torch.no_grad():
target_values, _ = self.target_network(
next_obs,
memories=next_memories,
memories=next_value_memories,
sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)

policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
entropy_loss = self.sac_entropy_loss(log_probs, masks)
total_value_loss = q1_loss + q2_loss + value_loss
total_value_loss = q1_loss + q2_loss
if self.policy.shared_critic:
policy_loss += value_loss
else:
total_value_loss += value_loss
decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)

self.entropy_optimizer.step()
# Update target network
ModelUtils.soft_update(
self.policy.actor_critic.critic, self.target_network, self.tau
)
ModelUtils.soft_update(self._critic, self.target_network, self.tau)
update_stats = {
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),

def get_modules(self):
modules = {
"Optimizer:value_network": self.value_network,
"Optimizer:value_network": self.q_network,
"Optimizer:target_network": self.target_network,
"Optimizer:policy_optimizer": self.policy_optimizer,
"Optimizer:value_optimizer": self.value_optimizer,

ml-agents/mlagents/trainers/sac/trainer.py (9 changed lines)


self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
# Get all value estimates for reporting purposes
value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
(
value_estimates,
_,
value_memories,
) = self.optimizer.get_trajectory_value_estimates(
if value_memories is not None:
agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
for name, v in value_estimates.items():
self._stats_reporter.add_stat(
f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",

ml-agents/mlagents/trainers/tests/test_agent_processor.py (33 changed lines)


def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
mock_policy.retrieve_previous_memories.return_value = np.zeros(
(1, 1), dtype=np.float32
)
mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
return mock_policy

)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
"action": ActionTuple(continuous=np.array([[0.1], [0.1]], dtype=np.float32)),
"log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
"log_probs": LogProbsTuple(
continuous=np.array([[0.1], [0.1]], dtype=np.float32)
),
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

action_spec=ActionSpec.create_continuous(2),
)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
value=[0.1, 0.1],
action=ActionTuple(continuous=np.array([[0.1], [0.1]], dtype=np.float32)),
env_action=ActionTuple(continuous=np.array([[0.1], [0.1]], dtype=np.float32)),
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,
)

stats_reporter=StatsReporter("testcat"),
)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1]])),
"action": ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]], dtype=np.float32)),
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(

done=True,
)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
env_action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,
)

stats_reporter=StatsReporter("testcat"),
)
fake_action_outputs = {
"action": ActionTuple(continuous=np.array([[0.1]])),
"action": ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
"log_probs": LogProbsTuple(continuous=np.array([[0.1]], dtype=np.float32)),
}
mock_decision_step, mock_terminal_step = mb.create_mock_steps(

)
fake_action_info = ActionInfo(
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
env_action=ActionTuple(continuous=np.array([[0.1]], dtype=np.float32)),
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,
)

ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (4 changed lines)


"""
Make sure two policies have the same output for the same input.
"""
policy1.actor_critic = policy1.actor_critic.to(default_device())
policy2.actor_critic = policy2.actor_critic.to(default_device())
policy1.actor = policy1.actor.to(default_device())
policy2.actor = policy2.actor.to(default_device())
decision_step, _ = mb.create_steps_from_behavior_spec(
policy1.behavior_spec, num_agents=1

ml-agents/mlagents/trainers/tests/torch/test_networks.py (23 changed lines)


from mlagents.trainers.torch.networks import (
NetworkBody,
ValueNetwork,
SimpleActor,
SeparateActorCritic,
)
from mlagents.trainers.settings import NetworkSettings
from mlagents_envs.base_env import ActionSpec

assert _out[0] == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("ac_type", [SharedActorCritic, SeparateActorCritic])
@pytest.mark.parametrize("shared", [True, False])
def test_actor_critic(ac_type, lstm):
def test_actor_critic(lstm, shared):
obs_size = 4
network_settings = NetworkSettings(
memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True

stream_names = [f"stream_name{n}" for n in range(4)]
# action_spec = ActionSpec.create_continuous(act_size[0])
action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
if shared:
actor = critic = SharedActorCritic(
obs_spec, network_settings, action_spec, stream_names, network_settings
)
else:
actor = SimpleActor(obs_spec, network_settings, action_spec)
critic = ValueNetwork(stream_names, obs_spec, network_settings)
if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
memories = torch.ones(

# memories isn't always set to None; the network should be able to
# deal with that.
# Test critic pass
value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
value_out, memories_out = critic.critic_pass([sample_obs], memories=memories)
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)

# Test get_action_and_stats
action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
action, log_probs, entropies, mem_out = actor.get_action_and_stats(
[sample_obs], memories=memories, masks=mask
)
if lstm:

if mem_out is not None:
assert mem_out.shape == memories.shape
for stream in stream_names:
if lstm:
assert value_out[stream].shape == (network_settings.memory.sequence_length,)
else:
assert value_out[stream].shape == (1,)

ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changed lines)


if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
log_probs, entropy, values = policy.evaluate_actions(
log_probs, entropy = policy.evaluate_actions(
tensor_obs,
masks=act_masks,
actions=agent_action,

assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (12 changed lines)


RewardSignalUtil.value_estimates_key("extrinsic"),
],
)
# Copy memories to critic memories
copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])
return_stats = optimizer.update(
update_buffer,

RewardSignalUtil.value_estimates_key("curiosity"),
],
)
# Copy memories to critic memories
copy_buffer_fields(update_buffer, BufferKey.MEMORY, [BufferKey.CRITIC_MEMORY])
optimizer.update(
update_buffer,

action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
max_step_complete=True,
)
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
run_out, final_value_out, all_memories = optimizer.get_trajectory_value_estimates(
if all_memories is not None:
assert len(all_memories) == 15
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=True
)
for key, val in final_value_out.items():

# Check if we ignore terminal states properly
optimizer.reward_signals["extrinsic"].use_terminal_states = False
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
run_out, final_value_out, _ = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False
)
for key, val in final_value_out.items():

ml-agents/mlagents/trainers/tests/torch/test_sac.py (4 changed lines)


)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=24
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec, memory_size=12
# Mock out value memories
update_buffer[BufferKey.CRITIC_MEMORY] = update_buffer[BufferKey.MEMORY]
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (6 changed lines)


new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
batch_size=256,
learning_rate=1e-4,
learning_rate=3e-4,
buffer_init_steps=1000,
steps_per_update=2,
)

network_settings=new_networksettings,
max_steps=4000,
)
check_environment_trains(env, {BRAIN_NAME: config}, training_seed=1213)
check_environment_trains(env, {BRAIN_NAME: config}, training_seed=1337)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])

num_visual=1,
num_vector=0,
action_sizes=action_sizes,
step_size=0.2,
step_size=0.3,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {

ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changed lines)


self.decay_learning_rate = ModelUtils.DecayedValue(
learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps
)
params = self.policy.actor_critic.parameters()
params = self.policy.actor.parameters()
self.optimizer = torch.optim.Adam(params, lr=self.current_lr)
_, self.demonstration_buffer = demo_to_buffer(
settings.demo_path, policy.sequence_length, policy.behavior_spec

ml-agents/mlagents/trainers/torch/model_serialization.py (2 changed lines)


with exporting_to_onnx():
torch.onnx.export(
self.policy.actor_critic,
self.policy.actor,
self.dummy_input,
onnx_output_path,
opset_version=SerializationSettings.onnx_opset,

ml-agents/mlagents/trainers/torch/networks.py (282 changed lines)


return encoding, memories
class ValueNetwork(nn.Module):
class Critic(abc.ABC):
@abc.abstractmethod
def update_normalization(self, buffer: AgentBuffer) -> None:
"""
Updates the normalization of this Critic based on the provided AgentBuffer.
:param buffer: An AgentBuffer of data to use for updating normalization.
"""
pass
def critic_pass(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
"""
Get value outputs for the given obs.
:param inputs: List of inputs as tensors.
:param memories: Tensor of memories, if using memory. Otherwise, None.
:returns: Dict of reward stream to output tensor for values.
"""
pass
class ValueNetwork(nn.Module, Critic):
def __init__(
self,
stream_names: List[str],

encoding_size = network_settings.hidden_units
self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream)
def update_normalization(self, buffer: AgentBuffer) -> None:
self.network_body.update_normalization(buffer)
def critic_pass(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
value_outputs, critic_mem_out = self.forward(
inputs, memories=memories, sequence_length=sequence_length
)
return value_outputs, critic_mem_out
def forward(
self,
inputs: List[torch.Tensor],

"""
pass
def get_action_stats(
def get_action_and_stats(
self,
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,

"""
Returns sampled actions.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param inputs: A List of inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.

pass
@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
var_len_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[Union[int, torch.Tensor], ...]:
"""
Forward pass of the Actor for inference. This is required for export to ONNX, and
the inputs and outputs of this method should not be changed without a respective change
in the ONNX export code.
"""
pass
class ActorCritic(Actor):
@abc.abstractmethod
def critic_pass(
def get_stats(
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
"""
Get value outputs for the given obs.
:param inputs: List of inputs as tensors.
:param memories: Tensor of memories, if using memory. Otherwise, None.
:returns: Dict of reward stream to output tensor for values.
"""
pass
@abc.abstractmethod
def get_action_stats_and_value(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
) -> Tuple[ActionLogProbs, torch.Tensor]:
Returns sampled actions and value estimates.
Returns log_probs for actions and entropies.
:param inputs: A List of vector inputs as tensors.
:param inputs: A List of inputs as tensors.
:param actions: AgentAction of actions.
:return: A Tuple of AgentAction, ActionLogProbs, entropies, Dict of reward signal
name to value estimate, and memories. Memories will be None if not using memory.
:return: A Tuple of AgentAction, ActionLogProbs, entropies, and memories.
Memories will be None if not using memory.
@abc.abstractproperty
def memory_size(self):
@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
var_len_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
) -> Tuple[Union[int, torch.Tensor], ...]:
Returns the size of the memory (same size used as input and output in the other
methods) used by this Actor.
Forward pass of the Actor for inference. This is required for export to ONNX, and
the inputs and outputs of this method should not be changed without a respective change
in the ONNX export code.
"""
pass

def update_normalization(self, buffer: AgentBuffer) -> None:
self.network_body.update_normalization(buffer)
def get_action_stats(
def get_action_and_stats(
self,
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,

action, log_probs, entropies = self.action_model(encoding, masks)
return action, log_probs, entropies, memories
def get_stats(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor]:
encoding, actor_mem_outs = self.network_body(
inputs, memories=memories, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
return log_probs, entropies
def forward(
self,
vec_inputs: List[torch.Tensor],

return tuple(export_out)
class SharedActorCritic(SimpleActor, ActorCritic):
class SharedActorCritic(SimpleActor, Critic):
def __init__(
self,
observation_specs: List[ObservationSpec],

inputs, memories=memories, sequence_length=sequence_length
)
return self.value_heads(encoding), memories_out
def get_stats_and_value(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
encoding, memories = self.network_body(
inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs = self.value_heads(encoding)
return action, log_probs, entropies, value_outputs, memories
class SeparateActorCritic(SimpleActor, ActorCritic):
def __init__(
self,
observation_specs: List[ObservationSpec],
network_settings: NetworkSettings,
action_spec: ActionSpec,
stream_names: List[str],
conditional_sigma: bool = False,
tanh_squash: bool = False,
):
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_specs,
network_settings,
action_spec,
conditional_sigma,
tanh_squash,
)
self.stream_names = stream_names
self.critic = ValueNetwork(stream_names, observation_specs, network_settings)
@property
def memory_size(self) -> int:
return self.network_body.memory_size + self.critic.memory_size
def _get_actor_critic_mem(
self, memories: Optional[torch.Tensor] = None
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
if self.use_lstm and memories is not None:
# Split the memories: front half for the actor, back half for the critic
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
actor_mem, critic_mem = actor_mem.contiguous(), critic_mem.contiguous()
else:
critic_mem = None
actor_mem = None
return actor_mem, critic_mem
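A worked example of the split above (illustrative sizes): the combined memory tensor holds the actor half first and the critic half second, and torch.split with memory_size // 2 recovers them; concatenating the halves restores the original tensor, which is what the methods below do on output.

import torch

memory_size = 256
memories = torch.arange(memory_size, dtype=torch.float32).reshape(1, 1, -1)
actor_mem, critic_mem = torch.split(memories, memory_size // 2, dim=-1)
assert actor_mem.shape == critic_mem.shape == (1, 1, 128)
assert torch.equal(torch.cat([actor_mem, critic_mem], dim=-1), memories)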
def critic_pass(
self,
inputs: List[torch.Tensor],
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
value_outputs, critic_mem_out = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
)
if actor_mem is not None:
# Make memories with the actor mem unchanged
memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
else:
memories_out = None
return value_outputs, memories_out
def get_stats_and_value(
self,
inputs: List[torch.Tensor],
actions: AgentAction,
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
encoding, actor_mem_outs = self.network_body(
inputs, memories=actor_mem, sequence_length=sequence_length
)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, critic_mem_outs = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
)
return log_probs, entropies, value_outputs
def get_action_stats(
self,
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
action, log_probs, entropies, actor_mem_out = super().get_action_stats(
inputs, masks=masks, memories=actor_mem, sequence_length=sequence_length
)
if critic_mem is not None:
# Reassemble memories with the critic mem unchanged
memories_out = torch.cat([actor_mem_out, critic_mem], dim=-1)
else:
memories_out = None
return action, log_probs, entropies, memories_out
def get_action_stats_and_value(
self,
inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
actor_mem, critic_mem = self._get_actor_critic_mem(memories)
encoding, actor_mem_outs = self.network_body(
inputs, memories=actor_mem, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs, critic_mem_outs = self.critic(
inputs, memories=critic_mem, sequence_length=sequence_length
)
if self.use_lstm:
mem_out = torch.cat([actor_mem_outs, critic_mem_outs], dim=-1)
else:
mem_out = None
return action, log_probs, entropies, value_outputs, mem_out
def update_normalization(self, buffer: AgentBuffer) -> None:
super().update_normalization(buffer)
self.critic.network_body.update_normalization(buffer)
class GlobalSteps(nn.Module):
