
Update SAC, fix PPO batching

/develop/unified-obs
Ervin Teng, 4 years ago
Commit 6c77ac7a
5 files changed, 27 insertions and 41 deletions
  1. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)
  2. ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changes)
  3. ml-agents/mlagents/trainers/sac/trainer.py (6 changes)
  4. ml-agents/mlagents/trainers/torch/encoders.py (2 changes)
  5. ml-agents/mlagents/trainers/torch/networks.py (2 changes)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)


  )
  returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])
- obs = ModelUtils.list_to_tensor_list(batch["obs"])
+ obs = ModelUtils.list_to_tensor_list(
+     AgentBuffer.obs_list_to_obs_batch(batch["obs"])
+ )
  act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
  if self.policy.use_continuous_act:
      actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)
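
For context on the batching fix: the buffer stores one list of observations per step, while the encoders want one batched array per observation. A minimal stand-in sketch of that transposition (batch_obs here is hypothetical; the real AgentBuffer.obs_list_to_obs_batch may differ in details):

    from typing import List

    import numpy as np


    def batch_obs(obs_per_step: List[List[np.ndarray]]) -> List[np.ndarray]:
        # obs_per_step[t][i] is observation i at timestep t; regroup so that
        # result[i] stacks observation i across all timesteps into one array.
        num_obs = len(obs_per_step[0])
        return [np.stack([step[i] for step in obs_per_step]) for i in range(num_obs)]


    # Example: 3 timesteps, each with a 4-dim vector obs and an 84x84x3 visual obs.
    steps = [[np.zeros(4), np.zeros((84, 84, 3))] for _ in range(3)]
    batched = batch_obs(steps)
    assert batched[0].shape == (3, 4) and batched[1].shape == (3, 84, 84, 3)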

ml-agents/mlagents/trainers/sac/optimizer_torch.py (54 changes)


  def forward(
      self,
-     vec_inputs: List[torch.Tensor],
-     vis_inputs: List[torch.Tensor],
+     net_inputs: List[torch.Tensor],
      actions: Optional[torch.Tensor] = None,
      memories: Optional[torch.Tensor] = None,
      sequence_length: int = 1,

"""
Performs a forward pass on the value network, which consists of a Q1 and Q2
network. Optionally does not evaluate gradients for either the Q1, Q2, or both.
:param vec_inputs: List of vector observation tensors.
:param vis_input: List of visual observation tensors.
:param net_inputs: List of observation tensors.
:param actions: For a continuous Q function (has actions), tensor of actions.
Otherwise, None.
:param memories: Initial memories if using memory. Otherwise, None.
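
To make the signature change concrete, here is a hedged sketch of what a caller now passes: one flat list of observation tensors rather than separate vector and visual lists (shapes are invented for illustration):

    import torch

    # Under the unified-obs scheme a single list carries every observation tensor,
    # whatever its rank, so the old vec_inputs/vis_inputs split disappears.
    net_inputs = [
        torch.zeros(32, 8),          # batch of 8-dim vector observations
        torch.zeros(32, 84, 84, 3),  # batch of 84x84 RGB visual observations
    ]
    # Illustrative call shape only (keyword names taken from the diff above):
    # q1_out, _ = self.q1_network(net_inputs, actions=None, memories=None, sequence_length=1)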

      if not q1_grad:
          stack.enter_context(torch.no_grad())
      q1_out, _ = self.q1_network(
-         vec_inputs,
-         vis_inputs,
+         net_inputs,
          actions=actions,
          memories=memories,
          sequence_length=sequence_length,

          stack.enter_context(torch.no_grad())
      q2_out, _ = self.q2_network(
-         vec_inputs,
-         vis_inputs,
+         net_inputs,
          actions=actions,
          memories=memories,
          sequence_length=sequence_length,
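
The q1_grad/q2_grad flags work by conditionally entering torch.no_grad() through a contextlib.ExitStack, so a head can be evaluated without building its graph. A self-contained sketch of the pattern with stand-in linear heads (illustrative, not the trainer's modules):

    from contextlib import ExitStack

    import torch
    import torch.nn as nn

    q1, q2 = nn.Linear(4, 1), nn.Linear(4, 1)


    def twin_q(x: torch.Tensor, q1_grad: bool = True, q2_grad: bool = True):
        # Each head gets its own ExitStack, entering torch.no_grad() only when
        # that head's gradient is not needed (e.g. the discrete policy update).
        with ExitStack() as stack:
            if not q1_grad:
                stack.enter_context(torch.no_grad())
            q1_out = q1(x)
        with ExitStack() as stack:
            if not q2_grad:
                stack.enter_context(torch.no_grad())
            q2_out = q2(x)
        return q1_out, q2_out


    a, b = twin_q(torch.randn(8, 4), q1_grad=False)
    assert not a.requires_grad and b.requires_grad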

  for name in self.reward_signals:
      rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
- vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
- next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
+ obs = ModelUtils.list_to_tensor_list(
+     AgentBuffer.obs_list_to_obs_batch(batch["obs"])
+ )
+ next_obs = ModelUtils.list_to_tensor_list(
+     AgentBuffer.obs_list_to_obs_batch(batch["next_obs"])
+ )
  act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
  if self.policy.use_continuous_act:
      actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)

      torch.zeros_like(next_memories) if next_memories is not None else None
  )
- vis_obs: List[torch.Tensor] = []
- next_vis_obs: List[torch.Tensor] = []
- if self.policy.use_vis_obs:
-     vis_obs = []
-     for idx, _ in enumerate(
-         self.policy.actor_critic.network_body.visual_processors
-     ):
-         vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
-         vis_obs.append(vis_ob)
-         next_vis_ob = ModelUtils.list_to_tensor(
-             batch["next_visual_obs%d" % idx]
-         )
-         next_vis_obs.append(next_vis_ob)
  # Copy normalizers from policy
  self.value_network.q1_network.network_body.copy_normalization(
      self.policy.actor_critic.network_body

      self.policy.actor_critic.network_body
  )
  (sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
-     vec_obs,
-     vis_obs,
+     obs,
      masks=act_masks,
      memories=memories,
      seq_len=self.policy.sequence_length,

-     vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
+     obs, memories, sequence_length=self.policy.sequence_length

-     vec_obs,
-     vis_obs,
+     obs,
      sampled_actions,
      memories=q_memories,
      sequence_length=self.policy.sequence_length,

-     vec_obs,
-     vis_obs,
+     obs,
      squeezed_actions,
      memories=q_memories,
      sequence_length=self.policy.sequence_length,
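
These two value_network calls evaluate Q at different actions: the freshly sampled actions feed the policy and entropy terms, while the squeezed buffer actions feed the Q loss. A compact, hedged sketch of that split for the continuous case, with stand-in modules (names and shapes are illustrative):

    import torch
    import torch.nn as nn

    obs_dim, act_dim, batch = 6, 2, 32
    q_net = nn.Linear(obs_dim + act_dim, 1)       # stand-in Q(s, a)
    policy = nn.Linear(obs_dim, act_dim)          # stand-in policy mean

    obs = torch.randn(batch, obs_dim)
    buffer_actions = torch.randn(batch, act_dim)  # actions taken at collection time

    # Q at buffer actions: drives the Bellman regression (Q loss).
    q_at_buffer = q_net(torch.cat([obs, buffer_actions], dim=-1))

    # Q at freshly sampled actions: drives the policy update, so gradients must
    # flow through the sampled actions while the buffer-action branch is untouched.
    sampled_actions = torch.tanh(policy(obs))
    q_at_sampled = q_net(torch.cat([obs, sampled_actions], dim=-1))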

  # For discrete, you don't need to backprop through the Q for the policy
  q1p_out, q2p_out = self.value_network(
-     vec_obs,
-     vis_obs,
+     obs,
      memories=q_memories,
      sequence_length=self.policy.sequence_length,
      q1_grad=False,

-     vec_obs,
-     vis_obs,
+     obs,
      memories=q_memories,
      sequence_length=self.policy.sequence_length,
  )

  with torch.no_grad():
      target_values, _ = self.target_network(
-         next_vec_obs,
-         next_vis_obs,
+         next_obs,
          memories=next_memories,
          sequence_length=self.policy.sequence_length,
      )
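
The torch.no_grad() here reflects that gradients never flow into the target network; in SAC it is instead tracked by soft (Polyak) updates from the online network. A minimal sketch of both pieces, with stand-in modules and a hypothetical tau value:

    import torch
    import torch.nn as nn

    value_network = nn.Linear(4, 1)
    target_network = nn.Linear(4, 1)


    def soft_update(source: nn.Module, target: nn.Module, tau: float = 0.005) -> None:
        # Polyak averaging: target <- tau * source + (1 - tau) * target.
        with torch.no_grad():
            for s, t in zip(source.parameters(), target.parameters()):
                t.mul_(1.0 - tau).add_(tau * s)


    with torch.no_grad():
        target_values = target_network(torch.randn(8, 4))
    soft_update(value_network, target_network)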

ml-agents/mlagents/trainers/sac/trainer.py (6 changes)


  from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
  from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
  from mlagents.trainers.torch.components.reward_providers import BaseRewardProvider
+ from mlagents.trainers.buffer import AgentBuffer
  from mlagents import tf_utils
  if tf_utils.is_available():

  # Update the normalization
  if self.is_training:
-     self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
+     obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
+         agent_buffer_trajectory["obs"]
+     )
+     self.policy.update_normalization(obs_to_normalize)
  # Evaluate all reward functions for reporting purposes
  self.collected_rewards["environment"][agent_id] += np.sum(
ml-agents/mlagents/trainers/torch/encoders.py (2 changes)


      inputs = self.normalizer(inputs)
      return inputs

- def copy_normalization(self, other_input: "VectorInput") -> None:
+ def copy_normalization(self, other_input: "InputProcessor") -> None:
      if self.normalizer is not None and other_input.normalizer is not None:
          self.normalizer.copy_from(other_input.normalizer)
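
The widened type hint lets callers hand any processor to copy_normalization; the guard makes it a no-op when either side has no normalizer, as visual processors typically don't. A self-contained sketch with a simplified normalizer (class and field names here are illustrative):

    from typing import Optional

    import torch


    class SimpleNormalizer:
        def __init__(self, size: int):
            self.mean = torch.zeros(size)
            self.var = torch.ones(size)

        def copy_from(self, other: "SimpleNormalizer") -> None:
            self.mean.copy_(other.mean)
            self.var.copy_(other.var)


    class InputProcessor:
        def __init__(self, normalizer: Optional[SimpleNormalizer] = None):
            self.normalizer = normalizer

        def copy_normalization(self, other_input: "InputProcessor") -> None:
            # No-op unless both sides carry normalization state, so vector and
            # visual processors can be treated uniformly by the caller.
            if self.normalizer is not None and other_input.normalizer is not None:
                self.normalizer.copy_from(other_input.normalizer)


    vec_a = InputProcessor(SimpleNormalizer(8))
    vec_b = InputProcessor(SimpleNormalizer(8))
    vis = InputProcessor(None)
    vec_a.copy_normalization(vec_b)  # copies stats
    vec_a.copy_normalization(vis)    # silently does nothing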

ml-agents/mlagents/trainers/torch/networks.py (2 changes)


  def copy_normalization(self, other_network: "NetworkBody") -> None:
      if self.normalize:
-         for n1, n2 in zip(self.vector_processors, other_network.vector_processors):
+         for n1, n2 in zip(self.processors, other_network.processors):
              n1.copy_normalization(n2)

  @property
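
Switching from vector_processors to a single processors list means the pairwise copy walks every encoder; the per-processor guard shown in encoders.py keeps it safe for encoders without normalizers. A tiny self-contained sketch of that walk with stand-in classes (illustrative only):

    from typing import List, Optional


    class Proc:
        def __init__(self, stats: Optional[float] = None):
            self.stats = stats  # stand-in normalizer state; None means "no normalizer"

        def copy_normalization(self, other: "Proc") -> None:
            if self.stats is not None and other.stats is not None:
                self.stats = other.stats


    class Body:
        def __init__(self, processors: List[Proc], normalize: bool = True):
            self.processors = processors
            self.normalize = normalize

        def copy_normalization(self, other_network: "Body") -> None:
            # One flat processors list covers vector and visual encoders alike;
            # processors without normalization state fall through the guard.
            if self.normalize:
                for n1, n2 in zip(self.processors, other_network.processors):
                    n1.copy_normalization(n2)


    target = Body([Proc(0.0), Proc(None)])
    source = Body([Proc(1.0), Proc(None)])
    target.copy_normalization(source)
    assert target.processors[0].stats == 1.0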
