
Add memories and sequence length to critic_pass

/develop/add-fire/sac-lst
Ervin Teng, 4 years ago
Current commit: f4da3592
2 files changed, 21 insertions(+), 46 deletions(-)
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (42 changes)
  2. ml-agents/mlagents/trainers/torch/networks.py (25 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (42 changes)

 from typing import Dict, Optional, Tuple, List
 import torch
 import numpy as np
-from mlagents_envs.base_env import DecisionSteps
 from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.components.bc.module import BCModule
 
 from mlagents.trainers.optimizer import Optimizer
 from mlagents.trainers.settings import TrainerSettings
-from mlagents.trainers.trajectory import SplitObservations
 from mlagents.trainers.torch.utils import ModelUtils
 
                 reward_signal, self.policy.behavior_spec, settings
             )
 
-    def get_value_estimates(
-        self, decision_requests: DecisionSteps, idx: int, done: bool
-    ) -> Dict[str, float]:
-        """
-        Generates value estimates for bootstrapping.
-        :param decision_requests:
-        :param idx: Index in BrainInfo of agent.
-        :param done: Whether or not this is the last element of the episode,
-        in which case the value estimate will be 0.
-        :return: The value estimate dictionary with key being the name of the reward signal
-        and the value the corresponding value estimate.
-        """
-        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
-        value_estimates = self.policy.actor_critic.critic_pass(
-            np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
-            np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
-        )
-        value_estimates = {k: float(v) for k, v in value_estimates.items()}
-        # If we're done, reassign all of the value estimates that need terminal states.
-        if done:
-            for k in value_estimates:
-                if not self.reward_signals[k].ignore_done:
-                    value_estimates[k] = 0.0
-        return value_estimates
 
     def get_trajectory_value_estimates(
         self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
     ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:

         else:
             visual_obs = []
 
-        memory = torch.zeros([1, len(vector_obs[0]), self.policy.m_size])
-        next_memory = torch.zeros([1, 1, self.policy.m_size])
+        memory = torch.zeros([1, 1, self.policy.m_size])
 
-        value_estimates = self.policy.actor_critic.critic_pass(
-            vector_obs, visual_obs, memory
-        )
+        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
+            vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
+        )
 
-        next_value_estimate = self.policy.actor_critic.critic_pass(
-            next_obs, next_obs, next_memory
-        )
+        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
+            next_obs, next_obs, next_memory, sequence_length=1
+        )
 
         for name, estimate in value_estimates.items():

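Taken together, the new flow in get_trajectory_value_estimates is: start from a zeroed memory, run the critic once over the whole trajectory, then reuse the memory it returns to bootstrap the observation that follows the trajectory. A minimal sketch of that pattern, using a stand-in critic_pass with the signature this commit introduces (the stand-in's internals and the "extrinsic" key are invented for illustration):

    import torch

    # Stand-in for a recurrent critic with the new signature:
    # (obs, memories, sequence_length) -> (value_estimates, memories_out).
    def critic_pass(obs, memories, sequence_length=1):
        values = {"extrinsic": obs.sum(dim=-1)}  # one value per step
        return values, memories + 1.0  # pretend the LSTM advanced its state

    m_size = 4
    trajectory_obs = torch.randn(10, 3)  # 10 experiences, 3 features each
    next_obs = torch.randn(1, 3)         # observation after the trajectory

    # Start from a zeroed memory and run over the whole trajectory...
    memory = torch.zeros([1, 1, m_size])
    value_estimates, next_memory = critic_pass(
        trajectory_obs, memory, sequence_length=10
    )
    # ...then bootstrap the next observation from where the critic left off.
    next_value_estimate, _ = critic_pass(next_obs, next_memory, sequence_length=1)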
ml-agents/mlagents/trainers/torch/networks.py (25 changes)

     def critic_pass(
         self,
         vec_inputs: List[torch.Tensor],
         vis_inputs: List[torch.Tensor],
         memories: Optional[torch.Tensor] = None,
-    ) -> Dict[str, torch.Tensor]:
+        sequence_length: int = 1,
+    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
         """
         Get value outputs for the given obs.
         :param vec_inputs: List of vector inputs as tensors.

     def critic_pass(
         self,
         vec_inputs: List[torch.Tensor],
         vis_inputs: List[torch.Tensor],
         memories: Optional[torch.Tensor] = None,
-    ) -> Dict[str, torch.Tensor]:
-        encoding, _ = self.network_body(vec_inputs, vis_inputs, memories=memories)
-        return self.value_heads(encoding)
+        sequence_length: int = 1,
+    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
+        encoding, memories_out = self.network_body(
+            vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+        )
+        return self.value_heads(encoding), memories_out
 
     def get_dist_and_value(
         self,
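The reason sequence_length now gets threaded through to the network body: a recurrent body receives the batch flattened to [batch * sequence] rows and needs the sequence length to fold them back into sequences before the LSTM. An illustrative sketch of that fold, assuming a plain PyTorch LSTM (this mimics the idea, not the actual NetworkBody internals, and all sizes are invented):

    import torch
    from torch import nn

    hidden_size, mem_half = 8, 8
    lstm = nn.LSTM(hidden_size, mem_half, batch_first=True)

    flat_encoding = torch.randn(20, hidden_size)  # 20 steps from the buffer
    sequence_length = 5                           # -> 4 sequences of 5 steps

    # Fold the flat batch into sequences, run the LSTM, flatten back.
    seq = flat_encoding.reshape(-1, sequence_length, hidden_size)  # [4, 5, 8]
    h0 = torch.zeros(1, seq.shape[0], mem_half)   # initial hidden state
    c0 = torch.zeros(1, seq.shape[0], mem_half)   # initial cell state
    output, (h_n, c_n) = lstm(seq, (h0, c0))
    encoding = output.reshape(-1, mem_half)       # back to [20, mem_half]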

     def critic_pass(
         self,
         vec_inputs: List[torch.Tensor],
         vis_inputs: List[torch.Tensor],
         memories: Optional[torch.Tensor] = None,
-    ) -> Dict[str, torch.Tensor]:
-        _, critic_mem = torch.split(memories, self.half_mem_size, -1)
-        value_outputs, _memories = self.critic(
-            vec_inputs, vis_inputs, memories=critic_mem
-        )
-        return value_outputs
+        sequence_length: int = 1,
+    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
+        actor_mem, critic_mem = torch.split(memories, self.half_mem_size, -1)
+        value_outputs, critic_mem_out = self.critic(
+            vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
+        )
+        # Make memories with the actor mem unchanged
+        memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)
+        return value_outputs, memories_out
 
     def get_dist_and_value(
         self,

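In the separate actor-critic above, a single memory tensor carries both networks' recurrent state side by side: critic_pass advances only the critic's half and passes the actor's half through untouched. A runnable sketch of that split/concat pattern (half_mem_size and the fake critic update are stand-ins for the class attribute and the real critic call):

    import torch

    half_mem_size = 8
    # One tensor holding [actor_mem | critic_mem] along the last dim.
    memories = torch.randn(1, 1, 2 * half_mem_size)

    # Split into the two halves, as in the diff above.
    actor_mem, critic_mem = torch.split(memories, half_mem_size, -1)

    critic_mem_out = critic_mem + 1.0  # pretend the critic's LSTM advanced

    # Recombine with the actor half unchanged.
    memories_out = torch.cat([actor_mem, critic_mem_out], dim=-1)

    assert torch.equal(memories_out[..., :half_mem_size], actor_mem)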