
Update GAIL and BC

/develop/unified-obs
Ervin Teng, 4 years ago
Current commit: 79a3051e
3 files changed, 24 insertions(+), 64 deletions(-)
  1. ml-agents/mlagents/trainers/demo_loader.py (6 changes)
  2. ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)
  3. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changes)

ml-agents/mlagents/trainers/demo_loader.py (6 changes)


 from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
     AgentInfoActionPairProto,
 )
-from mlagents.trainers.trajectory import SplitObservations
 from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

         demo_raw_buffer["done"].append(next_done)
         demo_raw_buffer["rewards"].append(next_reward)
-        split_obs = SplitObservations.from_observations(current_obs)
-        for i, obs in enumerate(split_obs.visual_observations):
-            demo_raw_buffer["visual_obs%d" % i].append(obs)
-        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
+        demo_raw_buffer["obs"].append(current_obs)
         demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
         demo_raw_buffer["prev_action"].append(previous_action)
         if next_done:
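The net effect in demo_loader.py: instead of splitting each demonstration step into "vector_obs" and per-index "visual_obs%d" buffer fields via SplitObservations, the whole observation list for the step is appended under a single "obs" key. Below is a minimal sketch of the two buffer layouts; the sensor shapes (one camera plus one vector sensor) are assumptions for illustration, not part of the commit.

import numpy as np

# One demonstration step with an assumed sensor layout: a camera and a vector sensor.
current_obs = [np.zeros((84, 84, 3), dtype=np.float32), np.zeros(8, dtype=np.float32)]

# Old layout: observations split per modality into separate buffer fields.
old_buffer = {"visual_obs0": [current_obs[0]], "vector_obs": [current_obs[1]]}

# New layout: the full per-step observation list goes under one "obs" field.
new_buffer = {"obs": [current_obs]}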

ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)


 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.demo_loader import demo_to_buffer
+from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
 from mlagents.trainers.torch.utils import ModelUtils

"""
Helper function for update_batch.
"""
vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
obs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch_demo["obs"]), dtype=torch.float
)
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])

         if self.policy.use_recurrent:
             memories = torch.zeros(1, self.n_sequences, self.policy.m_size)
-        if self.policy.use_vis_obs:
-            vis_obs = []
-            for idx, _ in enumerate(
-                self.policy.actor_critic.network_body.visual_processors
-            ):
-                vis_ob = ModelUtils.list_to_tensor(
-                    mini_batch_demo["visual_obs%d" % idx]
-                )
-                vis_obs.append(vis_ob)
-        else:
-            vis_obs = []
         (
             selected_actions,
             clipped_actions,

         ) = self.policy.sample_actions(
-            vec_obs,
-            vis_obs,
+            obs,
             masks=act_masks,
             memories=memories,
             seq_len=self.policy.sequence_length,
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (62 changes)


"""
Creates the observation input.
"""
n_vis = len(self.encoder.visual_processors)
n_vec = len(self.encoder.vector_processors)
vec_inputs = (
[ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)]
if n_vec > 0
else []
net_inputs = ModelUtils.list_to_tensor_list(
AgentBuffer.obs_list_to_obs_batch(mini_batch["obs"]), dtype=torch.float
vis_inputs = [
ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
for i in range(n_vis)
]
return vec_inputs, vis_inputs
return net_inputs
     def compute_estimate(
         self, mini_batch: AgentBuffer, use_vail_noise: bool = False

         :param use_vail_noise: Only when using VAIL : If true, will sample the code, if
         false, will return the mean of the code.
         """
-        vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
+        net_inputs = self.get_state_inputs(mini_batch)
-            hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
+            hidden, _ = self.encoder(net_inputs, action_inputs)
-            hidden, _ = self.encoder(vec_inputs, vis_inputs)
+            hidden, _ = self.encoder(net_inputs)
         z_mu: Optional[torch.Tensor] = None
         if self._settings.use_vail:
             z_mu = self._z_mu_layer(hidden)
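On the VAIL branch, use_vail_noise decides whether the discriminator code is sampled around z_mu or taken as its mean, per the :param description above. The sampling line itself is outside the displayed hunk; the sketch below only illustrates that described behavior with a generic reparameterization, where z_sigma stands in for whatever scale the provider uses.

import torch

def vail_code(z_mu: torch.Tensor, z_sigma: torch.Tensor, use_vail_noise: bool) -> torch.Tensor:
    # use_vail_noise=True: sample the code around the mean; False: return the mean.
    noise = torch.randn_like(z_mu) if use_vail_noise else torch.zeros_like(z_mu)
    return z_mu + z_sigma * noise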

         Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp.
         for off-policy. Compute gradients w.r.t randomly interpolated input.
         """
-        policy_vec_inputs, policy_vis_inputs = self.get_state_inputs(policy_batch)
-        expert_vec_inputs, expert_vis_inputs = self.get_state_inputs(expert_batch)
-        interp_vec_inputs = []
-        for policy_vec_input, expert_vec_input in zip(
-            policy_vec_inputs, expert_vec_inputs
-        ):
-            obs_epsilon = torch.rand(policy_vec_input.shape)
-            interp_vec_input = (
-                obs_epsilon * policy_vec_input + (1 - obs_epsilon) * expert_vec_input
-            )
-            interp_vec_input.requires_grad = True  # For gradient calculation
-            interp_vec_inputs.append(interp_vec_input)
-        interp_vis_inputs = []
-        for policy_vis_input, expert_vis_input in zip(
-            policy_vis_inputs, expert_vis_inputs
-        ):
-            obs_epsilon = torch.rand(policy_vis_input.shape)
-            interp_vis_input = (
-                obs_epsilon * policy_vis_input + (1 - obs_epsilon) * expert_vis_input
-            )
-            interp_vis_input.requires_grad = True  # For gradient calculation
-            interp_vis_inputs.append(interp_vis_input)
+        policy_inputs = self.get_state_inputs(policy_batch)
+        expert_inputs = self.get_state_inputs(expert_batch)
+        interp_inputs = []
+        for policy_input, expert_input in zip(policy_inputs, expert_inputs):
+            obs_epsilon = torch.rand(policy_input.shape)
+            interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input
+            interp_input.requires_grad = True  # For gradient calculation
+            interp_inputs.append(interp_input)
         if self._settings.use_actions:
             policy_action = self.get_action_input(policy_batch)
             expert_action = self.get_action_input(expert_batch)

                 dim=1,
             )
             action_inputs.requires_grad = True
-            hidden, _ = self.encoder(
-                interp_vec_inputs, interp_vis_inputs, action_inputs
-            )
-            encoder_input = tuple(
-                interp_vec_inputs + interp_vis_inputs + [action_inputs]
-            )
+            hidden, _ = self.encoder(interp_inputs, action_inputs)
+            encoder_input = tuple(interp_inputs + [action_inputs])
-            hidden, _ = self.encoder(interp_vec_inputs, interp_vis_inputs)
-            encoder_input = tuple(interp_vec_inputs + interp_vis_inputs)
+            hidden, _ = self.encoder(interp_inputs)
+            encoder_input = tuple(interp_inputs)
         if self._settings.use_vail:
             use_vail_noise = True
             z_mu = self._z_mu_layer(hidden)
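For context on the hunk above: the cited paper (arXiv:1704.00028, WGAN-GP) penalizes the discriminator's gradient norm at points interpolated between policy and expert samples. The diff only changes how those interpolated inputs are built; the sketch below is the standard penalty recipe with a generic single-tensor discriminator, not the provider's exact code.

import torch

def gradient_penalty(discriminator, policy_obs: torch.Tensor, expert_obs: torch.Tensor) -> torch.Tensor:
    # Interpolate between policy and expert samples (same idea as interp_inputs above).
    eps = torch.rand(policy_obs.shape[0], 1)
    interp = eps * policy_obs + (1 - eps) * expert_obs
    interp.requires_grad = True
    estimate = discriminator(interp).sum()
    # Penalize deviation of the gradient norm from 1, per the paper.
    grad = torch.autograd.grad(estimate, interp, create_graph=True)[0]
    return torch.mean((grad.norm(2, dim=-1) - 1.0) ** 2)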
