
Less broken PPO

/develop/unified-obs
Ervin Teng, 4 years ago
Current commit: 95bdbba3
6 files changed, 44 insertions and 54 deletions
  1. ml-agents/mlagents/trainers/buffer.py (10 changes)
  2. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (30 changes)
  3. ml-agents/mlagents/trainers/policy/torch_policy.py (7 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (6 changes)
  5. ml-agents/mlagents/trainers/torch/networks.py (34 changes)
  6. ml-agents/mlagents/trainers/torch/utils.py (11 changes)

ml-agents/mlagents/trainers/buffer.py (10 changes)


            return len(next(iter(self.values())))
        else:
            return 0
+   @staticmethod
+   def obs_list_to_obs_batch(obs_list: List[List[np.ndarray]]) -> List[np.ndarray]:
+       """
+       Converts a List of obs (each obs itself consisting of a List of np.ndarray)
+       into a List of np.ndarray, with the observations batched along the first dimension.
+       """
+       # Transpose the List of Lists, then stack each observation into one batched array
+       new_list = list(map(lambda x: np.asanyarray(list(x)), zip(*obs_list)))
+       return new_list
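As a quick illustration of the new helper (shapes below are invented for this note, not taken from the repo), obs_list_to_obs_batch transposes a trajectory's per-step observation lists so that each observation becomes one batched array:

    import numpy as np
    # three steps, each observing a 4-dim vector and an 84x84x3 visual observation
    obs_list = [
        [np.zeros(4), np.zeros((84, 84, 3))],
        [np.ones(4), np.ones((84, 84, 3))],
        [np.zeros(4), np.zeros((84, 84, 3))],
    ]
    batched = AgentBuffer.obs_list_to_obs_batch(obs_list)
    # batched[0].shape == (3, 4); batched[1].shape == (3, 84, 84, 3)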

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (30 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.components.bc.module import BCModule
from mlagents.trainers.torch.components.reward_providers import create_reward_provider

    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
-       vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
-       if self.policy.use_vis_obs:
-           visual_obs = []
-           for idx, _ in enumerate(
-               self.policy.actor_critic.network_body.visual_processors
-           ):
-               visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
-               visual_obs.append(visual_ob)
-       else:
-           visual_obs = []
+       obs = ModelUtils.list_to_tensor_list(
+           AgentBuffer.obs_list_to_obs_batch(batch["obs"])
+       )
+       next_obs = ModelUtils.list_to_tensor_list(
+           AgentBuffer.obs_list_to_obs_batch(batch["next_obs"])
+       )
-       vec_vis_obs = SplitObservations.from_observations(next_obs)
-       next_vec_obs = [
-           ModelUtils.list_to_tensor(vec_vis_obs.vector_observations).unsqueeze(0)
-       ]
-       next_vis_obs = [
-           ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
-           for _vis_ob in vec_vis_obs.visual_observations
-       ]
-           vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
+           obs, memory, sequence_length=batch.num_experiences
-           next_vec_obs, next_vis_obs, next_memory, sequence_length=1
+           next_obs, next_memory, sequence_length=1
        )
        for name, estimate in value_estimates.items():
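Roughly, the unified path builds the value-estimation inputs as in this sketch (a single invented 8-dim observation, assuming the same imports as the file above):

    # batch["obs"] holds one List[np.ndarray] per step; three steps here
    step_obs = [[np.random.rand(8).astype(np.float32)] for _ in range(3)]
    obs = ModelUtils.list_to_tensor_list(AgentBuffer.obs_list_to_obs_batch(step_obs))
    # obs is a list containing one tensor of shape (3, 8), passed on with
    # memory and sequence_length=batch.num_experiences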

ml-agents/mlagents/trainers/policy/torch_policy.py (7 changes)


    def _split_decision_step(
        self, decision_requests: DecisionSteps
    ) -> Tuple[SplitObservations, np.ndarray]:
-       obs = ModelUtils.list_to_tensor_list(decision_requests.obs, )
+       obs = ModelUtils.list_to_tensor_list(decision_requests.obs)
        mask = None
        if not self.use_continuous_act:
            mask = torch.ones([len(decision_requests), np.sum(self.act_size)])

        )
        return obs, mask
-   def update_normalization(self, obs: List[np.ndarray]) -> None:
+   def update_normalization(self, vector_obs: List[np.ndarray]) -> None:
-       all_obs = tuple(ModelUtils.list_to_tensor(_obs) for _obs in obs)
-       print(all_obs)
+       all_obs = ModelUtils.list_to_tensor_list(vector_obs)
        if self.use_vec_obs and self.normalize:
            self.actor_critic.update_normalization(all_obs)

ml-agents/mlagents/trainers/ppo/trainer.py (6 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (

        agent_buffer_trajectory = trajectory.to_agentbuffer()
        # Update the normalization
        if self.is_training:
-           self.policy.update_normalization(agent_buffer_trajectory["obs"])
+           obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(
+               agent_buffer_trajectory["obs"]
+           )
+           self.policy.update_normalization(obs_to_normalize)
        # Get all value estimates
        value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
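Taken together with the torch_policy.py change above, normalization now receives batched arrays rather than raw per-step observation lists. A rough sketch with invented shapes:

    # two steps, each observing a single 3-dim vector
    traj_obs = [[np.array([1.0, 2.0, 3.0])], [np.array([4.0, 5.0, 6.0])]]
    obs_to_normalize = AgentBuffer.obs_list_to_obs_batch(traj_obs)
    # -> [array of shape (2, 3)]; the policy converts each entry to a tensor via
    # ModelUtils.list_to_tensor_list before updating the actor-critic normalizers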

ml-agents/mlagents/trainers/torch/networks.py (34 changes)


class Actor(abc.ABC):
    @abc.abstractmethod
-   def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
+   def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
        """
        Updates normalization of Actor based on the provided List of vector obs.
        :param vector_obs: A List of vector obs as tensors.

    @abc.abstractmethod
    def get_dists(
        self,
-       vec_inputs: List[torch.Tensor],
-       vis_inputs: List[torch.Tensor],
+       net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,

    @abc.abstractmethod
    def forward(
        self,
-       vec_inputs: List[torch.Tensor],
-       vis_inputs: List[torch.Tensor],
+       net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, int, int, int, int]:

    @abc.abstractmethod
    def critic_pass(
        self,
-       vec_inputs: List[torch.Tensor],
-       vis_inputs: List[torch.Tensor],
+       net_inputs: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

    @abc.abstractmethod
    def get_dist_and_value(
        self,
-       vec_inputs: List[torch.Tensor],
-       vis_inputs: List[torch.Tensor],
+       net_inputs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,

    def memory_size(self) -> int:
        return self.network_body.memory_size

-   def update_normalization(self, obs: List[torch.Tensor]) -> None:
-       self.network_body.update_normalization(obs)
+   def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
+       self.network_body.update_normalization(net_inputs)

    def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
        actions = []

    def critic_pass(
        self,
-       vec_inputs: List[torch.Tensor],
-       vis_inputs: List[torch.Tensor],
+       net_inputs: List[torch.Tensor],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

            actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, -1)
        value_outputs, critic_mem_out = self.critic(
-           vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
+           net_inputs, memories=critic_mem, sequence_length=sequence_length
        )
        if actor_mem is not None:
            # Make memories with the actor mem unchanged

            critic_mem = None
            actor_mem = None
        dists, actor_mem_outs = self.get_dists(
-           net_inputs,
-           memories=actor_mem,
-           sequence_length=sequence_length,
-           masks=masks,
+           net_inputs, memories=actor_mem, sequence_length=sequence_length, masks=masks
        )
        value_outputs, critic_mem_outs = self.critic(
            net_inputs, memories=critic_mem, sequence_length=sequence_length

        mem_out = None
        return dists, value_outputs, mem_out

-   def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
-       super().update_normalization(vector_obs)
-       self.critic.network_body.update_normalization(vector_obs)
+   def update_normalization(self, net_inputs: List[torch.Tensor]) -> None:
+       super().update_normalization(net_inputs)
+       self.critic.network_body.update_normalization(net_inputs)

class GlobalSteps(nn.Module):
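The net effect of these renames is that the actor/critic entry points take a single list of observation tensors (net_inputs) instead of parallel vec_inputs/vis_inputs lists. A rough usage sketch, with invented shapes and an assumed actor_critic instance:

    import torch
    # one 8-dim vector sensor and one 84x84x3 visual sensor, batch size 32
    net_inputs = [torch.zeros(32, 8), torch.zeros(32, 84, 84, 3)]
    value_outputs, critic_mem_out = actor_critic.critic_pass(net_inputs, sequence_length=1)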

ml-agents/mlagents/trainers/torch/utils.py (11 changes)


        h_size: int,
        vis_encode_type: EncoderType,
        normalize: bool = False,
-   ) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
+   ) -> Tuple[nn.ModuleList, int]:
        """
        Creates visual and vector encoders, along with their normalizers.
        :param observation_shapes: List of Tuples that represent the action dimensions.

        )
        # Total output size for all inputs + CNNs
-       return (
-           nn.ModuleList(encoders),
-           total_encoded_size,
-       )
+       return (nn.ModuleList(encoders), total_encoded_size)

    @staticmethod
    def list_to_tensor(

        Converts a list of numpy arrays into a list of tensors. MUCH faster than
        calling as_tensor on the list directly.
        """
-       return [torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list]
+       return [
+           torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list
+       ]

    @staticmethod
    def to_numpy(tensor: torch.Tensor) -> np.ndarray:
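For reference, a small usage sketch of list_to_tensor_list (array shapes invented): each numpy array in the input list becomes its own tensor, and going through np.asanyarray first mirrors the docstring's point that this is much faster than handing a Python list to as_tensor directly.

    arrs = [np.zeros((32, 8), dtype=np.float32), np.zeros((32, 84, 84, 3), dtype=np.float32)]
    tensors = ModelUtils.list_to_tensor_list(arrs)
    # -> [tensor of shape (32, 8), tensor of shape (32, 84, 84, 3)]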
