
Optimize np -> tensor operations

/develop/add-fire
Arthur Juliani, 4 years ago
Commit 3eef9d78
3 changed files with 36 additions and 39 deletions
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (6 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (39 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 changes)
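
The whole commit swaps torch.Tensor(np.array(x)) for torch.as_tensor(x) when going NumPy -> tensor, and np.array(t.detach()) for t.detach().numpy() when going back. A minimal sketch of why the first swap saves work (the obs array below is illustrative, not taken from the repo): torch.Tensor always copies and casts to float32, while torch.as_tensor can reuse an existing float32 CPU buffer.

    import numpy as np
    import torch

    obs = np.random.rand(256, 84).astype(np.float32)

    copied = torch.Tensor(np.array(obs))  # extra NumPy copy, then a tensor copy, forced to float32
    shared = torch.as_tensor(obs)         # reuses the existing buffer (same dtype, CPU)

    shared[0, 0] = 42.0
    print(obs[0, 0])                      # 42.0 -> the tensor and the array share memory
    print(copied.dtype, shared.dtype)     # torch.float32 torch.float32

One caveat: as_tensor preserves dtype, so a float64 array would stay float64 instead of being cast to float32 the way torch.Tensor would cast it; this only matters if a buffer field is not already float32.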

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (6 changes)


 def get_trajectory_value_estimates(
     self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
 ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
-    vector_obs = [torch.Tensor(np.array(batch["vector_obs"]))]
+    vector_obs = [torch.as_tensor(batch["vector_obs"])]
-        visual_ob = torch.Tensor(np.array(batch["visual_obs%d" % idx]))
+        visual_ob = torch.as_tensor(batch["visual_obs%d" % idx])
         visual_obs.append(visual_ob)
     else:
         visual_obs = []

     next_obs = np.concatenate(next_obs, axis=-1)
-    next_obs = [torch.Tensor(next_obs).unsqueeze(0)]
+    next_obs = [torch.as_tensor(next_obs).unsqueeze(0)]
     next_memory = torch.zeros([1, 1, self.policy.m_size])
     value_estimates, mean_value = self.policy.actor_critic.critic_pass(
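
The trailing observation handled in get_trajectory_value_estimates is a single step, so it gains a leading batch dimension of one before the critic pass. A hypothetical shape walk-through (the sizes 8 and 4 are made up):

    import numpy as np
    import torch

    # next_obs arrives as one array per observation, e.g. shapes (8,) and (4,)
    next_obs = [np.zeros(8, dtype=np.float32), np.zeros(4, dtype=np.float32)]
    flat = np.concatenate(next_obs, axis=-1)      # shape (12,)
    batched = torch.as_tensor(flat).unsqueeze(0)  # shape (1, 12): a batch of one step
    print(batched.shape)                          # torch.Size([1, 12])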

ml-agents/mlagents/trainers/policy/torch_policy.py (39 changes)


 vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
 mask = None
 if not self.use_continuous_act:
-    mask = np.ones(
-        (len(decision_requests), np.sum(self.brain.vector_action_space_size)),
-        dtype=np.float32,
+    mask = torch.ones(
+        [len(decision_requests), np.sum(self.brain.vector_action_space_size)]
-        mask = 1 - np.concatenate(decision_requests.action_mask, axis=1)
+        mask = torch.as_tensor(
+            1 - np.concatenate(decision_requests.action_mask, axis=1)
+        )
 return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask

 def update_normalization(self, vector_obs: np.ndarray) -> None:
     """
-    vector_obs = torch.Tensor(vector_obs)
-    vector_obs = [vector_obs]
+    vector_obs = [torch.as_tensor(vector_obs)]
     if self.use_vec_obs and self.normalize:
         self.actor_critic.update_normalization(vector_obs)

     return actions, log_probs, entropies, value_heads, memories

 def evaluate_actions(
-    self, vec_obs, vis_obs, masks=None, actions=None, memories=None, seq_len=1
+    self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1
 ):
     dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value(
         vec_obs, vis_obs, masks, memories, seq_len

     :return: Outputs from network as defined by self.inference_dict.
     """
     vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
-    vec_obs = [torch.Tensor(vec_obs)]
-    vis_obs = [torch.Tensor(vis_ob) for vis_ob in vis_obs]
-    memories = torch.Tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(0)
+    vec_obs = [torch.as_tensor(vec_obs)]
+    vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vis_obs]
+    memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
+        0
+    )
     if masks is not None:
         masks = torch.Tensor(masks)
-    run_out["action"] = np.array(action.detach())
-    run_out["pre_action"] = np.array(
-        action.detach()
-    )  # Todo - make pre_action difference
-    run_out["log_probs"] = np.array(log_probs.detach())
-    run_out["entropy"] = np.array(entropy.detach())
+    run_out["action"] = action.detach().numpy()
+    run_out["pre_action"] = action.detach().numpy()
+    # Todo - make pre_action difference
+    run_out["log_probs"] = log_probs.detach().numpy()
+    run_out["entropy"] = entropy.detach().numpy()
-        name: np.array(t.detach()) for name, t in value_heads.items()
+        name: t.detach().numpy() for name, t in value_heads.items()
-    run_out["memories"] = np.array(memories.detach())
+    run_out["memories"] = memories.detach().numpy()
     self.actor_critic.update_normalization(vec_obs)
     return run_out
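
The run_out changes go the other direction, tensor -> NumPy. A small illustrative sketch of the difference: np.array(t.detach()) makes an independent copy, while t.detach().numpy() returns a view over the same storage (CPU tensors only).

    import numpy as np
    import torch

    t = torch.randn(3, requires_grad=True)

    copied = np.array(t.detach())  # independent copy of the data
    view = t.detach().numpy()      # NumPy view sharing memory with t

    with torch.no_grad():
        t += 1.0
    print(copied[0] == view[0])    # False: only the view sees the in-place update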

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (30 changes)


 from typing import Any, Dict
 import numpy as np
 import torch
 from mlagents.trainers.buffer import AgentBuffer

 value_losses = []
 for name, head in values.items():
-    old_val_tensor = torch.Tensor(old_values[name])
-    returns_tensor = torch.Tensor(returns[name])
+    old_val_tensor = old_values[name]
+    returns_tensor = returns[name]
     clipped_value_estimate = old_val_tensor + torch.clamp(
         head - old_val_tensor, -decay_epsilon, decay_epsilon
     )

 :param log_probs: Current policy probabilities
 :param old_log_probs: Past policy probabilities
 """
-advantage = torch.Tensor(advantages).unsqueeze(-1)
-old_log_probs = torch.Tensor(old_log_probs)
+advantage = advantages.unsqueeze(-1)
 decay_epsilon = self.trainer_params["epsilon"]

 returns = {}
 old_values = {}
 for name in self.reward_signals:
-    old_values[name] = np.array(batch["{}_value_estimates".format(name)])
-    returns[name] = np.array(batch["{}_returns".format(name)])
+    old_values[name] = torch.as_tensor(batch["{}_value_estimates".format(name)])
+    returns[name] = torch.as_tensor(batch["{}_returns".format(name)])
-vec_obs = [torch.Tensor(np.array(batch["vector_obs"]))]
-act_masks = torch.Tensor(np.array(batch["action_mask"]))
+vec_obs = [torch.as_tensor(batch["vector_obs"])]
+act_masks = torch.as_tensor(batch["action_mask"])
-    actions = torch.Tensor(np.array(batch["actions"])).unsqueeze(-1)
+    actions = torch.as_tensor(batch["actions"]).unsqueeze(-1)
-    actions = torch.Tensor(np.array(batch["actions"]))
+    actions = torch.as_tensor(batch["actions"])
-    torch.Tensor(np.array(batch["memory"][i]))
+    torch.as_tensor(batch["memory"][i])
     for i in range(0, len(batch["memory"]), self.policy.sequence_length)
 ]
 if len(memories) > 0:

 for idx, _ in enumerate(
     self.policy.actor_critic.network_body.visual_encoders
 ):
-    vis_ob = torch.Tensor(np.array(batch["visual_obs%d" % idx]))
+    vis_ob = torch.as_tensor(batch["visual_obs%d" % idx])
     vis_obs.append(vis_ob)
 else:
     vis_obs = []

 )
 value_loss = self.ppo_value_loss(values, old_values, returns)
 policy_loss = self.ppo_policy_loss(
-    np.array(batch["advantages"]),
+    torch.as_tensor(batch["advantages"]),
-    np.array(batch["action_probs"]),
-    np.array(batch["masks"], dtype=np.uint32),
+    torch.as_tensor(batch["action_probs"]),
+    torch.as_tensor(batch["masks"], dtype=torch.int32),
 )
 loss = (
     policy_loss
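
With old_values, returns, and advantages kept as tensors, the PPO losses no longer round-trip through NumPy; the masks dtype also moves from np.uint32 to torch.int32, presumably because torch offers no unsigned 32-bit dtype. Below is a simplified, hypothetical version of the clipped value term visible in the hunk above; the real ppo_value_loss in this file loops over all reward-signal value heads rather than a single one.

    import torch

    def clipped_value_loss(head, old_val, returns, epsilon):
        # Keep the new value estimate within epsilon of the old one, PPO-style.
        clipped = old_val + torch.clamp(head - old_val, -epsilon, epsilon)
        unclipped_err = (returns - head) ** 2
        clipped_err = (returns - clipped) ** 2
        return torch.max(unclipped_err, clipped_err).mean()

    head = torch.randn(64)
    old_val = torch.randn(64)
    returns = torch.randn(64)
    print(clipped_value_loss(head, old_val, returns, epsilon=0.2))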
