
Fix bug in probs calculation

/develop/add-fire
Arthur Juliani, 4 years ago
Current commit: b7be7f04
3 changed files with 17 additions and 9 deletions
  1. ml-agents/mlagents/trainers/models_torch.py (2 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (22 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (2 changes)

ml-agents/mlagents/trainers/models_torch.py (2 changes)


 log_probs = []
 entropies = []
 for idx, action_dist in enumerate(dists):
-    action = actions[:, idx]
+    action = actions[..., idx]
     log_probs.append(action_dist.log_prob(action))
     entropies.append(action_dist.entropy())
 log_probs = torch.stack(log_probs, dim=-1)
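
The fix swaps basic slicing of the second axis for Ellipsis indexing of the last axis. A minimal sketch (not from the repo; shapes are assumed for illustration) of why the two differ once the actions tensor carries an extra leading dimension:

import torch

actions = torch.randint(0, 2, (4, 3))               # (batch, num_branches)
assert torch.equal(actions[:, 1], actions[..., 1])  # identical for 2-D input

# With an extra leading axis (e.g. a sequence dimension of length 5),
# "[:, idx]" slices the batch axis, while "[..., idx]" still selects the
# action-branch axis, which is what the per-branch log-prob loop needs.
seq_actions = torch.randint(0, 2, (5, 4, 3))         # (seq, batch, num_branches)
print(seq_actions[:, 1].shape)    # torch.Size([5, 3])
print(seq_actions[..., 1].shape)  # torch.Size([5, 4])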

ml-agents/mlagents/trainers/policy/torch_policy.py (22 changes)


     if self.use_vec_obs and self.normalize:
         self.actor_critic.update_normalization(vector_obs)

-    def execute_model(
-        self, vec_obs, vis_obs, masks=None, actions=None, memories=None, seq_len=1
-    ):
+    def sample_actions(self, vec_obs, vis_obs, masks=None, memories=None, seq_len=1):
-        ), new_memories = self.actor_critic.get_dist_and_value(
+        ), memories = self.actor_critic.get_dist_and_value(
-        if actions is None:
-            actions = self.actor_critic.sample_action(dists)
+        actions = self.actor_critic.sample_action(dists)
         log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
         if self.act_type == "continuous":
             actions.squeeze_(-1)

+    def evaluate_actions(
+        self, vec_obs, vis_obs, masks=None, actions=None, memories=None, seq_len=1
+    ):
+        dists, (value_heads, mean_value), _ = self.actor_critic.get_dist_and_value(
+            vec_obs, vis_obs, masks, memories, seq_len
+        )
+        log_probs, entropies = self.actor_critic.get_probs_and_entropy(actions, dists)
+        return log_probs, entropies, value_heads

     @timed
     def evaluate(
         self, decision_requests: DecisionSteps, global_agent_ids: List[str]

         if masks is not None:
             masks = torch.Tensor(masks)
         run_out = {}
-        action, log_probs, entropy, value_heads, memories = self.execute_model(
+        action, log_probs, entropy, value_heads, memories = self.sample_actions(
             vec_obs, vis_obs, masks=masks, memories=memories
         )
         run_out["action"] = np.array(action.detach())

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (2 changes)


                 vis_obs.append(vis_ob)
         else:
             vis_obs = []
-        _, log_probs, entropy, values, _ = self.policy.execute_model(
+        log_probs, entropy, values = self.policy.evaluate_actions(
             vec_obs,
             vis_obs,
             masks=act_masks,
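
In the optimizer, the stored batch is now re-evaluated through evaluate_actions, which returns exactly the tensors the PPO update needs: log-probs, entropy, and value estimates. A rough, self-contained sketch (hyperparameters and dummy data are assumptions, not taken from the repo) of how the returned log-probs typically enter the clipped surrogate loss:

import torch

def clipped_policy_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    # Standard PPO clipped surrogate; all inputs are 1-D tensors over the batch.
    ratio = torch.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -torch.min(unclipped, clipped).mean()

loss = clipped_policy_loss(
    new_log_probs=torch.randn(8),
    old_log_probs=torch.randn(8),
    advantages=torch.randn(8),
)
print(loss)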
