
[bug-fix] Change entropy computation and loss reporting in Torch to match TF (#4538)

* Proper dimensions for entropy, sum before bonus in PPO

* Make entropy reporting same as TF

* Always use separate critic

* Revert to shared

* Remove unneeded extra line

* Change entropy shape in test

* Change another entropy shape

* Add entropy summing to evaluate_actions

* Add notes about torch.abs(policy_loss)
Branch: /MLA-1734-demo-provider
GitHub · 4 years ago
Current commit e0ef30a5
4 files changed, 18 insertions(+), 8 deletions(-)
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (15 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)
  3. ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changes)
  4. ml-agents/mlagents/trainers/torch/distributions.py (3 changes)

ml-agents/mlagents/trainers/policy/torch_policy.py (15 changes)

             actions = actions[:, :, 0]
         else:
             actions = actions[:, 0, :]
-        return (actions, all_logs if all_log_probs else log_probs, entropies, memories)
+        # Use the sum of entropy across actions, not the mean
+        entropy_sum = torch.sum(entropies, dim=1)
+        return (
+            actions,
+            all_logs if all_log_probs else log_probs,
+            entropy_sum,
+            memories,
+        )

     def evaluate_actions(
         self,

         )
         action_list = [actions[..., i] for i in range(actions.shape[-1])]
         log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
-        return log_probs, entropies, value_heads
+        # Use the sum of entropy across actions, not the mean
+        entropy_sum = torch.sum(entropies, dim=1)
+        return log_probs, entropy_sum, value_heads

     @timed
     def evaluate(
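
The new return value sums the per-action entropies along dim=1, so a policy with multiple action branches reports a single entropy value per agent, as the TensorFlow trainer does. Below is a standalone sketch of the difference; the batch size, branch count, and Categorical setup are illustrative and not taken from the ML-Agents code.

import torch
from torch.distributions import Categorical

# Per-branch entropies for a batch of 64 agents with 3 discrete action branches
# of 5 choices each: shape (batch, num_branches).
logits = torch.randn(64, 3, 5)
entropies = Categorical(logits=logits).entropy()  # torch.Size([64, 3])

entropy_sum = torch.sum(entropies, dim=1)    # (64,): one value per agent, as returned after this change
entropy_mean = torch.mean(entropies, dim=1)  # (64,): the smaller per-branch average a plain mean would give

print(entropies.shape, entropy_sum.shape, entropy_mean.shape)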

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)

         self.optimizer.step()
         update_stats = {
-            "Losses/Policy Loss": policy_loss.item(),
+            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
+            # TODO: After PyTorch is default, change to something more correct.
+            "Losses/Policy Loss": torch.abs(policy_loss).item(),
             "Losses/Value Loss": value_loss.item(),
             "Policy/Learning Rate": decay_lr,
             "Policy/Epsilon": decay_eps,

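For context, a minimal sketch of how the summed entropy and the reported stats fit into a PPO update step; the variable names, loss weighting, and beta value below are placeholders rather than the actual ML-Agents optimizer code. Because PPO maximizes the clipped surrogate objective, the minimized policy_loss is typically negative; abs() only affects the logged number, not the gradient.

import torch

batch_size = 64
policy_loss = torch.tensor(-0.01)  # surrogate term being minimized; usually negative
value_loss = torch.tensor(0.25)
entropy = torch.rand(batch_size)   # per-agent entropy, already summed across action branches
beta = 5.0e-3                      # entropy bonus coefficient (placeholder value)

# The entropy bonus uses the batch mean of the per-agent (summed) entropy.
loss = policy_loss + 0.5 * value_loss - beta * torch.mean(entropy)

update_stats = {
    "Losses/Policy Loss": torch.abs(policy_loss).item(),  # abs() to match the TF trainer's stat
    "Losses/Value Loss": value_loss.item(),
}
print(loss.item(), update_stats)
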
ml-agents/mlagents/trainers/tests/torch/test_policy.py (4 changes)

         seq_len=policy.sequence_length,
     )
     assert log_probs.shape == (64, policy.behavior_spec.action_size)
-    assert entropy.shape == (64, policy.behavior_spec.action_size)
+    assert entropy.shape == (64,)
     for val in values.values():
         assert val.shape == (64,)

         )
     else:
         assert log_probs.shape == (64, policy.behavior_spec.action_shape)
-    assert entropies.shape == (64, policy.behavior_spec.action_size)
+    assert entropies.shape == (64,)
     if rnn:
         assert memories.shape == (1, 1, policy.m_size)
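
The updated assertions expect an entropy of shape (64,) regardless of how many action branches the policy has. Below is a self-contained check of that shape contract; the helper is illustrative and not part of the ML-Agents test suite.

import torch


def test_entropy_is_one_value_per_agent():
    # After torch.sum(entropies, dim=1), entropy is a single scalar per agent,
    # independent of the number of action branches.
    batch_size, num_branches = 64, 3
    entropies = torch.rand(batch_size, num_branches)
    entropy_sum = torch.sum(entropies, dim=1)
    assert entropy_sum.shape == (batch_size,)


test_entropy_is_one_value_per_agent()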

ml-agents/mlagents/trainers/torch/distributions.py (3 changes)

         if self.conditional_sigma:
             log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
         else:
-            log_sigma = self.log_sigma
+            # Expand so that entropy matches batch size
+            log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
         if self.tanh_squash:
             return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
         else:
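
The expand matters because the Gaussian entropy used here depends only on sigma: with a state-independent log_sigma parameter of shape (1, act_size), the entropy tensor would have batch size 1 instead of matching the inputs. A simplified sketch follows; the gaussian_entropy helper is illustrative, not the ML-Agents distribution class.

import math

import torch
from torch import nn

act_size, batch_size = 2, 64
log_sigma = nn.Parameter(torch.zeros(1, act_size))  # state-independent, one row


def gaussian_entropy(log_sig: torch.Tensor) -> torch.Tensor:
    # Elementwise entropy of a diagonal Gaussian: 0.5 * log(2 * pi * e * sigma^2)
    return 0.5 * torch.log(2 * math.pi * math.e * torch.exp(log_sig) ** 2)


print(gaussian_entropy(log_sigma).shape)                         # torch.Size([1, 2])
print(gaussian_entropy(log_sigma.expand(batch_size, -1)).shape)  # torch.Size([64, 2])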
