Try to reduce bias

4 年前 · 4fe8d036
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py

 from collections import defaultdict
 from typing import cast
+import statistics

 import numpy as np

    :return: list of advantage estimates for time-steps t to T.
    """
    value_estimates = np.append(value_estimates, value_next)
-    delta_t = rewards + gamma * value_estimates[1:] - baseline
+    q_estimate = rewards + gamma * value_estimates[1:]
+    delta_t = (q_estimate - statistics.mean(q_estimate)) - (
+        baseline - statistics.mean(baseline)
+    )
    advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
    return advantage