policy = TorchPolicy(
    self.seed,
    behavior_spec,
    self.trainer_settings,
    condition_sigma_on_obs=False,  # Faster training for PPO
    tanh_squash=True,
    separate_critic=behavior_spec.action_spec.is_continuous(),
)
return policy
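Setting tanh_squash=True makes the policy pass its raw Gaussian samples through a tanh, so the actions it emits always land in (-1, 1). As a rough standalone illustration of the idea (my own sketch, not the actual ML-Agents implementation):

import torch

mean = torch.zeros(2)
std = torch.ones(2)
raw = mean + torch.randn_like(mean) * std  # unbounded Gaussian sample
action = torch.tanh(raw)                   # squashed into (-1, 1)
print(action)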
def sample(self):
    sample = self.mean + torch.randn_like(self.mean) * self.std
    # Original line: return sample
    return sample / 3  # scale the raw Gaussian sample down by a factor of 3
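The division by 3 appears to rely on the three-sigma rule: if the policy's standard deviation is around 1, almost all raw samples fall within three standard deviations of the mean, so dividing by 3 keeps the returned action roughly inside [-1, 1]. A quick empirical check of that assumption (my own illustration, with mean 0 and std 1):

import torch

mean = torch.zeros(100_000)
std = torch.ones(100_000)
raw = mean + torch.randn_like(mean) * std   # same sampling rule as sample() above
scaled = raw / 3
inside = ((scaled >= -1) & (scaled <= 1)).float().mean()
print(float(inside))  # roughly 0.997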
def log_prob(self, value):
    unscaled_val = value * 3  # Inverse of the clipping in sample()
    var = self.std ** 2
    log_scale = torch.log(self.std + EPSILON)
    return (
        -((unscaled_val - self.mean) ** 2) / (2 * var + EPSILON)
        # Original line: -((value - self.mean) ** 2) / (2 * var + EPSILON)
        - log_scale
        - math.log(math.sqrt(2 * math.pi))
    )
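Note that this log_prob evaluates the Gaussian density at value * 3, i.e. at the unscaled sample. Strictly, if the stored action is a = x / 3, the change of variables gives log p(a) = log N(3a; mean, std) + log 3, but the extra log 3 is a constant that cancels in PPO's probability ratio, so dropping it is harmless. A small numerical check of that identity (my own sketch, using torch.distributions directly rather than the ML-Agents classes):

import math
import torch
from torch.distributions import Normal

mean, std = torch.tensor(0.2), torch.tensor(1.0)
dist = Normal(mean, std)

raw = dist.sample()      # unbounded Gaussian sample
action = raw / 3         # what the modified sample() returns

log_p_code = dist.log_prob(action * 3)                # what the modified log_prob computes
log_p_true = dist.log_prob(action * 3) + math.log(3)  # exact log-density of the scaled action

print(float(log_p_true - log_p_code))  # always log(3) ≈ 1.0986, a constant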