 loss = (
     policy_loss
-    + 0.5 * (value_loss + baseline_loss)
+    + 0.5 * (value_loss + 0.5 * baseline_loss)
     - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
 )
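
Note the change in weighting: the baseline term now carries an effective coefficient of 0.25 (0.5 applied twice) while the value term keeps 0.5, so value errors dominate the critic portion of the loss. Below is a minimal runnable sketch of that weighting, assuming plain mean-squared-error critic losses; masked_mean here is a local stand-in for ModelUtils.masked_mean, and every tensor is synthetic.

import torch

def masked_mean(tensor: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Average only over unmasked (valid) timesteps.
    return (tensor * mask).sum() / torch.clamp(mask.sum(), min=1.0)

returns = torch.randn(8)
values = torch.randn(8, requires_grad=True)
baselines = torch.randn(8, requires_grad=True)
entropy = torch.rand(8)
loss_masks = torch.ones(8)
policy_loss = torch.tensor(0.1)
decay_bet = 5e-3  # decayed entropy coefficient

value_loss = masked_mean((returns - values) ** 2, loss_masks)
baseline_loss = masked_mean((returns - baselines) ** 2, loss_masks)

# Effective weights: 0.5 on value_loss, 0.25 on baseline_loss.
loss = (
    policy_loss
    + 0.5 * (value_loss + 0.5 * baseline_loss)
    - decay_bet * masked_mean(entropy, loss_masks)
)
loss.backward()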
     modules.update(reward_provider.get_modules())
 return modules
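
These context lines are the tail of the optimizer's get_modules: after gathering its own modules, it merges in each reward provider's modules so their weights are checkpointed alongside everything else. A self-contained sketch of that pattern, with illustrative class names and dict keys:

from typing import Dict
import torch

class ExtrinsicRewardProvider:
    def __init__(self) -> None:
        self.net = torch.nn.Linear(4, 1)

    def get_modules(self) -> Dict[str, torch.nn.Module]:
        # Each provider reports the modules it needs checkpointed.
        return {"RewardProvider:extrinsic": self.net}

class Optimizer:
    def __init__(self) -> None:
        self.critic = torch.nn.Linear(4, 1)
        self.reward_signals = {"extrinsic": ExtrinsicRewardProvider()}

    def get_modules(self) -> Dict[str, torch.nn.Module]:
        modules: Dict[str, torch.nn.Module] = {"Optimizer:critic": self.critic}
        for reward_provider in self.reward_signals.values():
            # Merge so reward-provider weights are saved and restored too.
            modules.update(reward_provider.get_modules())
        return modules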
-def get_trajectory_value_estimates(
+def get_trajectory_and_baseline_value_estimates(
     ...
-) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
+) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, float]]:
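
The rename tracks a widened contract: alongside the per-reward-signal value estimates and the final-step bootstrap values, the method now also returns per-signal baseline estimates, hence the extra Dict[str, np.ndarray] in the return tuple. A hypothetical stub showing how a caller unpacks the three results; the body is a placeholder, not the trainer's real estimation logic.

from typing import Dict, List, Tuple
import numpy as np

def get_trajectory_and_baseline_value_estimates(
    num_steps: int, reward_signals: List[str]
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, float]]:
    # One array of per-step estimates per reward signal.
    value_estimates = {n: np.zeros(num_steps, np.float32) for n in reward_signals}
    baseline_estimates = {n: np.zeros(num_steps, np.float32) for n in reward_signals}
    # Scalar bootstrap value for the step after the trajectory ends.
    next_value_estimate = {n: 0.0 for n in reward_signals}
    return value_estimates, baseline_estimates, next_value_estimate

# Callers now unpack three results instead of two.
values, baselines, next_values = get_trajectory_and_baseline_value_estimates(
    16, ["extrinsic"]
)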
 n_obs = len(self.policy.behavior_spec.observation_specs)
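
n_obs counts the observation specs on the behavior spec; a natural use of it is as the stride for regrouping a flat per-step observation list back into per-spec arrays. An illustration under an assumed buffer layout:

import numpy as np

# Assume 2 observation specs, stored interleaved per step:
# [step0_obs0, step0_obs1, step1_obs0, step1_obs1, ...].
n_obs = 2  # e.g. len(behavior_spec.observation_specs)
flat = [np.full(3, 10 * t + i) for t in range(4) for i in range(n_obs)]

# per_spec[i] collects every step's i-th observation.
per_spec = [flat[i::n_obs] for i in range(n_obs)]
assert len(per_spec) == n_obs and len(per_spec[0]) == 4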