        self.hyperparameters: PPOSettings = cast(
            PPOSettings, trainer_settings.hyperparameters
        )
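
        # Schedules that decay the learning rate, the PPO clip range (epsilon) and the
        # entropy-bonus coefficient (beta) from their configured values toward a small
        # floor over max_steps, following the configured learning_rate_schedule.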
        self.decay_learning_rate = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.decay_epsilon = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.epsilon,
            0.1,
            self.trainer_settings.max_steps,
        )
        self.decay_beta = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.beta,
            1e-5,
            self.trainer_settings.max_steps,
        )
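
        # NOTE: `params` used below is assumed to be the list of trainable parameters
        # (actor and critic networks) gathered earlier in __init__; that line is not part
        # of this excerpt.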
        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_settings.hyperparameters.learning_rate
        )

        self.stream_names = list(self.reward_signals.keys())

    def ppo_value_loss(
        self,
        values: Dict[str, torch.Tensor],
        old_values: Dict[str, torch.Tensor],
        returns: Dict[str, torch.Tensor],
        epsilon: float,
    ) -> torch.Tensor:
        """
        Computes the clipped value loss for PPO.
        :param values: Value estimates from the current network, keyed by reward stream.
        :param old_values: Value estimates stored with the experiences in the buffer.
        :param returns: Computed returns, keyed by reward stream.
        :param epsilon: Clipping range for the value estimate.
        :return: Scalar value loss.
        """
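        # Clipped value objective: the new value estimate is only trusted within +/- epsilon
        # of the old estimate, and the larger of the clipped and unclipped squared errors is
        # minimized, mirroring the clipping applied to the policy objective.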
        value_losses = []
        for name, head in values.items():
            old_val_tensor = old_values[name]
            returns_tensor = returns[name]
            # Clip the change in the value estimate to +/- epsilon around the old estimate.
            clipped_value_estimate = old_val_tensor + torch.clamp(
                head - old_val_tensor, -1 * epsilon, epsilon
            )
            v_opt_a = (returns_tensor - head) ** 2
            v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
            value_losses.append(torch.mean(torch.max(v_opt_a, v_opt_b)))
        # Average the per-reward-stream losses into a single scalar.
        return torch.mean(torch.stack(value_losses))

    def update(self, batch, num_sequences: int) -> Dict[str, float]:
        """
        Performs an update step of the policy and value networks using the given batch.
        :param batch: Batch of experiences to learn from.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        # Get decayed parameters
        decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
        decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        for name in self.reward_signals:
            # Fill `old_values[name]` and `returns[name]` with tensors built from the
            # per-reward-stream value estimates and returns stored in `batch`
            # (buffer keys elided here).
            ...
        # The start of this call is missing from the excerpt; it is assumed to evaluate the
        # current policy on the batch, yielding the log probabilities, entropy and value
        # estimates used below (observation and action arguments elided).
        log_probs, entropy, values = self.policy.evaluate_actions(  # method name assumed
            ...,  # observation/action tensors prepared from `batch` (elided)
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        value_loss = self.ppo_value_loss(values, old_values, returns, decay_eps)
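        # The policy loss is the clipped PPO surrogate: advantages weighted by the ratio of
        # new to old action probabilities, with the ratio clipped to keep updates small.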
        policy_loss = self.ppo_policy_loss(
            ModelUtils.list_to_tensor(batch["advantages"]),
            log_probs,
            ...,  # remaining arguments (e.g. old log-probabilities) elided in this excerpt
        )
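        # Total objective: the policy surrogate plus the value loss weighted by 0.5, minus
        # an entropy bonus whose coefficient (beta) is decayed over the course of training.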
        loss = (
            policy_loss
            + 0.5 * value_loss
            - decay_bet * torch.mean(entropy)
        )

        # Set optimizer learning rate
        ModelUtils.update_learning_rate(self.optimizer, decay_lr)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Report the losses and the decayed schedule values for this update step.
        update_stats = {
            "Losses/Value Loss": value_loss.detach().cpu().numpy(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }

        return update_stats