|
|
|
|
|
|
"learning_rate": self.learning_rate, |
|
|
|
"decay_epsilon": self.decay_epsilon, |
|
|
|
"decay_beta": self.decay_beta, |
|
|
|
"reward_loss": self.policy.reward_loss, |
|
|
|
|
|
|
|
|
|
|
|
self.model_update_dict.update( |
|
|
|
{ |
|
|
|
|
|
|
"decay_epsilon": self.decay_epsilon, |
|
|
|
"decay_beta": self.decay_beta, |
|
|
|
"reward_loss": self.policy.reward_loss, |
|
|
|
|
|
|
|
# When return prediction is enabled, also report the policy's reward-model
# loss: register the same "reward_loss" entry in both stats dictionaries.
if self.predict_return:
    reward_loss_entry = {"reward_loss": self.policy.reward_loss}
    self.ppo_update_dict.update(reward_loss_entry)
    self.model_update_dict.update(reward_loss_entry)
|
|
|
|
|
|
|
@timed |
|
|
|
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: |
|
|
|