from typing import Any, Dict

import numpy as np

from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
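

# NOTE: The class and method headers below are not part of this excerpt; they are
# assumed from context (a PPO optimizer built on the imported TFOptimizer/TFPolicy),
# and a few elided lines are reconstructed the same way. Treat the skeleton as a
# best-effort sketch rather than verbatim source.
class PPOOptimizer(TFOptimizer):
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Wraps a TFPolicy with a PPO value estimator, losses, and update ops.
        """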
        super().__init__(policy, trainer_params)

        # PPO hyperparameters read from the trainer configuration.
        lr = float(trainer_params["learning_rate"])
        self._schedule = ScheduleType(
            trainer_params.get("learning_rate_schedule", "linear")
        )
        h_size = int(trainer_params["hidden_units"])
        epsilon = float(trainer_params["epsilon"])
        beta = float(trainer_params["beta"])
        max_step = float(trainer_params["max_steps"])
        num_layers = int(trainer_params["num_layers"])
        vis_encode_type = EncoderType(
            trainer_params.get("vis_encode_type", "simple")
        )

        # Maps the stat names reported to TensorBoard onto keys in update_dict.
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Policy/Learning Rate": "learning_rate",
            "Policy/Epsilon": "decay_epsilon",
            "Policy/Beta": "decay_beta",
        }
        if self.policy.use_recurrent:
            self.m_size = self.policy.m_size

        # Build the critic: continuous-control or discrete-control value head(s).
        if policy.use_continuous_act:
            self._create_cc_critic(h_size, num_layers, vis_encode_type)
        else:
            self._create_dc_critic(h_size, num_layers, vis_encode_type)

        # Learning rate decays over max_step steps according to the configured schedule.
        self.learning_rate = ModelUtils.create_schedule(
            self._schedule,
            lr,
            self.policy.global_step,
            int(max_step),
            min_value=1e-10,
        )
        self._create_losses(
            self.policy.total_log_probs,
            self.old_log_probs,
            self.value_heads,
            self.policy.entropy,
            beta,
            epsilon,
            lr,
            max_step,
        )
        self._create_ppo_optimizer_ops()

        # Tensors fetched (and reported) on every update step.
        self.update_dict.update(
            {
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
                "decay_epsilon": self.decay_epsilon,
                "decay_beta": self.decay_beta,
            }
        )

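    def _create_losses(
        self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
    ):
        """
        Creates the PPO loss ops: clipped value loss, clipped surrogate policy loss,
        and an entropy bonus. (Signature assumed from how the method is called above.)
        """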
        # Advantage estimates are fed in at update time; expand_dims gives them
        # shape [batch, 1] so they broadcast against the per-branch terms.
        self.advantage = tf.placeholder(
            shape=[None], dtype=tf.float32, name="advantages"
        )
        advantage = tf.expand_dims(self.advantage, -1)

        # Clipping epsilon decays on the same schedule as the learning rate,
        # down to a floor of 0.1.
        self.decay_epsilon = ModelUtils.create_schedule(
            self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
        )
        # Entropy coefficient beta decays the same way, down to a floor of 1e-5.
        self.decay_beta = ModelUtils.create_schedule(
            self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
        )

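        # For reference: with the default "linear" schedule, these are equivalent to
        # polynomial decays of the form
        #   decay_epsilon = tf.train.polynomial_decay(
        #       epsilon, self.policy.global_step, max_step, 0.1, power=1.0
        #   )
        #   decay_beta = tf.train.polynomial_decay(
        #       beta, self.policy.global_step, max_step, 1e-5, power=1.0
        #   )
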
        # Clipped value loss, one term per reward-signal value head. The per-stream
        # placeholders self.returns_holders and self.old_values are created earlier
        # in this method and are omitted from this excerpt.
        value_losses = []
        for name, head in value_heads.items():
            clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
                tf.reduce_sum(head, axis=1) - self.old_values[name],
                -self.decay_epsilon,
                self.decay_epsilon,
            )
            v_opt_a = tf.squared_difference(
                self.returns_holders[name], tf.reduce_sum(head, axis=1)
            )
            v_opt_b = tf.squared_difference(
                self.returns_holders[name], clipped_value_estimate
            )
            value_loss = tf.reduce_mean(
                tf.dynamic_partition(
                    tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2
                )[1]
            )
            value_losses.append(value_loss)
        self.value_loss = tf.reduce_mean(value_losses)

        # Probability ratio between the new and old policies, and its clipped form.
        r_theta = tf.exp(probs - old_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            tf.clip_by_value(
                r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
            )
            * advantage
        )
        self.policy_loss = -tf.reduce_mean(
            tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
        )
        # Reported as a positive magnitude for cleaner stats.
        self.abs_policy_loss = tf.abs(self.policy_loss)

        # Total loss: clipped policy term, value term (0.5 weight), minus the
        # decaying entropy bonus, all masked to valid (non-padded) timesteps.
        self.loss = (
            self.policy_loss
            + 0.5 * self.value_loss
            - self.decay_beta
            * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
        )
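
        # The policy term implements the standard PPO clipped surrogate objective,
        #   L_clip = -E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],
        # with r_t = exp(log pi_new - log pi_old); epsilon and beta decay toward
        # their min_value floors over max_step updates.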