
Add constant decay to beta and epsilon

/docs-update
Andrew Cohen, 4 years ago
Current commit: 4a3ad193
3 files changed, 42 insertions(+), 29 deletions(-)
  1. ml-agents/mlagents/trainers/models.py (25 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer.py (34 changes)
  3. ml-agents/mlagents/trainers/sac/optimizer.py (12 changes)

ml-agents/mlagents/trainers/models.py (25 changes)

     RESNET = "resnet"

-class LearningRateSchedule(Enum):
+class ScheduleType(Enum):
     CONSTANT = "constant"
     LINEAR = "linear"

         return global_step, increment_step, steps_to_increment

     @staticmethod
-    def create_learning_rate(
-        lr_schedule: LearningRateSchedule,
-        lr: float,
+    def create_schedule(
+        schedule: ScheduleType,
+        parameter: float,
+        min_value: float,
     ) -> tf.Tensor:
         """
         Create a learning rate tensor.

         :param max_step: The maximum number of steps in the training run.
         :return: A Tensor containing the learning rate.
         """
-        if lr_schedule == LearningRateSchedule.CONSTANT:
-            learning_rate = tf.Variable(lr)
-        elif lr_schedule == LearningRateSchedule.LINEAR:
-            learning_rate = tf.train.polynomial_decay(
-                lr, global_step, max_step, 1e-10, power=1.0
+        if schedule == ScheduleType.CONSTANT:
+            parameter_rate = tf.Variable(parameter)
+        elif schedule == ScheduleType.LINEAR:
+            parameter_rate = tf.train.polynomial_decay(
+                parameter, global_step, max_step, min_value, power=1.0
             )
         else:
-            raise UnityTrainerException(
-                "The learning rate schedule {} is invalid.".format(lr_schedule)
-            )
-        return learning_rate
+            raise UnityTrainerException("The schedule {} is invalid.".format(schedule))
+        return tf.stop_gradient(parameter_rate)

     @staticmethod
     def scaled_init(scale):
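
To make the renamed helper concrete, here is a minimal plain-Python sketch of the two schedules (linear_schedule and constant_schedule are illustrative names, not ml-agents API); the linear branch mirrors tf.train.polynomial_decay with power=1.0, which is what the LINEAR case above calls:

def linear_schedule(parameter: float, step: int, max_step: int, min_value: float) -> float:
    # Linear decay from `parameter` at step 0 down to `min_value` at `max_step`,
    # matching tf.train.polynomial_decay(..., power=1.0).
    step = min(step, max_step)
    return (parameter - min_value) * (1.0 - step / max_step) + min_value

def constant_schedule(parameter: float, step: int, max_step: int, min_value: float) -> float:
    # CONSTANT schedule: the value is held fixed for the whole run.
    return parameter

print(linear_schedule(3e-4, 0, 1000, 1e-10))      # ~3e-4 at the start of training
print(linear_schedule(3e-4, 500, 1000, 1e-10))    # ~1.5e-4 halfway through
print(constant_schedule(3e-4, 500, 1000, 1e-10))  # 3e-4 at every step

Because PPO now builds epsilon and beta through this same helper (see the next file), choosing the "constant" schedule keeps them fixed for the whole run instead of decaying them toward their minimum values.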

ml-agents/mlagents/trainers/ppo/optimizer.py (34 changes)

 import numpy as np
 from mlagents.tf_utils import tf
 from mlagents_envs.timers import timed
-from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
+from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
 from mlagents.trainers.buffer import AgentBuffer

         super().__init__(policy, trainer_params)
         lr = float(trainer_params["learning_rate"])
-        lr_schedule = LearningRateSchedule(
+        self._schedule = ScheduleType(
             trainer_params.get("learning_rate_schedule", "linear")
         )
         h_size = int(trainer_params["hidden_units"])

             "Losses/Value Loss": "value_loss",
             "Losses/Policy Loss": "policy_loss",
             "Policy/Learning Rate": "learning_rate",
+            "Policy/Epsilon": "decay_epsilon",
+            "Policy/Beta": "decay_beta",
         }
         if self.policy.use_recurrent:
             self.m_size = self.policy.m_size

         else:
             self._create_dc_critic(h_size, num_layers, vis_encode_type)
-        self.learning_rate = ModelUtils.create_learning_rate(
-            lr_schedule, lr, self.policy.global_step, int(max_step)
+        self.learning_rate = ModelUtils.create_schedule(
+            self._schedule,
+            lr,
+            self.policy.global_step,
+            int(max_step),
+            min_value=1e-10,
         )
         self._create_losses(
             self.policy.total_log_probs,

                 "policy_loss": self.abs_policy_loss,
                 "update_batch": self.update_batch,
                 "learning_rate": self.learning_rate,
+                "decay_epsilon": self.decay_epsilon,
+                "decay_beta": self.decay_beta,
             }
         )

         )
         advantage = tf.expand_dims(self.advantage, -1)
-        decay_epsilon = tf.train.polynomial_decay(
-            epsilon, self.policy.global_step, max_step, 0.1, power=1.0
+        self.decay_epsilon = ModelUtils.create_schedule(
+            self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
         )
-        decay_beta = tf.train.polynomial_decay(
-            beta, self.policy.global_step, max_step, 1e-5, power=1.0
+        self.decay_beta = ModelUtils.create_schedule(
+            self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
         )
         value_losses = []

-                -decay_epsilon,
-                decay_epsilon,
+                -self.decay_epsilon,
+                self.decay_epsilon,
             )
             v_opt_a = tf.squared_difference(
                 self.returns_holders[name], tf.reduce_sum(head, axis=1)

         r_theta = tf.exp(probs - old_probs)
         p_opt_a = r_theta * advantage
         p_opt_b = (
-            tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
+            tf.clip_by_value(
+                r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
+            )
             * advantage
         )
         self.policy_loss = -tf.reduce_mean(

         self.loss = (
             self.policy_loss
             + 0.5 * self.value_loss
-            - decay_beta
+            - self.decay_beta
             * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
         )
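
decay_epsilon and decay_beta are now graph tensors produced by create_schedule, so they follow the same learning_rate_schedule setting as the learning rate. The numpy sketch below (ppo_clipped_objective is a hypothetical name, and the epsilon values are examples only) shows the surrogate term that decay_epsilon clips; decay_beta plays the analogous role as the coefficient on the entropy term subtracted in self.loss:

import numpy as np

def ppo_clipped_objective(log_probs, old_log_probs, advantage, epsilon):
    # r_theta = pi_new / pi_old, computed in log space as in the diff above.
    r_theta = np.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = np.clip(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
    # PPO takes the elementwise minimum of the clipped and unclipped terms.
    return np.mean(np.minimum(p_opt_a, p_opt_b))

advantage = np.array([1.0, -0.5])
log_probs = np.log(np.array([0.6, 0.3]))
old_log_probs = np.log(np.array([0.5, 0.4]))

# A "linear" schedule shrinks epsilon toward min_value=0.1 over max_step;
# the new "constant" schedule keeps it at its initial value.
for eps in (0.2, 0.15, 0.1):
    print(eps, ppo_clipped_objective(log_probs, old_log_probs, advantage, eps))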

ml-agents/mlagents/trainers/sac/optimizer.py (12 changes)

 from mlagents_envs.logging_util import get_logger
 from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
-from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
+from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
 from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.buffer import AgentBuffer

         with tf.variable_scope(""):
             super().__init__(policy, trainer_params)
             lr = float(trainer_params["learning_rate"])
-            lr_schedule = LearningRateSchedule(
+            lr_schedule = ScheduleType(
                 trainer_params.get("learning_rate_schedule", "constant")
             )
             self.policy = policy

             # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
             self.m_size = 3 * self.policy.m_size
             self._create_inputs_and_outputs()
-            self.learning_rate = ModelUtils.create_learning_rate(
-                lr_schedule, lr, self.policy.global_step, int(max_step)
+            self.learning_rate = ModelUtils.create_schedule(
+                lr_schedule,
+                lr,
+                self.policy.global_step,
+                int(max_step),
+                min_value=1e-10,
             )
             self._create_losses(
                 self.policy_network.q1_heads,
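
Note the different defaults: the PPO optimizer falls back to "linear" while SAC falls back to "constant". A hypothetical trainer_params fragment (example values only) showing the keys these constructors read:

trainer_params = {
    "learning_rate": 3.0e-4,
    # "linear" decays the scheduled values toward their min_value over max_step;
    # "constant" holds them fixed. PPO also applies this choice to epsilon and beta.
    "learning_rate_schedule": "constant",
}

lr = float(trainer_params["learning_rate"])
schedule_name = trainer_params.get("learning_rate_schedule", "constant")
print(lr, schedule_name)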
