
Fix discrete SAC and clean up policy

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit: 1b6e175c
2 files changed, 22 insertions(+), 22 deletions(-)
  1. ml-agents/mlagents/trainers/common/nn_policy.py (20 changes)
  2. ml-agents/mlagents/trainers/sac/optimizer.py (24 changes)

ml-agents/mlagents/trainers/common/nn_policy.py (20 changes)


kernel_initializer=LearningModel.scaled_init(0.01),
)
- self.log_sigma = tf.clip_by_value(log_sigma, LOG_STD_MIN, LOG_STD_MAX)
+ log_sigma = tf.clip_by_value(log_sigma, LOG_STD_MIN, LOG_STD_MAX)
- sigma = tf.exp(self.log_sigma)
+ sigma = tf.exp(log_sigma)
- self.epsilon = tf.random_normal(tf.shape(mu))
+ epsilon = tf.random_normal(tf.shape(mu))
- sampled_policy = mu + sigma * self.epsilon
+ sampled_policy = mu + sigma * epsilon
# Stop gradient if we're not doing the resampling trick
if not resample:

_gauss_pre = -0.5 * (
((sampled_policy - mu) / (sigma + EPSILON)) ** 2
- + 2 * self.log_sigma
+ + 2 * log_sigma
+ np.log(2 * np.pi)
)
all_probs = _gauss_pre

self.all_log_probs = tf.identity(all_probs, name="action_probs")
single_dim_entropy = 0.5 * tf.reduce_mean(
- tf.log(2 * np.pi * np.e) + tf.square(self.log_sigma)
+ tf.log(2 * np.pi * np.e) + tf.square(log_sigma)
)
# Make entropy the right shape
self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy
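For reference, _gauss_pre above is the closed-form log density of a diagonal Gaussian evaluated at the resampled action, with the clipped log_sigma now kept as a local rather than an attribute. A minimal NumPy sketch of the same algebra (names are illustrative; EPSILON is assumed to match the module-level constant):

import numpy as np

EPSILON = 1e-7  # assumed to correspond to the module-level EPSILON constant

def gaussian_log_prob(sample, mu, log_sigma):
    # Elementwise log N(sample; mu, sigma^2), the same algebra as _gauss_pre:
    # -0.5 * (((x - mu) / sigma)^2 + 2*log_sigma + log(2*pi))
    sigma = np.exp(log_sigma)
    return -0.5 * (
        ((sample - mu) / (sigma + EPSILON)) ** 2
        + 2.0 * log_sigma
        + np.log(2.0 * np.pi)
    )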

)
)
- self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
+ raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
- output, _, normalized_logits = LearningModel.create_discrete_action_masking_layer(
- self.all_log_probs, self.action_masks, self.act_size
+ output, self.action_probs, normalized_logits = LearningModel.create_discrete_action_masking_layer(
+ raw_log_probs, self.action_masks, self.act_size
- self.normalized_logits = tf.identity(normalized_logits, name="action")
+ self.all_log_probs = tf.identity(normalized_logits, name="action")
self.action_holder = tf.placeholder(
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
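With this change the policy's all_log_probs carries the masked, renormalized log probabilities, and the matching probabilities are exposed as self.action_probs instead of being discarded. As a rough, simplified stand-in for what create_discrete_action_masking_layer does for a single branch (illustrative only, not the repo's implementation):

import numpy as np

def masked_probs_and_log_probs(raw_logits, action_mask, eps=1e-10):
    # Softmax over the branch, zero out disallowed actions, renormalize,
    # and return both the probabilities and their logs.
    z = np.exp(raw_logits - raw_logits.max(axis=-1, keepdims=True))
    z = z * action_mask
    probs = z / (z.sum(axis=-1, keepdims=True) + eps)
    return probs, np.log(probs + eps)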

ml-agents/mlagents/trainers/sac/optimizer.py (24 changes)


self.policy_network.value, name="value_estimate_unused"
)
self.value_heads = self.policy_network.value_heads
- self.all_log_probs = self.policy.all_log_probs
self.dones_holder = tf.placeholder(
shape=[None], dtype=tf.float32, name="dones_holder"
)

DISCRETE_TARGET_ENTROPY_SCALE * np.log(i).astype(np.float32)
for i in self.act_size
]
+ discrete_action_probs = tf.exp(self.policy.all_log_probs)
+ per_action_entropy = discrete_action_probs * self.policy.all_log_probs
else:
self.target_entropy = (
-1
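The two added lines convert the masked log probabilities into probabilities and the elementwise p*log p terms that the branched sums below consume; the target entropy is one scaled log branch size per branch. A small NumPy sketch, assuming DISCRETE_TARGET_ENTROPY_SCALE is a plain float (the value here is illustrative):

import numpy as np

DISCRETE_TARGET_ENTROPY_SCALE = 0.2  # assumed value, for illustration only

def discrete_targets_and_entropy_terms(branch_sizes, all_log_probs):
    # One target entropy per action branch, plus the per-action p * log p terms
    # (negating and summing p * log p over a branch gives that branch's entropy).
    target_entropy = [
        DISCRETE_TARGET_ENTROPY_SCALE * np.log(n).astype(np.float32)
        for n in branch_sizes
    ]
    probs = np.exp(all_log_probs)
    per_action_entropy = probs * all_log_probs
    return target_entropy, per_action_entropy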

for name in stream_names:
if discrete:
_branched_mpq1 = self.apply_as_branches(
- self.policy_network.q1_pheads[name] * self.policy.action_probs
+ self.policy_network.q1_pheads[name] * discrete_action_probs
)
branched_mpq1 = tf.stack(
[

_q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
_branched_mpq2 = self.apply_as_branches(
- self.policy_network.q2_pheads[name] * self.policy.action_probs
+ self.policy_network.q2_pheads[name] * discrete_action_probs
)
branched_mpq2 = tf.stack(
[

self.ent_coef = tf.exp(self.log_ent_coef)
if discrete:
# We also have to do a different entropy and target_entropy per branch.
- branched_log_probs = self.apply_as_branches(self.policy.all_log_probs)
+ branched_per_action_ent = self.apply_as_branches(per_action_entropy)
- for _lp, _te in zip(branched_log_probs, self.target_entropy)
+ for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
],
axis=1,
)

# so that larger branches don't get more weight.
# The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
branched_q_term = self.apply_as_branches(
- self.policy_network.action_probs * self.policy_network.q1_p
+ discrete_action_probs * self.policy_network.q1_p
)
branched_policy_loss = tf.stack(

- zip(branched_log_probs, branched_q_term)
+ zip(branched_per_action_ent, branched_q_term)
)
]
)
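As the comment above notes, each branch contributes pi*log(pi) - Q in expectation under the current action distribution. A minimal NumPy sketch of one branch's contribution, with hypothetical names; the real graph stacks these per-branch terms with tf.stack and averages them so larger branches do not dominate:

import numpy as np

def branch_policy_loss(per_action_entropy, probs, q_values, ent_coef):
    # per_action_entropy = pi * log pi  (per action)
    # probs * q_values   = pi * Q       (per action)
    # Summing over the branch gives E_pi[alpha * log pi - Q] for that branch.
    return np.sum(ent_coef * per_action_entropy - probs * q_values, axis=-1)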

branched_ent_bonus = tf.stack(
[
tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
- for i, _lp in enumerate(branched_log_probs)
+ for i, _lp in enumerate(branched_per_action_ent)
]
)
value_losses = []

* tf.to_float(self.policy.mask)
* tf.stop_gradient(
tf.reduce_sum(
- self.policy.all_log_probs + self.target_entropy,
+ branched_per_action_ent + self.target_entropy,
axis=1,
keep_dims=True,
)
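The surrounding hunk trains the log entropy coefficient so that the per-branch policy entropy tracks the target entropy, with stop_gradient keeping this update from flowing back into the policy. A loose NumPy sketch of the scalar loss being minimized, assuming a single shared coefficient and treating the entropy terms as constants:

import numpy as np

def entropy_coef_loss(log_ent_coef, per_action_entropy, target_entropy, mask):
    # -log(alpha) * (sum_a pi*log pi + target_entropy), averaged over valid
    # (unmasked) steps; per_action_entropy is treated as a constant here to
    # mirror the stop_gradient in the graph.
    ent_term = np.sum(per_action_entropy + target_entropy, axis=-1)
    return -np.mean(log_ent_coef * mask * ent_term)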

- self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
- axis=1,
+ self.ent_coef * per_action_entropy - self.policy_network.q1_p, axis=1
)
self.policy_loss = tf.reduce_mean(
tf.to_float(self.policy.mask) * batch_policy_loss

for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+ - tf.reduce_sum(self.ent_coef * per_action_entropy, axis=1)
)
value_losses.append(
0.5
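For the value targets, the backup now subtracts ent_coef times the p*log p terms instead of the raw log probabilities, which amounts to adding an entropy bonus of alpha * H(pi) on top of the minimum policy Q estimate. A minimal sketch with hypothetical names:

import numpy as np

def soft_value_target(min_policy_q, per_action_entropy, ent_coef):
    # per_action_entropy = pi * log pi (negative), so subtracting
    # ent_coef * sum_a pi*log pi adds an entropy bonus of alpha * H(pi).
    return min_policy_q - np.sum(ent_coef * per_action_entropy, axis=-1)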
