
Somewhat running

/develop-pytorch
Ervin Teng, 5 years ago
Current commit: 9dbbfd77
1 changed file with 12 additions and 8 deletions
  1. ml-agents/mlagents/trainers/ppo/policy.py (20 changes: 12 additions, 8 deletions)



        self.optimizer = tf.keras.optimizers.Adam(
            lr=self.trainer_params["learning_rate"]
        )
-        self.sequence_length = 1 if not self.trainer_params["use_recurrent"] else self.trainer_params["sequence_length"]
+        self.sequence_length = (
+            1
+            if not self.trainer_params["use_recurrent"]
+            else self.trainer_params["sequence_length"]
+        )
        self.global_step = tf.Variable(0)
        self.create_reward_signals(reward_signal_configs)
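
The hunk above sets up the eager-mode training state: an Adam optimizer fed from the trainer config, a sequence length that collapses to 1 when the policy is not recurrent, and a tf.Variable step counter. A minimal sketch of how these pieces are typically used together in a TF2 eager update, with placeholder trainer_params and a dummy loss (not the actual PPOPolicy code):

import tensorflow as tf

# Placeholder config; the real values come from the trainer's hyperparameter dict.
trainer_params = {"learning_rate": 3.0e-4, "use_recurrent": False, "sequence_length": 64}

optimizer = tf.keras.optimizers.Adam(
    learning_rate=trainer_params["learning_rate"]  # the diff uses the older `lr` alias
)
sequence_length = (
    1 if not trainer_params["use_recurrent"] else trainer_params["sequence_length"]
)
global_step = tf.Variable(0)

# A typical eager update: compute a loss under a GradientTape, apply the
# gradients, then advance the step counter used by decay schedules.
weights = [tf.Variable([1.0, -2.0])]
with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(weights[0]))  # dummy loss for illustration
grads = tape.gradient(loss, weights)
optimizer.apply_gradients(zip(grads, weights))
global_step.assign_add(1)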

            -decay_epsilon,
            decay_epsilon,
        )
-        v_opt_a = tf.math.squared_difference(returns[name], tf.reduce_sum(head, axis=1))
+        v_opt_a = tf.math.squared_difference(
+            returns[name], tf.reduce_sum(head, axis=1)
+        )
        v_opt_b = tf.math.squared_difference(returns[name], clipped_value_estimate)
        value_loss = tf.reduce_mean(
            tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), masks, 2)[1]
        )
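
This hunk is PPO's clipped value loss: the new value estimate is kept within decay_epsilon of the previous one, the larger of the clipped and unclipped squared errors is used, and tf.dynamic_partition drops masked (padded) timesteps before averaging. A minimal self-contained sketch with made-up tensors standing in for returns, the value head, old_value and masks; the clipping line is reconstructed from the usual form of this loss rather than taken from the diff:

import tensorflow as tf

returns = tf.constant([1.0, 0.5, -0.2, 0.0])       # empirical returns (made-up numbers)
head = tf.constant([[0.8], [0.7], [0.1], [0.3]])   # value head output, shape (batch, 1)
old_value = tf.constant([0.9, 0.4, 0.0, 0.2])      # value estimates from the old policy
decay_epsilon = 0.2                                # clip range, decayed over training
masks = tf.constant([1, 1, 1, 0], dtype=tf.int32)  # 0 marks padded timesteps

value_estimate = tf.reduce_sum(head, axis=1)
clipped_value_estimate = old_value + tf.clip_by_value(
    value_estimate - old_value, -decay_epsilon, decay_epsilon
)
v_opt_a = tf.math.squared_difference(returns, value_estimate)
v_opt_b = tf.math.squared_difference(returns, clipped_value_estimate)
# dynamic_partition splits by mask value; partition 1 keeps the unmasked steps
value_loss = tf.reduce_mean(
    tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), masks, 2)[1]
)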

        # For cleaner stats reporting
        # abs_policy_loss = tf.abs(policy_loss)
-        loss = (
-            policy_loss
-            + 0.5 * value_loss
-            - decay_beta * tf.reduce_mean(entropy)
-        )
+        loss = policy_loss + 0.5 * value_loss - decay_beta * tf.reduce_mean(entropy)
        return loss
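
The change above collapses the total loss onto one line without altering the formula: the policy (surrogate) loss, half the value loss, and an entropy bonus scaled by the decaying beta coefficient. A small sketch with stand-in scalars, just to show how the terms combine:

import tensorflow as tf

policy_loss = tf.constant(0.12)         # clipped surrogate loss (stand-in value)
value_loss = tf.constant(0.40)          # clipped value loss (stand-in value)
entropy = tf.constant([1.3, 1.1, 1.2])  # per-timestep policy entropy
decay_beta = 5.0e-3                     # entropy coefficient (stand-in value)

loss = policy_loss + 0.5 * value_loss - decay_beta * tf.reduce_mean(entropy)
print(float(loss))  # ~0.314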
    def create_reward_signals(self, reward_signal_configs):

        # feed_dict[self.model.prev_action] = [
        #     brain_info.previous_vector_actions[idx]
        # ]
-        value_estimates = self.model.get_values(np.expand_dims(brain_info.vector_observations[idx],0))
+        value_estimates = self.model.get_values(
+            np.expand_dims(brain_info.vector_observations[idx], 0)
+        )
        value_estimates = {k: float(v) for k, v in value_estimates.items()}
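
Here the single-agent path adds a batch dimension with np.expand_dims before asking the model for per-reward-signal value estimates, then converts the result to plain Python floats. A minimal sketch with a hypothetical stub standing in for self.model; the real get_values lives on the policy's model class, not on this stub:

import numpy as np

class StubModel:
    # Hypothetical stand-in: returns one value per reward signal name.
    def get_values(self, obs):
        return {"extrinsic": np.float32(obs.mean())}

model = StubModel()
vector_observation = np.zeros(8, dtype=np.float32)  # one agent's observation, shape (8,)
batched = np.expand_dims(vector_observation, 0)     # add batch dimension -> shape (1, 8)

value_estimates = model.get_values(batched)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
print(value_estimates)  # {'extrinsic': 0.0}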
