
Fix PPO regression (#434)

* Fix PPO regression
/develop-generalizationTraining-TrainerController
GitHub · 7 years ago
Current commit: 848b8a58
3 files changed, 15 insertions(+), 10 deletions(-)
  1. python/unitytrainers/models.py (13 changes)
  2. python/unitytrainers/ppo/models.py (6 changes)
  3. python/unitytrainers/ppo/trainer.py (6 changes)

python/unitytrainers/models.py (13 changes)


self.selected_actions = c_layers.one_hot_encoding(self.action_holder, self.a_size)
self.all_old_probs = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32, name='old_probabilities')
- self.probs = tf.reduce_sum(self.all_probs * self.selected_actions, axis=1)
- self.old_probs = tf.reduce_sum(self.all_old_probs * self.selected_actions, axis=1)
+ # We reshape these tensors to [batch x 1] in order to be of the same rank as continuous control probabilities.
+ self.probs = tf.expand_dims(tf.reduce_sum(self.all_probs * self.selected_actions, axis=1), 1)
+ self.old_probs = tf.expand_dims(tf.reduce_sum(self.all_old_probs * self.selected_actions, axis=1), 1)
def create_cc_actor_critic(self, h_size, num_layers):
num_streams = 2

a = tf.exp(-1 * tf.pow(tf.stop_gradient(self.output) - self.mu, 2) / (2 * self.sigma_sq))
b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
self.all_probs = tf.multiply(a, b, name="action_probs")
- self.probs = tf.reduce_prod(self.all_probs, axis=1)
- self.entropy = tf.reduce_sum(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
+ self.entropy = tf.reduce_mean(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
- self.old_probs = tf.reduce_prod(self.all_old_probs, axis=1)
+ # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
+ self.probs = tf.identity(self.all_probs)
+ self.old_probs = tf.identity(self.all_old_probs)
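The reshape to [batch x 1] matters because the advantage placeholder (see ppo/models.py below) now has shape [None, 1]. A rank-1 probability tensor multiplied by a rank-2 advantage would broadcast into a [batch x batch] matrix instead of staying elementwise. A minimal NumPy sketch of that hazard, with made-up shapes and values (not part of the commit):

import numpy as np

batch = 4
advantage = np.ones((batch, 1))           # matches the new [None, 1] 'advantages' placeholder
ratio_1d = np.full(batch, 1.5)            # old discrete probs: rank-1, shape (batch,)
ratio_2d = ratio_1d.reshape(batch, 1)     # new discrete probs: rank-2, shape (batch, 1)

# Rank-1 ratio against a (batch, 1) advantage broadcasts to (batch, batch),
# silently mixing every sample with every other one.
print((ratio_1d * advantage).shape)       # (4, 4)
# Rank-2 ratio keeps the product elementwise, one value per sample.
print((ratio_2d * advantage).shape)       # (4, 1)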
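The entropy term also changes from reduce_sum to reduce_mean over the Gaussian action dimensions, so the entropy bonus stays on a per-action scale instead of growing with the size of the action space. A quick illustration with made-up variances (not from the commit):

import numpy as np

# Per-dimension entropy of a diagonal Gaussian: 0.5 * log(2*pi*e*sigma^2)
sigma_sq = np.ones(4)                     # 4 action dimensions, unit variance (made-up)
per_dim = 0.5 * np.log(2 * np.pi * np.e * sigma_sq)

print(per_dim.sum())    # ~5.68: old reduce_sum grows with the number of actions
print(per_dim.mean())   # ~1.42: new reduce_mean stays on a per-dimension scale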

python/unitytrainers/ppo/models.py (6 changes)


"""
self.returns_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='discounted_rewards')
- self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name='advantages')
+ self.advantage = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='advantages')
self.learning_rate = tf.train.polynomial_decay(lr, self.global_step, max_step, 1e-10, power=1.0)
self.old_value = tf.placeholder(shape=[None], dtype=tf.float32, name='old_value_estimates')

v_opt_b = tf.squared_difference(self.returns_holder, clipped_value_estimate)
self.value_loss = tf.reduce_mean(tf.boolean_mask(tf.maximum(v_opt_a, v_opt_b), self.mask))
+ # Here we calculate PPO policy loss. In continuous control this is done independently for each action gaussian
+ # and then averaged together. This provides significantly better performance than treating the probability
+ # as an average of probabilities, or as a joint probability.
self.loss = self.policy_loss + 0.5 * self.value_loss - decay_beta * tf.reduce_mean(
tf.boolean_mask(entropy, self.mask))
self.update_batch = optimizer.minimize(self.loss)
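For reference, the added comment describes the clipped PPO surrogate being built from per-gaussian probabilities and then averaged. Below is a simplified TF 1.x sketch of that form of loss; the placeholder sizes and the epsilon value are assumptions, and the sequence mask used in the real model is omitted:

import tensorflow as tf

act_size = 2
probs = tf.placeholder(tf.float32, [None, act_size], name='probs')          # per-gaussian probabilities
old_probs = tf.placeholder(tf.float32, [None, act_size], name='old_probs')
advantage = tf.placeholder(tf.float32, [None, 1], name='advantages')        # matches the new placeholder shape
epsilon = 0.2                                                                # assumed clip range

# Per-dimension probability ratio; the [None, 1] advantage broadcasts across action dimensions.
r_theta = probs / (old_probs + 1e-10)
p_opt_a = r_theta * advantage
p_opt_b = tf.clip_by_value(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
# Averaging the per-gaussian surrogate rather than collapsing it into a joint probability.
policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))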

python/unitytrainers/ppo/trainer.py (6 changes)


total_v, total_p = 0, 0
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(
- (advantages - advantages.mean()) / advantages.std() + 1e-10)
+ (advantages - advantages.mean()) / (advantages.std() + 1e-10))
for k in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):

self.model.returns_holder: np.array(_buffer['discounted_returns'][start:end]).reshape(
[-1]),
self.model.old_value: np.array(_buffer['value_estimates'][start:end]).reshape([-1]),
- self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1]),
+ self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1, 1]),
self.model.all_old_probs: np.array(
_buffer['action_probs'][start:end]).reshape([-1, self.brain.vector_action_space_size])}
if self.is_continuous:

_obs = np.array(_buffer['observations%d' % i][start:end])
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
# Memories are zeros
# feed_dict[self.model.memory_in] = np.zeros([batch_size, self.m_size])
feed_dict[self.model.memory_in] = np.array(_buffer['memory'][start:end])[:, 0, :]
v_loss, p_loss, _ = self.sess.run(
[self.model.value_loss, self.model.policy_loss,
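The trainer change to the advantage normalization is purely a precedence fix: the 1e-10 epsilon was added to the already-divided result instead of to the standard deviation, so a zero-variance batch still divided by zero. A small NumPy illustration with a made-up batch (NumPy emits a RuntimeWarning on the invalid division):

import numpy as np

advantages = np.array([2.0, 2.0, 2.0])    # zero-variance batch, so advantages.std() == 0
centered = advantages - advantages.mean()

old = centered / advantages.std() + 1e-10       # 0 / 0 -> nan; the epsilon arrives too late
new = centered / (advantages.std() + 1e-10)     # epsilon guards the denominator

print(old)    # [nan nan nan]
print(new)    # [0. 0. 0.]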
