
Fix PPO regression (#434)

* Fix PPO regression
/develop-generalizationTraining-TrainerController
GitHub · 7 years ago
Current commit: 848b8a58
3 files changed, 15 insertions(+), 10 deletions(-)
  1. python/unitytrainers/models.py (13 changes)
  2. python/unitytrainers/ppo/models.py (6 changes)
  3. python/unitytrainers/ppo/trainer.py (6 changes)

python/unitytrainers/models.py (13 changes)


self.selected_actions = c_layers.one_hot_encoding(self.action_holder, self.a_size)
self.all_old_probs = tf.placeholder(shape=[None, self.a_size], dtype=tf.float32, name='old_probabilities')
- self.probs = tf.reduce_sum(self.all_probs * self.selected_actions, axis=1)
- self.old_probs = tf.reduce_sum(self.all_old_probs * self.selected_actions, axis=1)
+ # We reshape these tensors to [batch x 1] in order to be of the same rank as continuous control probabilities.
+ self.probs = tf.expand_dims(tf.reduce_sum(self.all_probs * self.selected_actions, axis=1), 1)
+ self.old_probs = tf.expand_dims(tf.reduce_sum(self.all_old_probs * self.selected_actions, axis=1), 1)
def create_cc_actor_critic(self, h_size, num_layers):
num_streams = 2

a = tf.exp(-1 * tf.pow(tf.stop_gradient(self.output) - self.mu, 2) / (2 * self.sigma_sq))
b = 1 / tf.sqrt(2 * self.sigma_sq * np.pi)
self.all_probs = tf.multiply(a, b, name="action_probs")
- self.probs = tf.reduce_prod(self.all_probs, axis=1)
- self.entropy = tf.reduce_sum(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
+ self.entropy = tf.reduce_mean(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
- self.old_probs = tf.reduce_prod(self.all_old_probs, axis=1)
+ # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
+ self.probs = tf.identity(self.all_probs)
+ self.old_probs = tf.identity(self.all_old_probs)
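The reshape to [batch x 1] matters because the advantage placeholder (see ppo/models.py below) now has shape [None, 1]. A rank-1 probability tensor multiplied by a rank-2 advantage would broadcast into a [batch x batch] matrix instead of staying elementwise. A minimal NumPy sketch of that hazard, with made-up shapes and values (not part of the commit):

import numpy as np

batch = 4
advantage = np.ones((batch, 1))           # matches the new [None, 1] 'advantages' placeholder
ratio_1d = np.full(batch, 1.5)            # old discrete probs: rank-1, shape (batch,)
ratio_2d = ratio_1d.reshape(batch, 1)     # new discrete probs: rank-2, shape (batch, 1)

# Rank-1 ratio against a (batch, 1) advantage broadcasts to (batch, batch),
# silently mixing every sample with every other one.
print((ratio_1d * advantage).shape)       # (4, 4)
# Rank-2 ratio keeps the product elementwise, one value per sample.
print((ratio_2d * advantage).shape)       # (4, 1)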
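The entropy term also changes from reduce_sum to reduce_mean over the Gaussian action dimensions, so the entropy bonus stays on a per-action scale instead of growing with the size of the action space. A quick illustration with made-up variances (not from the commit):

import numpy as np

# Per-dimension entropy of a diagonal Gaussian: 0.5 * log(2*pi*e*sigma^2)
sigma_sq = np.ones(4)                     # 4 action dimensions, unit variance (made-up)
per_dim = 0.5 * np.log(2 * np.pi * np.e * sigma_sq)

print(per_dim.sum())    # ~5.68: old reduce_sum grows with the number of actions
print(per_dim.mean())   # ~1.42: new reduce_mean stays on a per-dimension scale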

python/unitytrainers/ppo/models.py (6 changes)


"""
self.returns_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='discounted_rewards')
- self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name='advantages')
+ self.advantage = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='advantages')
self.learning_rate = tf.train.polynomial_decay(lr, self.global_step, max_step, 1e-10, power=1.0)
self.old_value = tf.placeholder(shape=[None], dtype=tf.float32, name='old_value_estimates')

v_opt_b = tf.squared_difference(self.returns_holder, clipped_value_estimate)
self.value_loss = tf.reduce_mean(tf.boolean_mask(tf.maximum(v_opt_a, v_opt_b), self.mask))
+ # Here we calculate PPO policy loss. In continuous control this is done independently for each action gaussian
+ # and then averaged together. This provides significantly better performance than treating the probability
+ # as an average of probabilities, or as a joint probability.
self.loss = self.policy_loss + 0.5 * self.value_loss - decay_beta * tf.reduce_mean(
tf.boolean_mask(entropy, self.mask))
self.update_batch = optimizer.minimize(self.loss)
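For reference, the added comment describes the clipped PPO surrogate being built from per-gaussian probabilities and then averaged. Below is a simplified TF 1.x sketch of that form of loss; the placeholder sizes and the epsilon value are assumptions, and the sequence mask used in the real model is omitted:

import tensorflow as tf

act_size = 2
probs = tf.placeholder(tf.float32, [None, act_size], name='probs')          # per-gaussian probabilities
old_probs = tf.placeholder(tf.float32, [None, act_size], name='old_probs')
advantage = tf.placeholder(tf.float32, [None, 1], name='advantages')        # matches the new placeholder shape
epsilon = 0.2                                                                # assumed clip range

# Per-dimension probability ratio; the [None, 1] advantage broadcasts across action dimensions.
r_theta = probs / (old_probs + 1e-10)
p_opt_a = r_theta * advantage
p_opt_b = tf.clip_by_value(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
# Averaging the per-gaussian surrogate rather than collapsing it into a joint probability.
policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))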

python/unitytrainers/ppo/trainer.py (6 changes)


total_v, total_p = 0, 0
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(
- (advantages - advantages.mean()) / advantages.std() + 1e-10)
+ (advantages - advantages.mean()) / (advantages.std() + 1e-10))
for k in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):

self.model.returns_holder: np.array(_buffer['discounted_returns'][start:end]).reshape(
[-1]),
self.model.old_value: np.array(_buffer['value_estimates'][start:end]).reshape([-1]),
- self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1]),
+ self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1, 1]),
self.model.all_old_probs: np.array(
_buffer['action_probs'][start:end]).reshape([-1, self.brain.vector_action_space_size])}
if self.is_continuous:

_obs = np.array(_buffer['observations%d' % i][start:end])
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
# Memories are zeros
# feed_dict[self.model.memory_in] = np.zeros([batch_size, self.m_size])
feed_dict[self.model.memory_in] = np.array(_buffer['memory'][start:end])[:, 0, :]
v_loss, p_loss, _ = self.sess.run(
[self.model.value_loss, self.model.policy_loss,
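The trainer change to the advantage normalization is purely a precedence fix: the 1e-10 epsilon was added to the already-divided result instead of to the standard deviation, so a zero-variance batch still divided by zero. A small NumPy illustration with a made-up batch (NumPy emits a RuntimeWarning on the invalid division):

import numpy as np

advantages = np.array([2.0, 2.0, 2.0])    # zero-variance batch, so advantages.std() == 0
centered = advantages - advantages.mean()

old = centered / advantages.std() + 1e-10       # 0 / 0 -> nan; the epsilon arrives too late
new = centered / (advantages.std() + 1e-10)     # epsilon guards the denominator

print(old)    # [nan nan nan]
print(new)    # [0. 0. 0.]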
