
Remove unused "last reward" logic, TF nodes

At each step, an unused `last_reward` variable in the TF graph is
updated in our PPO trainer.  There are also related unused methods
in various places in the codebase.  This change removes them.
/develop-generalizationTraining-TrainerController
Jonathan Harper, 5 years ago
Commit 177ee5b8
7 files changed, 9 insertions(+), 62 deletions(-)
  1. ml-agents/mlagents/trainers/bc/trainer.py (15 changes)
  2. ml-agents/mlagents/trainers/ppo/models.py (13 changes)
  3. ml-agents/mlagents/trainers/ppo/policy.py (16 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)
  5. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changes)
  6. ml-agents/mlagents/trainers/trainer.py (16 changes)
  7. ml-agents/mlagents/trainers/trainer_controller.py (2 changes)
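For readers less familiar with the TF1 idiom this commit deletes: the removed create_reward_encoder (see the models.py hunk below) builds a non-trainable variable, a placeholder, and an assign op, and the PPO trainer ran that assign op once per step even though nothing else in the graph ever read last_reward. A minimal standalone sketch of the pattern, assuming TensorFlow 1.x; it mirrors the removed code but is not a verbatim copy:

import tensorflow as tf  # assumes TensorFlow 1.x (tf.placeholder / tf.assign / tf.Session)

# Non-trainable variable plus placeholder plus assign op: the usual TF1 way to
# push a Python scalar into the graph.
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
update_reward = tf.assign(last_reward, new_reward)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # This is the extra session call the trainer made every step; since no other
    # op consumes last_reward, it is pure overhead.
    sess.run(update_reward, feed_dict={new_reward: 1.5})
    print(sess.run(last_reward))  # 1.5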

ml-agents/mlagents/trainers/bc/trainer.py (15 changes)

         """
         return self.policy.get_current_step()

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        if len(self.stats["Environment/Cumulative Reward"]) > 0:
-            return np.mean(self.stats["Environment/Cumulative Reward"])
-        else:
-            return 0
-
-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
         self.policy.increment_step()
         return

ml-agents/mlagents/trainers/ppo/models.py (13 changes)

         )
         if num_layers < 1:
             num_layers = 1
-        self.last_reward, self.new_reward, self.update_reward = (
-            self.create_reward_encoder()
-        )
         if brain.vector_action_space_type == "continuous":
             self.create_cc_actor_critic(h_size, num_layers)
             self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
...
             lr,
             max_step,
         )

-    @staticmethod
-    def create_reward_encoder():
-        """Creates TF ops to track and increment recent average cumulative reward."""
-        last_reward = tf.Variable(
-            0, name="last_reward", trainable=False, dtype=tf.float32
-        )
-        new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
-        update_reward = tf.assign(last_reward, new_reward)
-        return last_reward, new_reward, update_reward
-
     def create_losses(
         self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
ml-agents/mlagents/trainers/ppo/policy.py (16 changes)

             value=mean_values,
             outputs=run_out,
         )

-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        return self.sess.run(self.model.last_reward)
-
-    def update_reward(self, new_reward):
-        """
-        Updates reward value for policy.
-        :param new_reward: New reward to save.
-        """
-        self.sess.run(
-            self.model.update_reward, feed_dict={self.model.new_reward: new_reward}
-        )
-

ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)

         """
         return self._reward_buffer

-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
-        if self.stats["Environment/Cumulative Reward"]:
-            mean_reward = np.mean(self.stats["Environment/Cumulative Reward"])
-            self.policy.update_reward(mean_reward)
         self.policy.increment_step()
         self.step = self.policy.get_current_step()
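To spell out the behavioural difference in this hunk: the old method computed the mean of the cumulative-reward stats and pushed it into the TF graph before stepping, while the new one only advances the step counter. A small self-contained sketch reconstructed from the hunks above; _PolicyStub is a hypothetical stand-in, not the project's PPOPolicy:

import numpy as np

class _PolicyStub:
    """Hypothetical stand-in exposing only the calls used below."""
    def __init__(self):
        self.step = 0
    def update_reward(self, mean_reward):
        pass  # in the removed code this triggered a sess.run of the tf.assign op
    def increment_step(self):
        self.step += 1
    def get_current_step(self):
        return self.step

def increment_step_and_update_last_reward(stats, policy):
    # Old per-step path (removed): also pushes the mean reward into the graph,
    # where nothing ever reads it back.
    if stats["Environment/Cumulative Reward"]:
        policy.update_reward(np.mean(stats["Environment/Cumulative Reward"]))
    policy.increment_step()
    return policy.get_current_step()

def increment_step(policy):
    # New per-step path: only the step counter is advanced.
    policy.increment_step()
    return policy.get_current_step()

stats = {"Environment/Cumulative Reward": [0.5, 1.0, 1.5]}
print(increment_step(_PolicyStub()))  # 1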

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changes)

     )
     trainer_mock.update_policy.assert_called_once()
     trainer_mock.write_summary.assert_called_once()
-    trainer_mock.increment_step_and_update_last_reward.assert_called_once()
+    trainer_mock.increment_step.assert_called_once()
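The assertion above leans on unittest.mock auto-creating attributes on a MagicMock, so renaming the trainer method only requires renaming the assertion. A minimal sketch of that pattern (the direct call here stands in for the trainer controller driving the mock):

from unittest.mock import MagicMock

trainer_mock = MagicMock()

# In the real test the trainer controller calls this; invoking it directly here
# just shows what the assertion verifies.
trainer_mock.increment_step()

trainer_mock.increment_step.assert_called_once()
trainer_mock.increment_step_and_update_last_reward.assert_not_called()  # old name is gone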

ml-agents/mlagents/trainers/trainer.py (16 changes)

         """
         raise UnityTrainerException("The get_step property was not implemented.")

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        raise UnityTrainerException("The get_last_reward property was not implemented.")
-
-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and updates the last reward
+        Increment the step count of the trainer
         """
-        raise UnityTrainerException(
-            "The increment_step_and_update_last_reward method was not implemented."
-        )
+        raise UnityTrainerException("The increment_step method was not implemented.")

     def get_action(self, curr_info: BrainInfo) -> ActionInfo:
         """

ml-agents/mlagents/trainers/trainer_controller.py (2 changes)

             else:
                 trainer.write_summary(self.global_step, delta_train_start)
             if self.train_model and trainer.get_step <= trainer.get_max_steps:
-                trainer.increment_step_and_update_last_reward()
+                trainer.increment_step()
         return new_info