Normalize observations when adding experiences (#2556)

* Normalize observations when adding experiences This change moves normalization of vector observations into the trainer's "add_experiences" interface. Prior to this change, normalization occurred at inference time. This was somewhat confusing since usually executing a forward pass shouldn't have side-effects which would change the training step. Also, in a asynchronous or distributed setting where we copy the neural network weights from a trainer to a remote actor / inference worker we'd end up with training issues because of the weights being different on the trainer than the workers.
5 年前 · 832e4a47
--- a/ml-agents/mlagents/trainers/ppo/policy.py
+++ b/ml-agents/mlagents/trainers/ppo/policy.py
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
-        if (
-            is_training
-            and self.use_vec_obs
-            and trainer_params["normalize"]
-            and not load
-        ):
-            self.inference_dict["update_mean"] = self.model.update_normalization

        self.total_policy_loss = self.model.abs_policy_loss
        self.update_dict.update(
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
        """
        info = new_info[self.brain_name]
+        if self.is_training:
+            self.policy.update_normalization(info.vector_observations)
        for l in range(len(info.agents)):
            agent_actions = self.training_buffer[info.agents[l]]["actions"]
            if (
--- a/ml-agents/mlagents/trainers/sac/policy.py
+++ b/ml-agents/mlagents/trainers/sac/policy.py
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
-        if (
-            is_training
-            and self.use_vec_obs
-            and trainer_params["normalize"]
-            and not load
-        ):
-            self.inference_dict["update_mean"] = self.model.update_normalization

        self.update_dict.update(
            {
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
        """
        info = new_info[self.brain_name]
+        if self.is_training:
+            self.policy.update_normalization(info.vector_observations)
        for l in range(len(info.agents)):
            agent_actions = self.training_buffer[info.agents[l]]["actions"]
            if (
--- a/ml-agents/mlagents/trainers/tf_policy.py
+++ b/ml-agents/mlagents/trainers/tf_policy.py
        self.seed = seed
        self.brain = brain
        self.use_recurrent = trainer_parameters["use_recurrent"]
+        self.normalize = trainer_parameters.get("normalize", False)
        self.use_continuous_act = brain.vector_action_space_type == "continuous"
        self.model_path = trainer_parameters["model_path"]
        self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
        for n in nodes:
            logger.info("\t" + n)
        return nodes
+
+    def update_normalization(self, vector_obs: np.ndarray) -> None:
+        """
+        If this policy normalizes vector observations, this will update the norm values in the graph.
+        :param vector_obs: The vector observations to add to the running estimate of the distribution.
+        """
+        if self.use_vec_obs and self.normalize:
+            self.sess.run(
+                self.model.update_normalization,
+                feed_dict={self.model.vector_in: vector_obs},
+            )

    @property
    def vis_obs_size(self):