        load: bool,
        tanh_squash: bool = False,
        resample: bool = False,
        condition_sigma_on_obs: bool = True,
        create_tf_graph: bool = True,
    ):
        """

        )

        self.tanh_squash = tanh_squash
        self.resample = resample
        self.condition_sigma_on_obs = condition_sigma_on_obs
        self.trainable_variables: List[tf.Variable] = []
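        # Trainable TF variables for this policy; starts empty and is filled in
        # once the graph has been created.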

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.

                self.vis_encode_type,
                self.tanh_squash,
                self.resample,
                self.condition_sigma_on_obs,
            )
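        # Discrete action spaces get a separate discrete-control actor instead.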
        else:
            self._create_dc_actor(

        vis_encode_type: EncoderType,
        tanh_squash: bool = False,
        resample: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.

            reuse=tf.AUTO_REUSE,
        )

        # Policy-dependent log_sigma
        if condition_sigma_on_obs:
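            # log-std is produced by a dense layer on hidden_policy, so the
            # action noise can vary with the current observation.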
            log_sigma = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_std",
                kernel_initializer=LearningModel.scaled_init(0.01),
            )
        else:
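            # log-std is a single trainable vector (one entry per action
            # dimension), shared across all observations.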
            log_sigma = tf.get_variable(
                "log_sigma_squared",
                [self.act_size[0]],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
            )
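        # Clip log-std so that tf.exp below cannot produce a degenerate
        # (near-zero) or exploding standard deviation.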
        log_sigma = tf.clip_by_value(log_sigma, self.log_std_min, self.log_std_max)

        sigma = tf.exp(log_sigma)
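        # sigma is the standard deviation of the Gaussian over continuous actions;
        # tanh_squash indicates whether sampled actions are then squashed through
        # tanh (handled outside this block).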
|