from typing import Any, Dict

from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer


class PPOOptimizer(TFOptimizer):
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
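        # Usage sketch (an illustrative assumption, not taken from this file): the
        # PPO trainer is expected to build a TFPolicy first and then wrap it in this
        # optimizer, e.g.
        #
        #   trainer_params = {"learning_rate": 3.0e-4, "hidden_units": 128,
        #                     "num_layers": 2, "beta": 5.0e-3, "epsilon": 0.2,
        #                     "max_steps": 5.0e5}   # key names/values are assumptions
        #   optimizer = PPOOptimizer(policy, trainer_params)
        #
        # After construction, the optimizer's value estimator and PPO loss live in
        # the policy's graph under the "optimizer/" scope set up below.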
        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
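                # The hyperparameters used below are drawn from trainer_params:
                # the learning rate (lr) and its decay schedule (lr_schedule), the
                # size (h_size) and number (num_layers) of hidden layers, the type
                # of visual encoder (vis_encode_type), the policy-divergence
                # threshold (epsilon), the strength of entropy regularization
                # (beta), and the total number of training steps (max_step).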
                self.create_dc_critic(h_size, num_layers, vis_encode_type)
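                # With a "linear" schedule, create_learning_rate is expected to decay
                # the rate from lr toward zero as global_step approaches max_step,
                # while a "constant" schedule keeps it fixed (an assumption based on
                # the schedule names).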
                self.learning_rate = LearningModel.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step)
                )
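                # create_losses assembles the PPO objective from the new and old
                # action log-probabilities. In standard PPO the policy term is the
                # clipped surrogate
                #     L_clip = E[ min(r_t * A_t, clip(r_t, 1 - epsilon, 1 + epsilon) * A_t) ]
                # where r_t is the new/old probability ratio, A_t the advantage,
                # epsilon the policy-divergence threshold, and beta scales an added
                # entropy bonus; the loss built here is assumed to follow that form.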
                self.create_losses(
                    self.policy.log_probs,

    def create_cc_critic(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
    ) -> None:
        """
        Creates the Continuous control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
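        # create_observation_streams is expected to encode the policy's visual and
        # vector observations into hidden feature streams; the single stream built
        # here feeds the critic's value head(s) (an assumption based on the method
        # and variable names).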
        hidden_stream = LearningModel.create_observation_streams(
            self.policy.visual_in,

    def create_dc_critic(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType
    ) -> None:
        """
        Creates the Discrete control critic (value) network.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = LearningModel.create_observation_streams(
            self.policy.visual_in,