
Fix comments for PPO

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit
4871f49c
4 files changed, with 24 insertions and 20 deletions
  1. ml-agents/mlagents/trainers/common/nn_policy.py (12 changes)
  2. ml-agents/mlagents/trainers/optimizer.py (4 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (27 changes)
  4. ml-agents/mlagents/trainers/tf_policy.py (1 change)

ml-agents/mlagents/trainers/common/nn_policy.py (12 changes)


        resample: bool = False,
    ):
        """
-       Policy for Proximal Policy Optimization Networks.
+       Policy that uses a multilayer perceptron to map the observations to actions. Could
+       also use a CNN to encode visual input prior to the MLP. Supports discrete and
+       continuous action spaces, as well as recurrent networks.
-       :param brain: Assigned Brain object.
+       :param brain: Assigned BrainParameters object.
        :param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
        :param resample: Whether we are using the resampling trick to update the policy in continuous output.
        """
        with tf.variable_scope("policy"):
            super().__init__(seed, brain, trainer_params, load)

        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param resample: Whether we are using the resampling trick to update the policy.
        """
        hidden_stream = LearningModel.create_observation_streams(
            self.visual_in,

        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        """
        hidden_stream = LearningModel.create_observation_streams(
            self.visual_in,
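The tanh_squash and resample parameters documented above control how the continuous-action output is produced: tanh squashing bounds actions instead of clipping them, and "resampling" refers to the reparameterization trick, where the action is written as mean plus scaled noise. The snippet below is only a rough NumPy illustration of those two ideas under that reading; the function name and the clipping bound are invented for the example and are not part of ML-Agents.

import numpy as np

def sample_continuous_action(mu, log_sigma, tanh_squash=False, resample=False, rng=None):
    """Illustrative only: Gaussian action sampling with optional tanh squash."""
    rng = rng if rng is not None else np.random.default_rng()
    sigma = np.exp(log_sigma)
    if resample:
        # Reparameterization ("resampling") trick: draw the noise separately so that,
        # in a real computation graph, gradients could flow through mu and sigma.
        epsilon = rng.standard_normal(np.shape(mu))
        raw_action = mu + sigma * epsilon
    else:
        raw_action = rng.normal(mu, sigma)
    if tanh_squash:
        # Bound the action to (-1, 1) with tanh instead of clipping afterwards.
        return np.tanh(raw_action)
    # Clipped-output alternative; the bound here is arbitrary for the sketch.
    return np.clip(raw_action, -3.0, 3.0)

print(sample_continuous_action(np.zeros(2), np.zeros(2), tanh_squash=True, resample=True))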

ml-agents/mlagents/trainers/optimizer.py (4 changes)


    def __init__(self, policy: Policy):
        """
        Create loss functions and auxiliary networks.
        :param policy: Policy object that is updated by the Optimizer.
        """
        pass

        Update the Policy based on the batch that was passed in.
        :param batch: AgentBuffer that contains the minibatch of data used for this update.
        :param num_sequences: Number of recurrent sequences found in the minibatch.
        :return: A Dict containing statistics (name, value) from the update (e.g. loss)
        """
        pass
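Read together with the constructor docstring above, this defines the Optimizer contract: construction wires loss functions and auxiliary networks around a policy, and update() consumes a minibatch and returns named statistics. The class below is a hypothetical stand-in that only illustrates that interface shape (the batch is typed loosely instead of as an AgentBuffer); it is not the ML-Agents implementation.

from typing import Any, Dict


class SketchOptimizer:
    """Hypothetical stand-in for the interface described above."""

    def __init__(self, policy: Any) -> None:
        # A real optimizer would build its loss tensors and auxiliary networks here.
        self.policy = policy

    def update(self, batch: Dict[str, Any], num_sequences: int) -> Dict[str, float]:
        # A real optimizer would feed `batch` into its graph and run a train op;
        # the returned dict maps statistic names to values (e.g. losses).
        return {"Losses/Policy Loss": 0.0, "Losses/Value Loss": 0.0}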

ml-agents/mlagents/trainers/ppo/optimizer.py (27 changes)


from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer

class PPOOptimizer(TFOptimizer):
-   def __init__(self, policy, trainer_params):
+   def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
-       Takes a Unity environment and model-specific hyper-parameters and returns the
-       appropriate PPO agent model for the environment.
-       :param brain: brain parameters used to generate specific network graph.
-       :param lr: Learning rate.
-       :param lr_schedule: Learning rate decay schedule.
-       :param h_size: Size of hidden layers
-       :param epsilon: Value for policy-divergence threshold.
-       :param beta: Strength of entropy regularization.
-       :param max_step: Total number of training steps.
-       :param normalize: Whether to normalize vector observation input.
-       :param use_recurrent: Whether to use an LSTM layer in the network.
-       :param num_layers Number of hidden layers between encoded input and policy & value layers
-       :param m_size: Size of brain memory.
-       :param seed: Seed to use for initialization of model.
-       :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
-       :return: a sub-class of PPOAgent tailored to the environment.
+       Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
+       The PPO optimizer has a value estimator and a loss function.
+       :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
+       :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
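The replacement docstring states that the PPO optimizer owns a value estimator and a loss function, and the removed docstring's epsilon was the "policy-divergence threshold" used to clip that loss. As a point of reference only, here is a small NumPy sketch of the clipped surrogate objective that a PPO policy loss is built around; it is a generic illustration, not the TensorFlow graph code this diff touches.

import numpy as np

def ppo_policy_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    """Generic clipped PPO surrogate loss (illustrative, minimization form)."""
    ratio = np.exp(new_log_probs - old_log_probs)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # Take the elementwise minimum of the clipped and unclipped objectives,
    # then negate so that a gradient-descent step improves the policy.
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))

advantages = np.array([1.0, -0.5])
print(ppo_policy_loss(np.log([0.6, 0.3]), np.log([0.5, 0.4]), advantages))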

                    self.create_dc_critic(h_size, num_layers, vis_encode_type)
                self.learning_rate = LearningModel.create_learning_rate(
-                   lr_schedule, lr, self.policy.global_step, max_step
+                   lr_schedule, lr, self.policy.global_step, int(max_step)
                )
                self.create_losses(
                    self.policy.log_probs,
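The create_learning_rate call above (now given int(max_step)) builds the decay schedule as a graph op. For intuition, a linear schedule of this kind interpolates from the initial rate toward (almost) zero over max_step training steps; the plain-Python helper below is a hypothetical rendering of that idea, with the floor value chosen arbitrarily for the sketch.

def linear_learning_rate(initial_lr: float, global_step: int, max_step: int, floor: float = 1e-10) -> float:
    """Hypothetical linear decay from initial_lr toward a small floor over max_step steps."""
    progress = min(max(global_step / float(max_step), 0.0), 1.0)
    return max(initial_lr * (1.0 - progress), floor)

# Halfway through training, the rate is roughly half of its initial value.
print(linear_learning_rate(3.0e-4, global_step=250_000, max_step=500_000))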

        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = LearningModel.create_observation_streams(
            self.policy.visual_in,

        Creates Discrete control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: The type of visual encoder to use.
        """
        hidden_stream = LearningModel.create_observation_streams(
            self.policy.visual_in,
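Both critic builders above turn the shared observation encoding into value estimates, one head per reward-signal stream (the stream_names of the removed docstring). The toy function below only mimics that shape of computation, with random weights standing in for the trained value layers; names and shapes are invented for the illustration.

import numpy as np

def value_heads(hidden_stream, stream_names, rng=None):
    """Hypothetical: one scalar value estimate per reward-signal stream."""
    rng = rng if rng is not None else np.random.default_rng(0)
    return {
        # In the real graph each head is a trainable dense layer on the shared
        # hidden stream; a random projection stands in for it here.
        name: float(hidden_stream @ rng.standard_normal(np.shape(hidden_stream)[-1]))
        for name in stream_names
    }

print(value_heads(np.ones(4), ["extrinsic", "curiosity"]))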

ml-agents/mlagents/trainers/tf_policy.py (1 change)


        self.update_normalization_op: Optional[tf.Operation] = None
        self.value: Optional[tf.Tensor] = None
        self.all_log_probs: Optional[tf.Tensor] = None
        self.log_probs: Optional[tf.Tensor] = None
        self.entropy: Optional[tf.Tensor] = None
        self.action_oh: tf.Tensor = None
        self.output_pre: Optional[tf.Tensor] = None
