
Add docstring and make some methods private

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit: 7d5c1b0b
3 files changed, 21 insertions(+), 17 deletions(-)
  1. ml-agents/mlagents/trainers/common/nn_policy.py (10 changes)
  2. ml-agents/mlagents/trainers/common/optimizer.py (9 changes)
  3. ml-agents/mlagents/trainers/tf_optimizer.py (19 changes)

ml-agents/mlagents/trainers/common/nn_policy.py (10 changes)


     self.create_input_placeholders()
     if self.use_continuous_act:
-        self.create_cc_actor(
+        self._create_cc_actor(
             self.h_size,
             self.num_layers,
             self.vis_encode_type,

     else:
-        self.create_dc_actor(self.h_size, self.num_layers, self.vis_encode_type)
+        self._create_dc_actor(
+            self.h_size, self.num_layers, self.vis_encode_type
+        )
     self.trainable_variables = tf.get_collection(
         tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
     )

         run_out = self._execute_model(feed_dict, self.inference_dict)
         return run_out

-    def create_cc_actor(
+    def _create_cc_actor(
         self,
         h_size: int,
         num_layers: int,

             shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
         )

-    def create_dc_actor(
+    def _create_dc_actor(
         self, h_size: int, num_layers: int, vis_encode_type: EncoderType
     ) -> None:
         """

ml-agents/mlagents/trainers/common/optimizer.py (9 changes)


from typing import Dict
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.policy import Policy


class Optimizer(abc.ABC):

    """

    @abc.abstractmethod
    def __init__(self, policy: Policy):
        """
        Create loss functions and auxiliary networks.
        :param policy: Policy object that is updated by the Optimizer
        """
        pass

    @abc.abstractmethod
    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
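
For context on what this abstract interface requires of subclasses, here is a hedged sketch of a minimal concrete optimizer; the class name, the constant loss, and the returned stat key are made up, and the import paths simply mirror the module paths shown in this diff:

from typing import Dict

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.common.optimizer import Optimizer
from mlagents.trainers.policy import Policy


class ConstantLossOptimizer(Optimizer):
    def __init__(self, policy: Policy):
        # A real optimizer would build its loss functions and auxiliary
        # networks here, as the abstract __init__ docstring describes.
        super().__init__(policy)
        self.policy = policy

    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        # A real optimizer runs a training step on `batch`; the return value
        # is a mapping of stat names to floats for logging.
        return {"Losses/Dummy Loss": 0.0}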

ml-agents/mlagents/trainers/tf_optimizer.py (19 changes)


class TFOptimizer(Optimizer):  # pylint: disable=W0223
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        super().__init__(policy)
        self.sess = policy.sess
        self.policy = policy
        self.update_dict: Dict[str, tf.Tensor] = {}

    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        """
        Gets the value estimates for an entire trajectory of samples, except for the final one.
        :param batch: An AgentBuffer that represents the trajectory. Assumed to be created from
            Trajectory.to_agentbuffer().
        :param next_obs: The next observation after the trajectory is Done, for use with bootstrapping.
        :param done: Whether or not this trajectory is terminal, in which case the value estimate will be 0.
        :returns: Two Dicts that represent the trajectory value estimates for each reward signal
            (str to np.ndarray) and the final value estimate of the next_obs for each reward signal
            (str to float).
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.batch_size_ph: batch.num_experiences,
            self.policy.sequence_length_ph: batch.num_experiences,  # Feed data batch-wise, not time-wise.

         # We do this in a separate step to feed the memory outs - a further optimization would
         # be to append to the obs before running sess.run.
-        final_value_estimates = self.get_value_estimates(
+        final_value_estimates = self._get_value_estimates(

-    def get_value_estimates(
+    def _get_value_estimates(
         self,
         next_obs: List[np.ndarray],
         done: bool,

     ) -> Dict[str, float]:
         """
-        Generates value estimates for bootstrapping.
+        Generates value estimates for bootstrapping. Called by get_trajectory_value_estimates.
         :param policy_memory: Memory output of the policy at the prior timestep.
         :param value_memory: Memory output of the value network at the prior timestep.
         :param prev_action: The last action before this observation.
         :return: The value estimate dictionary with key being the name of the reward signal and the value the
             corresponding value estimate.
         """
