
Separate out optimizer creation and policy graph creation (#3355)

/develop/nopreviousactions
GitHub, 5 years ago
Current commit: dd86e879
7 files changed, 75 insertions(+), 41 deletions(-)
  1. ml-agents/mlagents/trainers/common/nn_policy.py (53 changed lines)
  2. ml-agents/mlagents/trainers/optimizer.py (3 changed lines)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (5 changed lines)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)
  5. ml-agents/mlagents/trainers/sac/optimizer.py (9 changed lines)
  6. ml-agents/mlagents/trainers/sac/trainer.py (1 changed line)
  7. ml-agents/mlagents/trainers/tf_policy.py (44 changed lines)

ml-agents/mlagents/trainers/common/nn_policy.py (53 changed lines)


     load: bool,
     tanh_squash: bool = False,
     resample: bool = False,
+    create_tf_graph: bool = True,
 ):
     """
     Policy that uses a multilayer perceptron to map the observations to actions. Could

     :param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
     :param resample: Whether we are using the resampling trick to update the policy in continuous output.
     """
-    with tf.variable_scope("policy/"):
-        super().__init__(seed, brain, trainer_params, load)
+    super().__init__(seed, brain, trainer_params, load)
-        self.stats_name_to_update_name = {
-            "Losses/Value Loss": "value_loss",
-            "Losses/Policy Loss": "policy_loss",
-        }
+    self.stats_name_to_update_name = {
+        "Losses/Value Loss": "value_loss",
+        "Losses/Policy Loss": "policy_loss",
+    }
-        self.optimizer: Optional[tf.train.AdamOptimizer] = None
-        self.grads = None
-        self.update_batch: Optional[tf.Operation] = None
-        num_layers = trainer_params["num_layers"]
-        h_size = trainer_params["hidden_units"]
-        if num_layers < 1:
-            num_layers = 1
-        vis_encode_type = EncoderType(
-            trainer_params.get("vis_encode_type", "simple")
-        )
+    self.optimizer: Optional[tf.train.AdamOptimizer] = None
+    self.grads = None
+    self.update_batch: Optional[tf.Operation] = None
+    num_layers = trainer_params["num_layers"]
+    self.h_size = trainer_params["hidden_units"]
+    if num_layers < 1:
+        num_layers = 1
+    self.num_layers = num_layers
+    self.vis_encode_type = EncoderType(
+        trainer_params.get("vis_encode_type", "simple")
+    )
+    self.tanh_squash = tanh_squash
+    self.resample = resample
+    if create_tf_graph:
+        self.create_tf_graph()

+def create_tf_graph(self):
+    """
+    Builds the tensorflow graph needed for this policy.
+    """
+    with tf.variable_scope("policy/"):
         self.create_input_placeholders()
-            h_size, num_layers, vis_encode_type, tanh_squash, resample
+            self.h_size,
+            self.num_layers,
+            self.vis_encode_type,
+            self.tanh_squash,
+            self.resample,
-        self.create_dc_actor(h_size, num_layers, vis_encode_type)
+        self.create_dc_actor(
+            self.h_size, self.num_layers, self.vis_encode_type
+        )
         self.inference_dict: Dict[str, tf.Tensor] = {
             "action": self.output,

ml-agents/mlagents/trainers/optimizer.py (3 changed lines)


         )
         self.update_dict.update(self.reward_signals[reward_signal].update_dict)

+    def create_tf_optimizer(self, learning_rate, name="Adam"):
+        return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)

     def _execute_model(self, feed_dict, out_dict):
         """
         Executes model.
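The new create_tf_optimizer helper gives the base optimizer a single place where TF optimizers are instantiated, so the PPO and SAC subclasses no longer call tf.train.AdamOptimizer directly. A hedged sketch of how such a helper is used (class and name values here are illustrative, not the ml-agents API):

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()


    class TFOptimizerSketch:
        def create_tf_optimizer(self, learning_rate, name="Adam"):
            # Single construction point: subclasses ask for an optimizer by
            # learning rate and name instead of building Adam themselves.
            return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)


    opt = TFOptimizerSketch().create_tf_optimizer(3.0e-4, name="ppo_opt")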

ml-agents/mlagents/trainers/ppo/optimizer.py (5 changed lines)


         :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
         :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
         """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
         with policy.graph.as_default():
             with tf.variable_scope("optimizer/"):
                 super().__init__(policy, trainer_params)

         )

     def create_ppo_optimizer(self):
-        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+        self.optimizer = self.create_tf_optimizer(self.learning_rate)
         self.grads = self.optimizer.compute_gradients(self.loss)
         self.update_batch = self.optimizer.minimize(self.loss)
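create_ppo_optimizer now routes optimizer construction through the shared helper and keeps both compute_gradients (handy for inspecting or logging gradients) and minimize (the op actually run during the policy update). A standalone sketch of those two calls, with a toy loss standing in for self.loss and a fixed value standing in for self.learning_rate:

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()

    # Toy stand-ins for the real self.loss / self.learning_rate.
    w = tf.Variable(0.5, name="w")
    advantage = tf.placeholder(tf.float32, [None], name="advantage")
    loss = tf.reduce_mean(tf.square(w * advantage))
    learning_rate = 3.0e-4

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)  # built via create_tf_optimizer in the PR
    grads = optimizer.compute_gradients(loss)   # list of (gradient, variable) pairs
    update_batch = optimizer.minimize(loss)     # op executed during the policy update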

ml-agents/mlagents/trainers/ppo/trainer.py (1 changed line)


             self.trainer_parameters,
             self.is_training,
             self.load,
+            create_tf_graph=False,  # We will create the TF graph in the Optimizer
         )
         return policy
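With create_tf_graph=False, the trainer hands the optimizer a policy whose graph does not exist yet, and the optimizer builds it before adding its own ops. A minimal, self-contained sketch of that ordering contract (stub classes, not the ml-agents API):

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()


    class StubPolicy:
        def __init__(self, create_tf_graph=True):
            self.graph = tf.Graph()
            self.graph_built = False
            if create_tf_graph:
                self.create_tf_graph()

        def create_tf_graph(self):
            with self.graph.as_default(), tf.variable_scope("policy/"):
                self.value = tf.Variable(0.0, name="value")
            self.graph_built = True


    class StubOptimizer:
        def __init__(self, policy):
            policy.create_tf_graph()  # the optimizer builds the policy graph first
            with policy.graph.as_default(), tf.variable_scope("optimizer/"):
                self.update = tf.assign_add(policy.value, 1.0)


    # Trainer-side order: create the policy without a graph, then hand it over.
    policy = StubPolicy(create_tf_graph=False)
    optimizer = StubOptimizer(policy)
    assert policy.graph_built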

ml-agents/mlagents/trainers/sac/optimizer.py (9 changed lines)


         :param tau: Strength of soft-Q update.
         :param m_size: Size of brain memory.
         """
+        # Create the graph here to give more granular control of the TF graph to the Optimizer.
+        policy.create_tf_graph()
         with policy.graph.as_default():
             with tf.variable_scope(""):
                 super().__init__(policy, trainer_params)

         Creates the Adam optimizers and update ops for SAC, including
         the policy, value, and entropy updates, as well as the target network update.
         """
-        policy_optimizer = tf.train.AdamOptimizer(
+        policy_optimizer = self.create_tf_optimizer(
-        entropy_optimizer = tf.train.AdamOptimizer(
+        entropy_optimizer = self.create_tf_optimizer(
-        value_optimizer = tf.train.AdamOptimizer(
+        value_optimizer = self.create_tf_optimizer(
             learning_rate=self.learning_rate, name="sac_value_opt"
         )
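SAC's policy, entropy, and value updates each get their own named Adam instance, now created through the shared helper so the three optimizers (and their slot variables) stay distinct in one graph. A sketch of that trio; only sac_value_opt is named in the excerpt above, so the other two names are assumptions:

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()


    def create_tf_optimizer(learning_rate, name="Adam"):
        # Assumed equivalent of the helper added to the base optimizer class.
        return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)


    learning_rate = 3.0e-4  # placeholder; the real value comes from trainer_params

    # Distinct names keep the three optimizers' variables apart in the graph.
    policy_optimizer = create_tf_optimizer(learning_rate, name="sac_policy_opt")
    entropy_optimizer = create_tf_optimizer(learning_rate, name="sac_entropy_opt")
    value_optimizer = create_tf_optimizer(learning_rate, name="sac_value_opt")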

ml-agents/mlagents/trainers/sac/trainer.py (1 changed line)


             self.load,
             tanh_squash=True,
             resample=True,
+            create_tf_graph=False,
         )
         for _reward_signal in policy.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

ml-agents/mlagents/trainers/tf_policy.py (44 changed lines)


 import logging
 from typing import Any, Dict, List, Optional
 import abc

 import numpy as np

 from mlagents.tf_utils import tf
 from mlagents import tf_utils

         )
+        self._initialize_tensorflow_references()
         self.load = load

+    @abc.abstractmethod
+    def create_tf_graph(self):
+        pass

     def _initialize_graph(self):
         with self.graph.as_default():

         return self.vec_obs_size > 0

+    def _initialize_tensorflow_references(self):
+        self.value_heads: Dict[str, tf.Tensor] = {}
+        self.normalization_steps: Optional[tf.Variable] = None
+        self.running_mean: Optional[tf.Variable] = None
+        self.running_variance: Optional[tf.Variable] = None
+        self.update_normalization_op: Optional[tf.Operation] = None
+        self.value: Optional[tf.Tensor] = None
+        self.all_log_probs: tf.Tensor = None
+        self.log_probs: Optional[tf.Tensor] = None
+        self.entropy: Optional[tf.Tensor] = None
+        self.action_oh: tf.Tensor = None
+        self.output_pre: Optional[tf.Tensor] = None
+        self.output: Optional[tf.Tensor] = None
+        self.selected_actions: Optional[tf.Tensor] = None
+        self.action_holder: Optional[tf.Tensor] = None
+        self.action_masks: Optional[tf.Tensor] = None
+        self.prev_action: Optional[tf.Tensor] = None
+        self.memory_in: Optional[tf.Tensor] = None
+        self.memory_out: Optional[tf.Tensor] = None

     def create_input_placeholders(self):
-        self.value_heads: Dict[str, tf.Tensor] = {}
-        self.normalization_steps: Optional[tf.Variable] = None
-        self.running_mean: Optional[tf.Variable] = None
-        self.running_variance: Optional[tf.Variable] = None
-        self.update_normalization_op: Optional[tf.Operation] = None
-        self.value: Optional[tf.Tensor] = None
-        self.all_log_probs: tf.Tensor = None
-        self.log_probs: Optional[tf.Tensor] = None
-        self.entropy: Optional[tf.Tensor] = None
-        self.action_oh: tf.Tensor = None
-        self.output_pre: Optional[tf.Tensor] = None
-        self.output: Optional[tf.Tensor] = None
-        self.selected_actions: Optional[tf.Tensor] = None
-        self.action_holder: Optional[tf.Tensor] = None
-        self.action_masks: Optional[tf.Tensor] = None
-        self.prev_action: Optional[tf.Tensor] = None
-        self.memory_in: Optional[tf.Tensor] = None
-        self.memory_out: Optional[tf.Tensor] = None
         self.global_step, self.increment_step_op, self.steps_to_increment = (
             LearningModel.create_global_steps()
         )
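The base-class change in tf_policy.py follows a common pattern: declare every tensor reference as None (or an empty dict) in a dedicated _initialize_tensorflow_references method called from __init__, and leave actual graph construction to an abstract create_tf_graph that subclasses such as NNPolicy implement. A condensed sketch of that pattern, showing only a few of the attributes:

    import abc
    from typing import Dict, Optional

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()


    class TFPolicySketch(abc.ABC):
        """Base-class sketch: tensor references exist (as None) before any graph is built."""

        def __init__(self):
            self.graph = tf.Graph()
            self._initialize_tensorflow_references()

        def _initialize_tensorflow_references(self):
            # Pre-declaring attributes documents the policy's tensor interface
            # and keeps attribute access safe before create_tf_graph() runs.
            self.value_heads: Dict[str, tf.Tensor] = {}
            self.output: Optional[tf.Tensor] = None
            self.memory_in: Optional[tf.Tensor] = None
            self.memory_out: Optional[tf.Tensor] = None

        @abc.abstractmethod
        def create_tf_graph(self):
            """Subclasses build their network here, after __init__ has run."""
            ...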
