Ervin Teng
5 年前
当前提交
0ef40c08
共有 7 个文件被更改,包括 1165 次插入 和 39 次删除
-
32ml-agents/mlagents/trainers/common/nn_policy.py
-
11ml-agents/mlagents/trainers/optimizer.py
-
11ml-agents/mlagents/trainers/ppo/optimizer.py
-
31ml-agents/mlagents/trainers/sac/trainer.py
-
10ml-agents/mlagents/trainers/tf_policy.py
-
472ml-agents/mlagents/trainers/sac/network.py
-
637ml-agents/mlagents/trainers/sac/optimizer.py
|
|||
import logging |
|||
from typing import Dict, Optional |
|||
|
|||
from mlagents.tf_utils import tf |
|||
|
|||
from mlagents.trainers.models import LearningModel, EncoderType |
|||
|
|||
LOG_STD_MAX = 2 |
|||
LOG_STD_MIN = -20 |
|||
EPSILON = 1e-6 # Small value to avoid divide by zero |
|||
DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05 |
|||
CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam. |
|||
|
|||
LOGGER = logging.getLogger("mlagents.trainers") |
|||
|
|||
POLICY_SCOPE = "" |
|||
TARGET_SCOPE = "target_network" |
|||
|
|||
|
|||
class SACNetwork: |
|||
""" |
|||
Base class for an SAC network. Implements methods for creating the actor and critic heads. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
policy=None, |
|||
m_size=None, |
|||
h_size=128, |
|||
normalize=False, |
|||
use_recurrent=False, |
|||
num_layers=2, |
|||
stream_names=None, |
|||
vis_encode_type=EncoderType.SIMPLE, |
|||
): |
|||
self.normalize = normalize |
|||
self.use_recurrent = use_recurrent |
|||
self.num_layers = num_layers |
|||
self.stream_names = stream_names |
|||
self.h_size = h_size |
|||
self.activ_fn = LearningModel.swish |
|||
|
|||
self.sequence_length_ph = tf.placeholder( |
|||
shape=None, dtype=tf.int32, name="sequence_length" |
|||
) |
|||
|
|||
self.policy_memory_in: Optional[tf.Tensor] = None |
|||
self.policy_memory_out: Optional[tf.Tensor] = None |
|||
self.value_memory_in: Optional[tf.Tensor] = None |
|||
self.value_memory_out: Optional[tf.Tensor] = None |
|||
self.q1: Optional[tf.Tensor] = None |
|||
self.q2: Optional[tf.Tensor] = None |
|||
self.q1_p: Optional[tf.Tensor] = None |
|||
self.q2_p: Optional[tf.Tensor] = None |
|||
self.q1_memory_in: Optional[tf.Tensor] = None |
|||
self.q2_memory_in: Optional[tf.Tensor] = None |
|||
self.q1_memory_out: Optional[tf.Tensor] = None |
|||
self.q2_memory_out: Optional[tf.Tensor] = None |
|||
self.prev_action: Optional[tf.Tensor] = None |
|||
self.action_masks: Optional[tf.Tensor] = None |
|||
self.external_action_in: Optional[tf.Tensor] = None |
|||
self.log_sigma_sq: Optional[tf.Tensor] = None |
|||
self.entropy: Optional[tf.Tensor] = None |
|||
self.deterministic_output: Optional[tf.Tensor] = None |
|||
self.normalized_logprobs: Optional[tf.Tensor] = None |
|||
self.action_probs: Optional[tf.Tensor] = None |
|||
self.output_oh: Optional[tf.Tensor] = None |
|||
self.output_pre: Optional[tf.Tensor] = None |
|||
|
|||
self.value_vars = None |
|||
self.q_vars = None |
|||
self.critic_vars = None |
|||
self.policy_vars = None |
|||
|
|||
self.q1_heads: Optional[Dict[str, tf.Tensor]] = None |
|||
self.q2_heads: Optional[Dict[str, tf.Tensor]] = None |
|||
self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None |
|||
self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None |
|||
|
|||
self.policy = policy |
|||
|
|||
def get_vars(self, scope): |
|||
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) |
|||
|
|||
def join_scopes(self, scope_1, scope_2): |
|||
""" |
|||
Joins two scopes. Does so safetly (i.e., if one of the two scopes doesn't |
|||
exist, don't add any backslashes) |
|||
""" |
|||
if not scope_1: |
|||
return scope_2 |
|||
if not scope_2: |
|||
return scope_1 |
|||
else: |
|||
return "/".join(filter(None, [scope_1, scope_2])) |
|||
|
|||
def create_value_heads(self, stream_names, hidden_input): |
|||
""" |
|||
Creates one value estimator head for each reward signal in stream_names. |
|||
Also creates the node corresponding to the mean of all the value heads in self.value. |
|||
self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. |
|||
:param stream_names: The list of reward signal names |
|||
:param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top |
|||
of the hidden input. |
|||
""" |
|||
self.value_heads = {} |
|||
for name in stream_names: |
|||
value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name)) |
|||
self.value_heads[name] = value |
|||
self.value = tf.reduce_mean(list(self.value_heads.values()), 0) |
|||
|
|||
def create_cc_critic(self, hidden_value, scope, create_qs=True): |
|||
""" |
|||
Creates just the critic network |
|||
""" |
|||
scope = self.join_scopes(scope, "critic") |
|||
self.create_sac_value_head( |
|||
self.stream_names, |
|||
hidden_value, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "value"), |
|||
) |
|||
|
|||
self.value_vars = self.get_vars(self.join_scopes(scope, "value")) |
|||
if create_qs: |
|||
hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1) |
|||
hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1) |
|||
self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads( |
|||
self.stream_names, |
|||
hidden_q, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "q"), |
|||
) |
|||
self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads( |
|||
self.stream_names, |
|||
hidden_qp, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "q"), |
|||
reuse=True, |
|||
) |
|||
self.q_vars = self.get_vars(self.join_scopes(scope, "q")) |
|||
self.critic_vars = self.get_vars(scope) |
|||
|
|||
def create_dc_critic(self, hidden_value, scope, create_qs=True): |
|||
""" |
|||
Creates just the critic network |
|||
""" |
|||
scope = self.join_scopes(scope, "critic") |
|||
self.create_sac_value_head( |
|||
self.stream_names, |
|||
hidden_value, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "value"), |
|||
) |
|||
|
|||
self.value_vars = self.get_vars("/".join([scope, "value"])) |
|||
|
|||
if create_qs: |
|||
self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads( |
|||
self.stream_names, |
|||
hidden_value, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "q"), |
|||
num_outputs=sum(self.policy.act_size), |
|||
) |
|||
self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads( |
|||
self.stream_names, |
|||
hidden_value, |
|||
self.num_layers, |
|||
self.h_size, |
|||
self.join_scopes(scope, "q"), |
|||
reuse=True, |
|||
num_outputs=sum(self.policy.act_size), |
|||
) |
|||
self.q_vars = self.get_vars(scope) |
|||
self.critic_vars = self.get_vars(scope) |
|||
|
|||
def create_sac_value_head( |
|||
self, stream_names, hidden_input, num_layers, h_size, scope |
|||
): |
|||
""" |
|||
Creates one value estimator head for each reward signal in stream_names. |
|||
Also creates the node corresponding to the mean of all the value heads in self.value. |
|||
self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. |
|||
:param stream_names: The list of reward signal names |
|||
:param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top |
|||
of the hidden input. |
|||
:param num_layers: Number of hidden layers for value network |
|||
:param h_size: size of hidden layers for value network |
|||
:param scope: TF scope for value network. |
|||
""" |
|||
with tf.variable_scope(scope): |
|||
value_hidden = LearningModel.create_vector_observation_encoder( |
|||
hidden_input, h_size, self.activ_fn, num_layers, "encoder", False |
|||
) |
|||
if self.use_recurrent: |
|||
value_hidden, memory_out = LearningModel.create_recurrent_encoder( |
|||
value_hidden, |
|||
self.value_memory_in, |
|||
self.sequence_length_ph, |
|||
name="lstm_value", |
|||
) |
|||
self.value_memory_out = memory_out |
|||
self.create_value_heads(stream_names, value_hidden) |
|||
|
|||
def create_q_heads( |
|||
self, |
|||
stream_names, |
|||
hidden_input, |
|||
num_layers, |
|||
h_size, |
|||
scope, |
|||
reuse=False, |
|||
num_outputs=1, |
|||
): |
|||
""" |
|||
Creates two q heads for each reward signal in stream_names. |
|||
Also creates the node corresponding to the mean of all the value heads in self.value. |
|||
self.value_head is a dictionary of stream name to node containing the value estimator head for that signal. |
|||
:param stream_names: The list of reward signal names |
|||
:param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top |
|||
of the hidden input. |
|||
:param num_layers: Number of hidden layers for Q network |
|||
:param h_size: size of hidden layers for Q network |
|||
:param scope: TF scope for Q network. |
|||
:param reuse: Whether or not to reuse variables. Useful for creating Q of policy. |
|||
:param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions. |
|||
""" |
|||
with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse): |
|||
q1_hidden = LearningModel.create_vector_observation_encoder( |
|||
hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse |
|||
) |
|||
if self.use_recurrent: |
|||
q1_hidden, memory_out = LearningModel.create_recurrent_encoder( |
|||
q1_hidden, |
|||
self.q1_memory_in, |
|||
self.sequence_length_ph, |
|||
name="lstm_q1", |
|||
) |
|||
self.q1_memory_out = memory_out |
|||
|
|||
q1_heads = {} |
|||
for name in stream_names: |
|||
_q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name)) |
|||
q1_heads[name] = _q1 |
|||
|
|||
q1 = tf.reduce_mean(list(q1_heads.values()), axis=0) |
|||
with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse): |
|||
q2_hidden = LearningModel.create_vector_observation_encoder( |
|||
hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse |
|||
) |
|||
if self.use_recurrent: |
|||
q2_hidden, memory_out = LearningModel.create_recurrent_encoder( |
|||
q2_hidden, |
|||
self.q2_memory_in, |
|||
self.sequence_length_ph, |
|||
name="lstm_q2", |
|||
) |
|||
self.q2_memory_out = memory_out |
|||
|
|||
q2_heads = {} |
|||
for name in stream_names: |
|||
_q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name)) |
|||
q2_heads[name] = _q2 |
|||
|
|||
q2 = tf.reduce_mean(list(q2_heads.values()), axis=0) |
|||
|
|||
return q1_heads, q2_heads, q1, q2 |
|||
|
|||
|
|||
class SACTargetNetwork(SACNetwork): |
|||
""" |
|||
Instantiation for the SAC target network. Only contains a single |
|||
value estimator and is updated from the Policy Network. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
policy, |
|||
m_size=None, |
|||
h_size=128, |
|||
normalize=False, |
|||
use_recurrent=False, |
|||
num_layers=2, |
|||
stream_names=None, |
|||
vis_encode_type=EncoderType.SIMPLE, |
|||
): |
|||
super().__init__( |
|||
policy, |
|||
m_size, |
|||
h_size, |
|||
normalize, |
|||
use_recurrent, |
|||
num_layers, |
|||
stream_names, |
|||
vis_encode_type, |
|||
) |
|||
with tf.variable_scope(TARGET_SCOPE): |
|||
self.visual_in = LearningModel.create_visual_input_placeholders( |
|||
policy.brain.camera_resolutions |
|||
) |
|||
self.vector_in = LearningModel.create_vector_input(policy.vec_obs_size) |
|||
if self.policy.normalize: |
|||
normalization_tensors = LearningModel.create_normalizer(self.vector_in) |
|||
self.update_normalization_op = normalization_tensors[0] |
|||
self.normalization_steps = normalization_tensors[1] |
|||
self.running_mean = normalization_tensors[2] |
|||
self.running_variance = normalization_tensors[3] |
|||
self.processed_vector_in = LearningModel.normalize_vector_obs( |
|||
self.vector_in, |
|||
self.running_mean, |
|||
self.running_variance, |
|||
self.normalization_steps, |
|||
) |
|||
else: |
|||
self.processed_vector_in = self.vector_in |
|||
self.update_normalization_op = None |
|||
|
|||
if self.policy.use_recurrent: |
|||
self.memory_in = tf.placeholder( |
|||
shape=[None, self.policy.m_size], |
|||
dtype=tf.float32, |
|||
name="recurrent_in", |
|||
) |
|||
self.value_memory_in = self.memory_in |
|||
hidden_streams = LearningModel.create_observation_streams( |
|||
self.visual_in, |
|||
self.processed_vector_in, |
|||
1, |
|||
self.h_size, |
|||
0, |
|||
vis_encode_type=vis_encode_type, |
|||
stream_scopes=["critic/value/"], |
|||
) |
|||
if self.policy.use_continuous_act: |
|||
self.create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False) |
|||
else: |
|||
self.create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False) |
|||
if self.use_recurrent: |
|||
self.memory_out = tf.concat( |
|||
self.value_memory_out, axis=1 |
|||
) # Needed for Barracuda to work |
|||
|
|||
def copy_normalization(self, mean, variance, steps): |
|||
""" |
|||
Copies the mean, variance, and steps into the normalizers of the |
|||
input of this SACNetwork. Used to copy the normalizer from the policy network |
|||
to the target network. |
|||
param mean: Tensor containing the mean. |
|||
param variance: Tensor containing the variance |
|||
param steps: Tensor containing the number of steps. |
|||
""" |
|||
update_mean = tf.assign(self.running_mean, mean) |
|||
update_variance = tf.assign(self.running_variance, variance) |
|||
update_norm_step = tf.assign(self.normalization_steps, steps) |
|||
return tf.group([update_mean, update_variance, update_norm_step]) |
|||
|
|||
|
|||
class SACPolicyNetwork(SACNetwork): |
|||
""" |
|||
Instantiation for SAC policy network. Contains a dual Q estimator, |
|||
a value estimator, and the actual policy network. |
|||
""" |
|||
|
|||
def __init__( |
|||
self, |
|||
policy, |
|||
m_size=None, |
|||
h_size=128, |
|||
normalize=False, |
|||
use_recurrent=False, |
|||
num_layers=2, |
|||
stream_names=None, |
|||
vis_encode_type=EncoderType.SIMPLE, |
|||
): |
|||
super().__init__( |
|||
policy, |
|||
m_size, |
|||
h_size, |
|||
normalize, |
|||
use_recurrent, |
|||
num_layers, |
|||
stream_names, |
|||
vis_encode_type, |
|||
) |
|||
if self.policy.use_recurrent: |
|||
self.create_memory_ins(self.policy.m_size) |
|||
|
|||
hidden_critic = self.create_observation_in(vis_encode_type) |
|||
self.policy.output = self.policy.output |
|||
|
|||
if self.policy.use_continuous_act: |
|||
self.create_cc_critic(hidden_critic, POLICY_SCOPE) |
|||
|
|||
else: |
|||
self.create_dc_critic(hidden_critic, POLICY_SCOPE) |
|||
|
|||
if self.use_recurrent: |
|||
mem_outs = [ |
|||
self.value_memory_out, |
|||
self.q1_memory_out, |
|||
self.q2_memory_out, |
|||
self.policy_memory_out, |
|||
] |
|||
self.memory_out = tf.concat(mem_outs, axis=1) |
|||
|
|||
def create_memory_ins(self, m_size): |
|||
""" |
|||
Creates the memory input placeholders for LSTM. |
|||
:param m_size: the total size of the memory. |
|||
""" |
|||
# Create the Policy input separate from the rest |
|||
# This is so in inference we only have to run the Policy network. |
|||
# Barracuda will grab the recurrent_in and recurrent_out named tensors. |
|||
self.inference_memory_in = tf.placeholder( |
|||
shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in" |
|||
) |
|||
# We assume m_size is divisible by 4 |
|||
# Create the non-Policy inputs |
|||
# Use a default placeholder here so nothing has to be provided during |
|||
# Barracuda inference. Note that the default value is just the tiled input |
|||
# for the policy, which is thrown away. |
|||
three_fourths_m_size = m_size * 3 // 4 |
|||
self.other_memory_in = tf.placeholder_with_default( |
|||
input=tf.tile(self.inference_memory_in, [1, 3]), |
|||
shape=[None, three_fourths_m_size], |
|||
name="other_recurrent_in", |
|||
) |
|||
|
|||
# Concat and use this as the "placeholder" |
|||
# for training |
|||
self.memory_in = tf.concat( |
|||
[self.other_memory_in, self.inference_memory_in], axis=1 |
|||
) |
|||
|
|||
# Re-break-up for each network |
|||
num_mems = 4 |
|||
mem_ins = [] |
|||
for i in range(num_mems): |
|||
_start = m_size // num_mems * i |
|||
_end = m_size // num_mems * (i + 1) |
|||
mem_ins.append(self.memory_in[:, _start:_end]) |
|||
self.value_memory_in = mem_ins[0] |
|||
self.q1_memory_in = mem_ins[1] |
|||
self.q2_memory_in = mem_ins[2] |
|||
self.policy_memory_in = mem_ins[3] |
|||
|
|||
def create_observation_in(self, vis_encode_type): |
|||
""" |
|||
Creates the observation inputs, and a CNN if needed, |
|||
:param vis_encode_type: Type of CNN encoder. |
|||
:param share_ac_cnn: Whether or not to share the actor and critic CNNs. |
|||
:return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used |
|||
once and thrown away. |
|||
""" |
|||
with tf.variable_scope(POLICY_SCOPE): |
|||
hidden_streams = LearningModel.create_observation_streams( |
|||
self.policy.visual_in, |
|||
self.policy.processed_vector_in, |
|||
1, |
|||
self.h_size, |
|||
0, |
|||
vis_encode_type=vis_encode_type, |
|||
stream_scopes=["policy/", "critic/value/"], |
|||
) |
|||
hidden_critic = hidden_streams[0] |
|||
return hidden_critic |
|
|||
import logging |
|||
import numpy as np |
|||
from typing import Dict, List, Optional, Any, Mapping |
|||
|
|||
from mlagents.tf_utils import tf |
|||
|
|||
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork |
|||
from mlagents.trainers.models import LearningRateSchedule, EncoderType, LearningModel |
|||
from mlagents.trainers.optimizer import TFOptimizer |
|||
from mlagents.trainers.tf_policy import TFPolicy |
|||
from mlagents.trainers.buffer import AgentBuffer |
|||
from mlagents_envs.timers import timed |
|||
|
|||
LOG_STD_MAX = 2 |
|||
LOG_STD_MIN = -20 |
|||
EPSILON = 1e-6 # Small value to avoid divide by zero |
|||
DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05 |
|||
CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam. |
|||
|
|||
LOGGER = logging.getLogger("mlagents.trainers") |
|||
|
|||
POLICY_SCOPE = "" |
|||
TARGET_SCOPE = "target_network" |
|||
|
|||
|
|||
class SACOptimizer(TFOptimizer): |
|||
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]): |
|||
""" |
|||
Takes a Unity environment and model-specific hyper-parameters and returns the |
|||
appropriate PPO agent model for the environment. |
|||
:param brain: Brain parameters used to generate specific network graph. |
|||
:param lr: Learning rate. |
|||
:param lr_schedule: Learning rate decay schedule. |
|||
:param h_size: Size of hidden layers |
|||
:param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster, |
|||
set higher to explore more. |
|||
:return: a sub-class of PPOAgent tailored to the environment. |
|||
:param max_step: Total number of training steps. |
|||
:param normalize: Whether to normalize vector observation input. |
|||
:param use_recurrent: Whether to use an LSTM layer in the network. |
|||
:param num_layers: Number of hidden layers between encoded input and policy & value layers |
|||
:param tau: Strength of soft-Q update. |
|||
:param m_size: Size of brain memory. |
|||
""" |
|||
with policy.graph.as_default(): |
|||
with tf.variable_scope(""): |
|||
super().__init__(policy, trainer_params) |
|||
lr = float(trainer_params["learning_rate"]) |
|||
lr_schedule = LearningRateSchedule( |
|||
trainer_params.get("learning_rate_schedule", "constant") |
|||
) |
|||
self.policy = policy |
|||
self.act_size = self.policy.act_size |
|||
h_size = int(trainer_params["hidden_units"]) |
|||
max_step = int(trainer_params["max_steps"]) |
|||
num_layers = int(trainer_params["num_layers"]) |
|||
vis_encode_type = EncoderType( |
|||
trainer_params.get("vis_encode_type", "simple") |
|||
) |
|||
self.tau = trainer_params.get("tau", 0.005) |
|||
m_size = self.policy.m_size |
|||
self.init_entcoef = trainer_params.get("init_entcoef", 1.0) |
|||
stream_names = self.reward_signals.keys() |
|||
# Use to reduce "survivor bonus" when using Curiosity or GAIL. |
|||
self.gammas = [ |
|||
_val["gamma"] for _val in trainer_params["reward_signals"].values() |
|||
] |
|||
self.use_dones_in_backup = { |
|||
name: tf.Variable(1.0) for name in stream_names |
|||
} |
|||
self.disable_use_dones = { |
|||
name: self.use_dones_in_backup[name].assign(0.0) |
|||
for name in stream_names |
|||
} |
|||
|
|||
if num_layers < 1: |
|||
num_layers = 1 |
|||
|
|||
self.target_init_op: List[tf.Tensor] = [] |
|||
self.target_update_op: List[tf.Tensor] = [] |
|||
self.update_batch_policy: Optional[tf.Operation] = None |
|||
self.update_batch_value: Optional[tf.Operation] = None |
|||
self.update_batch_entropy: Optional[tf.Operation] = None |
|||
|
|||
self.policy_network = SACPolicyNetwork( |
|||
policy=self.policy, |
|||
m_size=m_size, |
|||
h_size=h_size, |
|||
normalize=self.policy.normalize, |
|||
use_recurrent=self.policy.use_recurrent, |
|||
num_layers=num_layers, |
|||
stream_names=stream_names, |
|||
vis_encode_type=vis_encode_type, |
|||
) |
|||
self.target_network = SACTargetNetwork( |
|||
policy=self.policy, |
|||
m_size=m_size // 4 if m_size else None, |
|||
h_size=h_size, |
|||
normalize=self.policy.normalize, |
|||
use_recurrent=self.policy.use_recurrent, |
|||
num_layers=num_layers, |
|||
stream_names=stream_names, |
|||
vis_encode_type=vis_encode_type, |
|||
) |
|||
self.create_inputs_and_outputs() |
|||
self.learning_rate = LearningModel.create_learning_rate( |
|||
lr_schedule, lr, self.policy.global_step, max_step |
|||
) |
|||
self.create_losses( |
|||
self.policy_network.q1_heads, |
|||
self.policy_network.q2_heads, |
|||
lr, |
|||
max_step, |
|||
stream_names, |
|||
discrete=not self.policy.use_continuous_act, |
|||
) |
|||
self.create_sac_optimizers() |
|||
|
|||
self.selected_actions = ( |
|||
self.policy.selected_actions |
|||
) # For GAIL and other reward signals |
|||
if self.policy.normalize: |
|||
target_update_norm = self.target_network.copy_normalization( |
|||
self.policy.running_mean, |
|||
self.policy.running_variance, |
|||
self.policy.normalization_steps, |
|||
) |
|||
# Update the normalization of the optimizer when the policy does. |
|||
self.policy.update_normalization_op = tf.group( |
|||
[self.policy.update_normalization_op, target_update_norm] |
|||
) |
|||
|
|||
self.stats_name_to_update_name = { |
|||
"Losses/Value Loss": "value_loss", |
|||
"Losses/Policy Loss": "policy_loss", |
|||
"Losses/Q1 Loss": "q1_loss", |
|||
"Losses/Q2 Loss": "q2_loss", |
|||
"Policy/Entropy Coeff": "entropy_coef", |
|||
} |
|||
|
|||
self.update_dict = { |
|||
"value_loss": self.total_value_loss, |
|||
"policy_loss": self.policy_loss, |
|||
"q1_loss": self.q1_loss, |
|||
"q2_loss": self.q2_loss, |
|||
"entropy_coef": self.ent_coef, |
|||
"entropy": self.policy.entropy, |
|||
"update_batch": self.update_batch_policy, |
|||
"update_value": self.update_batch_value, |
|||
"update_entropy": self.update_batch_entropy, |
|||
} |
|||
|
|||
# Add some stuff to inference dict from optimizer |
|||
self.policy.inference_dict["learning_rate"] = self.learning_rate |
|||
if self.policy.use_recurrent: |
|||
self.policy.inference_dict["optimizer_memory_out"] = self.memory_out |
|||
|
|||
def create_inputs_and_outputs(self): |
|||
""" |
|||
Assign the higher-level SACModel's inputs and outputs to those of its policy or |
|||
target network. |
|||
""" |
|||
self.vector_in = self.policy.vector_in |
|||
self.visual_in = self.policy.visual_in |
|||
self.next_vector_in = self.target_network.vector_in |
|||
self.next_visual_in = self.target_network.visual_in |
|||
self.action_holder = self.policy.action_holder |
|||
self.sequence_length_ph = self.policy.sequence_length_ph |
|||
self.next_sequence_length_ph = self.target_network.sequence_length_ph |
|||
if not self.policy.use_continuous_act: |
|||
self.action_masks = self.policy_network.action_masks |
|||
else: |
|||
self.output_pre = self.policy_network.output_pre |
|||
|
|||
# Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference. |
|||
self.value = tf.identity( |
|||
self.policy_network.value, name="value_estimate_unused" |
|||
) |
|||
self.value_heads = self.policy_network.value_heads |
|||
self.all_log_probs = self.policy.all_log_probs |
|||
self.dones_holder = tf.placeholder( |
|||
shape=[None], dtype=tf.float32, name="dones_holder" |
|||
) |
|||
# This is just a dummy to get BC to work. PPO has this but SAC doesn't. |
|||
# TODO: Proper input and output specs for models |
|||
self.epsilon = tf.placeholder( |
|||
shape=[None, self.policy.act_size[0]], dtype=tf.float32, name="epsilon" |
|||
) |
|||
|
|||
if self.policy.use_recurrent: |
|||
self.memory_in = self.policy_network.memory_in |
|||
self.memory_out = self.policy_network.memory_out |
|||
|
|||
# For Barracuda |
|||
self.inference_memory_out = tf.identity( |
|||
self.policy_network.policy_memory_out, name="recurrent_out" |
|||
) |
|||
|
|||
if not self.policy.use_continuous_act: |
|||
self.prev_action = self.policy_network.prev_action |
|||
self.next_memory_in = self.target_network.memory_in |
|||
|
|||
def create_losses( |
|||
self, q1_streams, q2_streams, lr, max_step, stream_names, discrete=False |
|||
): |
|||
""" |
|||
Creates training-specific Tensorflow ops for SAC models. |
|||
:param q1_streams: Q1 streams from policy network |
|||
:param q1_streams: Q2 streams from policy network |
|||
:param lr: Learning rate |
|||
:param max_step: Total number of training steps. |
|||
:param stream_names: List of reward stream names. |
|||
:param discrete: Whether or not to use discrete action losses. |
|||
""" |
|||
|
|||
if discrete: |
|||
self.target_entropy = [ |
|||
DISCRETE_TARGET_ENTROPY_SCALE * np.log(i).astype(np.float32) |
|||
for i in self.act_size |
|||
] |
|||
else: |
|||
self.target_entropy = ( |
|||
-1 |
|||
* CONTINUOUS_TARGET_ENTROPY_SCALE |
|||
* np.prod(self.act_size[0]).astype(np.float32) |
|||
) |
|||
|
|||
self.rewards_holders = {} |
|||
self.min_policy_qs = {} |
|||
|
|||
for name in stream_names: |
|||
if discrete: |
|||
_branched_mpq1 = self.apply_as_branches( |
|||
self.policy_network.q1_pheads[name] * self.policy.action_probs |
|||
) |
|||
branched_mpq1 = tf.stack( |
|||
[ |
|||
tf.reduce_sum(_br, axis=1, keep_dims=True) |
|||
for _br in _branched_mpq1 |
|||
] |
|||
) |
|||
_q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0) |
|||
|
|||
_branched_mpq2 = self.apply_as_branches( |
|||
self.policy_network.q2_pheads[name] * self.policy.action_probs |
|||
) |
|||
branched_mpq2 = tf.stack( |
|||
[ |
|||
tf.reduce_sum(_br, axis=1, keep_dims=True) |
|||
for _br in _branched_mpq2 |
|||
] |
|||
) |
|||
_q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0) |
|||
|
|||
self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean) |
|||
else: |
|||
self.min_policy_qs[name] = tf.minimum( |
|||
self.policy_network.q1_pheads[name], |
|||
self.policy_network.q2_pheads[name], |
|||
) |
|||
|
|||
rewards_holder = tf.placeholder( |
|||
shape=[None], dtype=tf.float32, name="{}_rewards".format(name) |
|||
) |
|||
self.rewards_holders[name] = rewards_holder |
|||
|
|||
q1_losses = [] |
|||
q2_losses = [] |
|||
# Multiple q losses per stream |
|||
expanded_dones = tf.expand_dims(self.dones_holder, axis=-1) |
|||
for i, name in enumerate(stream_names): |
|||
_expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1) |
|||
|
|||
q_backup = tf.stop_gradient( |
|||
_expanded_rewards |
|||
+ (1.0 - self.use_dones_in_backup[name] * expanded_dones) |
|||
* self.gammas[i] |
|||
* self.target_network.value_heads[name] |
|||
) |
|||
|
|||
if discrete: |
|||
# We need to break up the Q functions by branch, and update them individually. |
|||
branched_q1_stream = self.apply_as_branches( |
|||
self.policy.action_oh * q1_streams[name] |
|||
) |
|||
branched_q2_stream = self.apply_as_branches( |
|||
self.policy.action_oh * q2_streams[name] |
|||
) |
|||
|
|||
# Reduce each branch into scalar |
|||
branched_q1_stream = [ |
|||
tf.reduce_sum(_branch, axis=1, keep_dims=True) |
|||
for _branch in branched_q1_stream |
|||
] |
|||
branched_q2_stream = [ |
|||
tf.reduce_sum(_branch, axis=1, keep_dims=True) |
|||
for _branch in branched_q2_stream |
|||
] |
|||
|
|||
q1_stream = tf.reduce_mean(branched_q1_stream, axis=0) |
|||
q2_stream = tf.reduce_mean(branched_q2_stream, axis=0) |
|||
|
|||
else: |
|||
q1_stream = q1_streams[name] |
|||
q2_stream = q2_streams[name] |
|||
|
|||
_q1_loss = 0.5 * tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) |
|||
* tf.squared_difference(q_backup, q1_stream) |
|||
) |
|||
|
|||
_q2_loss = 0.5 * tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) |
|||
* tf.squared_difference(q_backup, q2_stream) |
|||
) |
|||
|
|||
q1_losses.append(_q1_loss) |
|||
q2_losses.append(_q2_loss) |
|||
|
|||
self.q1_loss = tf.reduce_mean(q1_losses) |
|||
self.q2_loss = tf.reduce_mean(q2_losses) |
|||
|
|||
# Learn entropy coefficient |
|||
if discrete: |
|||
# Create a log_ent_coef for each branch |
|||
self.log_ent_coef = tf.get_variable( |
|||
"log_ent_coef", |
|||
dtype=tf.float32, |
|||
initializer=np.log([self.init_entcoef] * len(self.act_size)).astype( |
|||
np.float32 |
|||
), |
|||
trainable=True, |
|||
) |
|||
else: |
|||
self.log_ent_coef = tf.get_variable( |
|||
"log_ent_coef", |
|||
dtype=tf.float32, |
|||
initializer=np.log(self.init_entcoef).astype(np.float32), |
|||
trainable=True, |
|||
) |
|||
|
|||
self.ent_coef = tf.exp(self.log_ent_coef) |
|||
if discrete: |
|||
# We also have to do a different entropy and target_entropy per branch. |
|||
branched_log_probs = self.apply_as_branches(self.policy.all_log_probs) |
|||
branched_ent_sums = tf.stack( |
|||
[ |
|||
tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te |
|||
for _lp, _te in zip(branched_log_probs, self.target_entropy) |
|||
], |
|||
axis=1, |
|||
) |
|||
self.entropy_loss = -tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) |
|||
* tf.reduce_mean( |
|||
self.log_ent_coef |
|||
* tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2), |
|||
axis=1, |
|||
) |
|||
) |
|||
|
|||
# Same with policy loss, we have to do the loss per branch and average them, |
|||
# so that larger branches don't get more weight. |
|||
# The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q |
|||
branched_q_term = self.apply_as_branches( |
|||
self.policy_network.action_probs * self.policy_network.q1_p |
|||
) |
|||
|
|||
branched_policy_loss = tf.stack( |
|||
[ |
|||
tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True) |
|||
for i, (_lp, _qt) in enumerate( |
|||
zip(branched_log_probs, branched_q_term) |
|||
) |
|||
] |
|||
) |
|||
self.policy_loss = tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss) |
|||
) |
|||
|
|||
# Do vbackup entropy bonus per branch as well. |
|||
branched_ent_bonus = tf.stack( |
|||
[ |
|||
tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True) |
|||
for i, _lp in enumerate(branched_log_probs) |
|||
] |
|||
) |
|||
value_losses = [] |
|||
for name in stream_names: |
|||
v_backup = tf.stop_gradient( |
|||
self.min_policy_qs[name] |
|||
- tf.reduce_mean(branched_ent_bonus, axis=0) |
|||
) |
|||
value_losses.append( |
|||
0.5 |
|||
* tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) |
|||
* tf.squared_difference( |
|||
self.policy_network.value_heads[name], v_backup |
|||
) |
|||
) |
|||
) |
|||
|
|||
else: |
|||
self.entropy_loss = -tf.reduce_mean( |
|||
self.log_ent_coef |
|||
* tf.to_float(self.policy.mask) |
|||
* tf.stop_gradient( |
|||
tf.reduce_sum( |
|||
self.policy.all_log_probs + self.target_entropy, |
|||
axis=1, |
|||
keep_dims=True, |
|||
) |
|||
) |
|||
) |
|||
batch_policy_loss = tf.reduce_mean( |
|||
self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p, |
|||
axis=1, |
|||
) |
|||
self.policy_loss = tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) * batch_policy_loss |
|||
) |
|||
|
|||
value_losses = [] |
|||
for name in stream_names: |
|||
v_backup = tf.stop_gradient( |
|||
self.min_policy_qs[name] |
|||
- tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1) |
|||
) |
|||
value_losses.append( |
|||
0.5 |
|||
* tf.reduce_mean( |
|||
tf.to_float(self.policy.mask) |
|||
* tf.squared_difference( |
|||
self.policy_network.value_heads[name], v_backup |
|||
) |
|||
) |
|||
) |
|||
self.value_loss = tf.reduce_mean(value_losses) |
|||
|
|||
self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss |
|||
|
|||
self.entropy = self.policy_network.entropy |
|||
|
|||
def apply_as_branches(self, concat_logits): |
|||
""" |
|||
Takes in a concatenated set of logits and breaks it up into a list of non-concatenated logits, one per |
|||
action branch |
|||
""" |
|||
action_idx = [0] + list(np.cumsum(self.act_size)) |
|||
branches_logits = [ |
|||
concat_logits[:, action_idx[i] : action_idx[i + 1]] |
|||
for i in range(len(self.act_size)) |
|||
] |
|||
return branches_logits |
|||
|
|||
def create_sac_optimizers(self): |
|||
""" |
|||
Creates the Adam optimizers and update ops for SAC, including |
|||
the policy, value, and entropy updates, as well as the target network update. |
|||
""" |
|||
policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) |
|||
entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) |
|||
value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) |
|||
|
|||
self.target_update_op = [ |
|||
tf.assign(target, (1 - self.tau) * target + self.tau * source) |
|||
for target, source in zip( |
|||
self.target_network.value_vars, self.policy_network.value_vars |
|||
) |
|||
] |
|||
LOGGER.debug("value_vars") |
|||
self.print_all_vars(self.policy_network.value_vars) |
|||
LOGGER.debug("targvalue_vars") |
|||
self.print_all_vars(self.target_network.value_vars) |
|||
LOGGER.debug("critic_vars") |
|||
self.print_all_vars(self.policy_network.critic_vars) |
|||
LOGGER.debug("q_vars") |
|||
self.print_all_vars(self.policy_network.q_vars) |
|||
LOGGER.debug("policy_vars") |
|||
policy_vars = tf.get_collection( |
|||
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy" |
|||
) |
|||
self.print_all_vars(policy_vars) |
|||
|
|||
self.target_init_op = [ |
|||
tf.assign(target, source) |
|||
for target, source in zip( |
|||
self.target_network.value_vars, self.policy_network.value_vars |
|||
) |
|||
] |
|||
|
|||
self.update_batch_policy = policy_optimizer.minimize( |
|||
self.policy_loss, var_list=policy_vars |
|||
) |
|||
|
|||
# Make sure policy is updated first, then value, then entropy. |
|||
with tf.control_dependencies([self.update_batch_policy]): |
|||
self.update_batch_value = value_optimizer.minimize( |
|||
self.total_value_loss, var_list=self.policy_network.critic_vars |
|||
) |
|||
# Add entropy coefficient optimization operation |
|||
with tf.control_dependencies([self.update_batch_value]): |
|||
self.update_batch_entropy = entropy_optimizer.minimize( |
|||
self.entropy_loss, var_list=self.log_ent_coef |
|||
) |
|||
|
|||
def print_all_vars(self, variables): |
|||
for _var in variables: |
|||
LOGGER.debug(_var) |
|||
|
|||
@timed |
|||
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: |
|||
""" |
|||
Updates model using buffer. |
|||
:param num_sequences: Number of trajectories in batch. |
|||
:param batch: Experience mini-batch. |
|||
:param update_target: Whether or not to update target value network |
|||
:param reward_signal_batches: Minibatches to use for updating the reward signals, |
|||
indexed by name. If none, don't update the reward signals. |
|||
:return: Output from update process. |
|||
""" |
|||
feed_dict = self.construct_feed_dict(self.policy, batch, num_sequences) |
|||
stats_needed = self.stats_name_to_update_name |
|||
update_stats: Dict[str, float] = {} |
|||
update_vals = self._execute_model(feed_dict, self.update_dict) |
|||
for stat_name, update_name in stats_needed.items(): |
|||
update_stats[stat_name] = update_vals[update_name] |
|||
# Update target network. By default, target update happens at every policy update. |
|||
self.sess.run(self.target_update_op) |
|||
return update_stats |
|||
|
|||
def update_reward_signals( |
|||
self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int |
|||
) -> Dict[str, float]: |
|||
""" |
|||
Only update the reward signals. |
|||
:param reward_signal_batches: Minibatches to use for updating the reward signals, |
|||
indexed by name. If none, don't update the reward signals. |
|||
""" |
|||
# Collect feed dicts for all reward signals. |
|||
feed_dict: Dict[tf.Tensor, Any] = {} |
|||
update_dict: Dict[str, tf.Tensor] = {} |
|||
update_stats: Dict[str, float] = {} |
|||
stats_needed: Dict[str, str] = {} |
|||
if reward_signal_minibatches: |
|||
self.add_reward_signal_dicts( |
|||
feed_dict, |
|||
update_dict, |
|||
stats_needed, |
|||
reward_signal_minibatches, |
|||
num_sequences, |
|||
) |
|||
update_vals = self._execute_model(feed_dict, update_dict) |
|||
for stat_name, update_name in stats_needed.items(): |
|||
update_stats[stat_name] = update_vals[update_name] |
|||
return update_stats |
|||
|
|||
def add_reward_signal_dicts( |
|||
self, |
|||
feed_dict: Dict[tf.Tensor, Any], |
|||
update_dict: Dict[str, tf.Tensor], |
|||
stats_needed: Dict[str, str], |
|||
reward_signal_minibatches: Mapping[str, Dict], |
|||
num_sequences: int, |
|||
) -> None: |
|||
""" |
|||
Adds the items needed for reward signal updates to the feed_dict and stats_needed dict. |
|||
:param feed_dict: Feed dict needed update |
|||
:param update_dit: Update dict that needs update |
|||
:param stats_needed: Stats needed to get from the update. |
|||
:param reward_signal_minibatches: Minibatches to use for updating the reward signals, |
|||
indexed by name. |
|||
""" |
|||
for name, r_batch in reward_signal_minibatches.items(): |
|||
feed_dict.update( |
|||
self.reward_signals[name].prepare_update( |
|||
self.policy, r_batch, num_sequences |
|||
) |
|||
) |
|||
update_dict.update(self.reward_signals[name].update_dict) |
|||
stats_needed.update(self.reward_signals[name].stats_name_to_update_name) |
|||
|
|||
def construct_feed_dict( |
|||
self, policy: TFPolicy, batch: Dict[str, Any], num_sequences: int |
|||
) -> Dict[tf.Tensor, Any]: |
|||
""" |
|||
Builds the feed dict for updating the SAC model. |
|||
:param model: The model to update. May be different when, e.g. using multi-GPU. |
|||
:param batch: Mini-batch to use to update. |
|||
:param num_sequences: Number of LSTM sequences in batch. |
|||
""" |
|||
feed_dict = { |
|||
policy.batch_size_ph: num_sequences, |
|||
policy.sequence_length_ph: self.policy.sequence_length, |
|||
self.next_sequence_length_ph: self.policy.sequence_length, |
|||
self.policy.mask_input: batch["masks"], |
|||
} |
|||
for name in self.reward_signals: |
|||
feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)] |
|||
|
|||
if self.policy.use_continuous_act: |
|||
feed_dict[policy.action_holder] = batch["actions"] |
|||
else: |
|||
feed_dict[policy.action_holder] = batch["actions"] |
|||
if self.policy.use_recurrent: |
|||
feed_dict[policy.prev_action] = batch["prev_action"] |
|||
feed_dict[policy.action_masks] = batch["action_mask"] |
|||
if self.policy.use_vec_obs: |
|||
feed_dict[policy.vector_in] = batch["vector_obs"] |
|||
feed_dict[self.next_vector_in] = batch["next_vector_in"] |
|||
if self.policy.vis_obs_size > 0: |
|||
for i, _ in enumerate(policy.visual_in): |
|||
_obs = batch["visual_obs%d" % i] |
|||
feed_dict[policy.visual_in[i]] = _obs |
|||
for i, _ in enumerate(self.next_visual_in): |
|||
_obs = batch["next_visual_obs%d" % i] |
|||
feed_dict[self.next_visual_in[i]] = _obs |
|||
if self.policy.use_recurrent: |
|||
mem_in = [ |
|||
batch["memory"][i] |
|||
for i in range(0, len(batch["memory"]), self.policy.sequence_length) |
|||
] |
|||
# LSTM shouldn't have sequence length <1, but stop it from going out of the index if true. |
|||
offset = 1 if self.policy.sequence_length > 1 else 0 |
|||
next_mem_in = [ |
|||
batch["memory"][i][ |
|||
: self.policy.m_size // 4 |
|||
] # only pass value part of memory to target network |
|||
for i in range( |
|||
offset, len(batch["memory"]), self.policy.sequence_length |
|||
) |
|||
] |
|||
feed_dict[policy.memory_in] = mem_in |
|||
feed_dict[self.next_memory_in] = next_mem_in |
|||
feed_dict[self.dones_holder] = batch["done"] |
|||
return feed_dict |
撰写
预览
正在加载...
取消
保存
Reference in new issue