TARGET_SCOPE = "target_network"


class SACTransferNetwork:
    """
    Base class for an SAC network with support for transfer. Implements methods
    for creating the actor and critic heads.
    """

    def __init__(

        return q1_heads, q2_heads, q1, q2
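    # The Q-head construction above (q1_heads, q2_heads, q1, q2) follows SAC's
    # twin-critic design: two independent Q estimators are kept per reward
    # stream so the optimizer can take the minimum of the two and damp
    # overestimation bias. A minimal sketch of one possible layout (illustrative
    # names, not necessarily the exact upstream implementation):
    #
    #   q1_heads, q2_heads = {}, {}
    #   for name in stream_names:
    #       q1_heads[name] = tf.layers.dense(q1_hidden, num_outputs, name=name + "_q1")
    #       q2_heads[name] = tf.layers.dense(q2_hidden, num_outputs, name=name + "_q2")
    #   q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
    #   q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)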


class SACTransferTargetNetwork(SACTransferNetwork):
    """
    Instantiation for the SAC target network. Only contains a single
    value estimator and is updated from the Policy Network.
    """

            vis_encode_type,
        )
        with tf.variable_scope(TARGET_SCOPE):
            self.visual_in = ModelUtils.create_visual_input_placeholders(
                policy.brain.camera_resolutions
            )
            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
            if self.policy.normalize:
                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
                self.update_normalization_op = normalization_tensors.update_op
                self.normalization_steps = normalization_tensors.steps
                self.running_mean = normalization_tensors.running_mean
                self.running_variance = normalization_tensors.running_variance
                self.processed_vector_in = ModelUtils.normalize_vector_obs(
                    self.vector_in,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_in = self.vector_in
                self.update_normalization_op = None
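            # Sketch of what ModelUtils.normalize_vector_obs produces (based on the
            # upstream ML-Agents implementation; treat the exact constants as an
            # assumption), roughly:
            #   normalized = clip((vector_in - running_mean)
            #                     / sqrt(running_variance / (steps + 1)), -5, 5)
            # so the target network sees observations on the same scale as the
            # policy network once copy_normalization() has synced the statistics.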
            if self.policy.use_recurrent:
                self.memory_in = tf.placeholder(
            hidden_streams = ModelUtils.create_observation_streams(
                self.visual_in,
                self.processed_vector_in,
                1,
                self.h_size,
                0,
                vis_encode_type=vis_encode_type,
                stream_scopes=["critic/value/"],
            )

        if self.policy.use_continuous_act:
            self._create_cc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
        else:
            self._create_dc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
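        # The target critic built above is not trained directly; it is refreshed
        # from the policy network's critic, typically by Polyak averaging inside
        # the SAC optimizer. A hedged sketch (variable names are illustrative):
        #
        #   target_update_op = [
        #       tf.assign(t, (1.0 - tau) * t + tau * s)
        #       for t, s in zip(target_vars, source_vars)
        #   ]
        #   sess.run(target_update_op)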

    def copy_normalization(self, mean, variance, steps):
        """
        Copies the mean, variance, and steps into the normalizers of the
        input of this network. Used to copy the normalizer from the policy
        network to the target network.
        :param mean: Tensor containing the mean.
        :param variance: Tensor containing the variance.
        :param steps: Tensor containing the number of steps.
        :return: A TF op that performs the copy.
        """
        update_mean = tf.assign(self.running_mean, mean)
        update_variance = tf.assign(self.running_variance, variance)
        update_norm_step = tf.assign(self.normalization_steps, steps)
        return tf.group([update_mean, update_variance, update_norm_step])
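    # Usage sketch for copy_normalization (names such as `policy_network`,
    # `target_network`, and `sess` are placeholders for whatever the trainer uses):
    #
    #   copy_op = target_network.copy_normalization(
    #       policy_network.running_mean,
    #       policy_network.running_variance,
    #       policy_network.normalization_steps,
    #   )
    #   sess.run(copy_op)
    #
    # The running_mean / running_variance attributes only exist when the policy
    # was built with normalize=True; otherwise this op cannot be constructed.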


class SACTransferPolicyNetwork(SACTransferNetwork):
    """
    Instantiation for the SAC policy network. Contains a dual Q estimator,
    a value estimator, and a reference to the actual policy network.
    """

        self.sequence_length_ph = self.policy.sequence_length_ph

        if self.policy.use_continuous_act:
            self._create_cc_critic(self.policy.encoder, POLICY_SCOPE)
        else:
            self._create_dc_critic(self.policy.encoder, POLICY_SCOPE)
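        # Both Q heads are created here because SAC bootstraps the value target
        # from the minimum of the two estimates, which damps positive bias.
        # A hedged sketch of how the optimizer might combine them
        # (q1_streams / q2_streams / entropy_coefficient are illustrative names):
        #
        #   min_q = tf.minimum(q1_streams[name], q2_streams[name])
        #   value_target = min_q - entropy_coefficient * log_probs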

        if self.use_recurrent:
            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
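            # When recurrence is enabled, the value and Q memories gathered in
            # mem_outs are usually concatenated into a single memory_out tensor
            # that the trainer carries between sequences, e.g. (sketch):
            #
            #   self.memory_out = tf.concat(mem_outs, axis=1)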
|