
update target encoder soft copy

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
b991096b
6 files changed, 358 insertions and 778 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (41 changed lines)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (8 changed lines)
  3. ml-agents/mlagents/trainers/sac_transfer/network.py (98 changed lines)
  4. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (30 changed lines)
  5. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (941 changed lines)
  6. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (18 changed lines)

ml-agents/mlagents/trainers/policy/transfer_policy.py (41 changed lines)


        inverse_model=False,
        reuse_encoder=True,
        use_bisim=True,
+        tau=0.1,
    ) -> None:
        """
        Builds the tensorflow graph needed for this policy.

        self.predict_return = predict_return
        self.use_bisim = use_bisim
        self.transfer = transfer
+        self.tau = tau
        with self.graph.as_default():
            tf.set_random_seed(self.seed)

                action_layers
            )
-            # if not reuse_encoder:
-            #     self.targ_encoder = tf.stop_gradient(self.targ_encoder)
-            #     self._create_hard_copy()
+            if not reuse_encoder:
+                self.targ_encoder = tf.stop_gradient(self.targ_encoder)
+                self._create_hard_copy()
+                self._create_soft_copy()
-            # if self.inverse_model:
-            #     with tf.variable_scope("inverse"):
-            #         self.create_inverse_model(
-            #             self.encoder, self.targ_encoder, inverse_layers
-            #         )
+            if self.inverse_model:
+                with tf.variable_scope("inverse"):
+                    self.create_inverse_model(
+                        self.encoder, self.targ_encoder, inverse_layers
+                    )
            with tf.variable_scope("predict"):
                self.create_forward_model(

        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
        with tf.variable_scope("hard_replacement"):
-            self.target_replace_op = [
-                tf.assign(t, 0.9 * t + 0.1 * e) for t, e in zip(t_params, e_params)
+            self.target_hardcp_op = [
+                tf.assign(t, e) for t, e in zip(t_params, e_params)
            ]

+    def _create_soft_copy(self):
+        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
+        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
+        with tf.variable_scope("soft_replacement"):
+            self.target_softcp_op = [
+                tf.assign(t, (1 - self.tau) * t + self.tau * e) for t, e in zip(t_params, e_params)
+            ]

    def run_hard_copy(self):
-        self.sess.run(self.target_replace_op)
+        self.sess.run(self.target_hardcp_op)

+    def run_soft_copy(self):
+        self.sess.run(self.target_softcp_op)

    def _create_cc_actor(
        self,

            # activation=tf.tanh,
            # kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
-        # if not self.transfer:
-        encoded_next_state = tf.stop_gradient(encoded_next_state)
+        if not self.transfer:
+            encoded_next_state = tf.stop_gradient(encoded_next_state)
        squared_difference = 0.5 * tf.reduce_sum(
            tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
        )
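
For reference, a minimal self-contained sketch of the two copy ops this hunk introduces, assuming TF1-style variable collections (as exposed by mlagents.tf_utils) and the scope names target_enc / encoding used in the diff; the function name is illustrative:

from mlagents.tf_utils import tf

def build_target_encoder_copy_ops(tau=0.1):
    # Collect target-encoder and online-encoder variables by scope name.
    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
    # Hard copy: overwrite each target variable with its online counterpart.
    hard_copy = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
    # Soft copy (Polyak averaging): target <- (1 - tau) * target + tau * online.
    soft_copy = [
        tf.assign(t, (1.0 - tau) * t + tau * e) for t, e in zip(t_params, e_params)
    ]
    return hard_copy, soft_copy

Running the soft op after every update with a small tau keeps the target encoder slowly tracking the online encoder; tau=1.0 reduces it to the hard copy.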

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (8 changed lines)


        update_vals = self._execute_model(feed_dict, self.update_dict)
        # update target encoder
-        if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
-            self.policy.run_hard_copy()
+        if not self.reuse_encoder:  # and self.num_updates % self.copy_every == 0:
+            self.policy.run_soft_copy()
            # print("copy")
            # self.policy.get_encoder_weights()

        update_vals = self._execute_model(feed_dict, self.model_only_update_dict)
        # update target encoder
-        if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
-            self.policy.run_hard_copy()
+        if not self.reuse_encoder:  # and self.num_updates % self.copy_every == 0:
+            self.policy.run_soft_copy()
        for stat_name, update_name in stats_needed.items():
            if update_name in update_vals.keys():
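
Both hunks trade the periodic hard copy (every copy_every updates) for a soft copy after every update. A minimal sketch of the resulting update step, reusing names from the diff (the standalone helper itself is illustrative, not the repo's method):

def run_update_step(optimizer, feed_dict):
    # One gradient step on the optimizer's update dict.
    update_vals = optimizer._execute_model(feed_dict, optimizer.update_dict)
    # A small per-update Polyak step replaces the old
    # "hard copy every copy_every updates" schedule.
    if not optimizer.reuse_encoder:
        optimizer.policy.run_soft_copy()
    return update_vals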

ml-agents/mlagents/trainers/sac_transfer/network.py (98 changed lines)


TARGET_SCOPE = "target_network"


-class SACNetwork:
+class SACTransferNetwork:
    """
-    Base class for an SAC network. Implements methods for creating the actor and critic heads.
+    Base class for an SAC network with support for transfer. Implements methods for creating the actor and critic heads.
    """

    def __init__(

        return q1_heads, q2_heads, q1, q2


-class SACTargetNetwork(SACNetwork):
+class SACTransferTargetNetwork(SACTransferNetwork):
    """
    Instantiation for the SAC target network. Only contains a single
    value estimator and is updated from the Policy Network.

            vis_encode_type,
        )
        with tf.variable_scope(TARGET_SCOPE):
-            self.visual_in = ModelUtils.create_visual_input_placeholders(
-                policy.brain.camera_resolutions
-            )
-            self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
-            if self.policy.normalize:
-                normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
-                self.update_normalization_op = normalization_tensors.update_op
-                self.normalization_steps = normalization_tensors.steps
-                self.running_mean = normalization_tensors.running_mean
-                self.running_variance = normalization_tensors.running_variance
-                self.processed_vector_in = ModelUtils.normalize_vector_obs(
-                    self.vector_in,
-                    self.running_mean,
-                    self.running_variance,
-                    self.normalization_steps,
-                )
-            else:
-                self.processed_vector_in = self.vector_in
-                self.update_normalization_op = None
+            # self.visual_in = ModelUtils.create_visual_input_placeholders(
+            #     policy.brain.camera_resolutions
+            # )
+            # self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
+            # if self.policy.normalize:
+            #     normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
+            #     self.update_normalization_op = normalization_tensors.update_op
+            #     self.normalization_steps = normalization_tensors.steps
+            #     self.running_mean = normalization_tensors.running_mean
+            #     self.running_variance = normalization_tensors.running_variance
+            #     self.processed_vector_in = ModelUtils.normalize_vector_obs(
+            #         self.vector_in,
+            #         self.running_mean,
+            #         self.running_variance,
+            #         self.normalization_steps,
+            #     )
+            # else:
+            #     self.processed_vector_in = self.vector_in
+            #     self.update_normalization_op = None
            if self.policy.use_recurrent:
                self.memory_in = tf.placeholder(

-            hidden_streams = ModelUtils.create_observation_streams(
-                self.visual_in,
-                self.processed_vector_in,
-                1,
-                self.h_size,
-                0,
-                vis_encode_type=vis_encode_type,
-                stream_scopes=["critic/value/"],
-            )
+            # hidden_streams = ModelUtils.create_observation_streams(
+            #     self.visual_in,
+            #     self.processed_vector_in,
+            #     1,
+            #     self.h_size,
+            #     0,
+            #     vis_encode_type=vis_encode_type,
+            #     stream_scopes=["critic/value/"],
+            # )
        if self.policy.use_continuous_act:
-            self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+            self._create_cc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
        else:
-            self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
+            self._create_dc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
-    def copy_normalization(self, mean, variance, steps):
-        """
-        Copies the mean, variance, and steps into the normalizers of the
-        input of this SACNetwork. Used to copy the normalizer from the policy network
-        to the target network.
-        :param mean: Tensor containing the mean.
-        :param variance: Tensor containing the variance
-        :param steps: Tensor containing the number of steps.
-        """
-        update_mean = tf.assign(self.running_mean, mean)
-        update_variance = tf.assign(self.running_variance, variance)
-        update_norm_step = tf.assign(self.normalization_steps, steps)
-        return tf.group([update_mean, update_variance, update_norm_step])
+    # def copy_normalization(self, mean, variance, steps):
+    #     """
+    #     Copies the mean, variance, and steps into the normalizers of the
+    #     input of this SACNetwork. Used to copy the normalizer from the policy network
+    #     to the target network.
+    #     :param mean: Tensor containing the mean.
+    #     :param variance: Tensor containing the variance
+    #     :param steps: Tensor containing the number of steps.
+    #     """
+    #     update_mean = tf.assign(self.running_mean, mean)
+    #     update_variance = tf.assign(self.running_variance, variance)
+    #     update_norm_step = tf.assign(self.normalization_steps, steps)
+    #     return tf.group([update_mean, update_variance, update_norm_step])
-class SACPolicyNetwork(SACNetwork):
+class SACTransferPolicyNetwork(SACTransferNetwork):
    """
    Instantiation for SAC policy network. Contains a dual Q estimator,
    a value estimator, and a reference to the actual policy network.

        self.sequence_length_ph = self.policy.sequence_length_ph
        if self.policy.use_continuous_act:
-            self._create_cc_critic(hidden_critic, POLICY_SCOPE)
+            self._create_cc_critic(self.policy.encoder, POLICY_SCOPE)
        else:
-            self._create_dc_critic(hidden_critic, POLICY_SCOPE)
+            self._create_dc_critic(self.policy.encoder, POLICY_SCOPE)
        if self.use_recurrent:
            mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
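
With this change both critics consume encodings owned by the policy — self.policy.encoder for the online critic and self.policy.targ_encoder for the target critic — so the target network's own placeholders, normalizer, and observation streams are commented out. A small sketch of the Bellman backup this wiring supports (function and argument names are illustrative, not the repo's API):

from mlagents.tf_utils import tf

def sac_value_backup(reward, done, target_value, gamma=0.99):
    # target_value is V_target(z'), where z' = targ_encoder(next_obs) is the
    # encoding that run_soft_copy() keeps tracking the online encoder.
    # stop_gradient prevents the backup from training the target branch.
    return reward + gamma * (1.0 - done) * tf.stop_gradient(target_value)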

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (30 changed lines)


from mlagents.tf_utils import tf
from mlagents_envs.logging_util import get_logger
-from mlagents.trainers.sac_transfer.network import SACPolicyNetwork, SACTargetNetwork
+from mlagents.trainers.sac_transfer.network import SACTransferPolicyNetwork, SACTransferTargetNetwork
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy

        self.update_batch_value: Optional[tf.Operation] = None
        self.update_batch_entropy: Optional[tf.Operation] = None
-        self.policy_network = SACPolicyNetwork(
+        self.policy_network = SACTransferPolicyNetwork(
            policy=self.policy,
            m_size=self.policy.m_size,  # 3x policy.m_size
            h_size=h_size,

            stream_names=stream_names,
            vis_encode_type=vis_encode_type,
        )
-        self.target_network = SACTargetNetwork(
+        self.target_network = SACTransferTargetNetwork(
            policy=self.policy,
            m_size=self.policy.m_size,  # 1x policy.m_size
            h_size=h_size,

        self.selected_actions = (
            self.policy.selected_actions
        )  # For GAIL and other reward signals
-        if self.policy.normalize:
-            target_update_norm = self.target_network.copy_normalization(
-                self.policy.running_mean,
-                self.policy.running_variance,
-                self.policy.normalization_steps,
-            )
-            # Update the normalization of the optimizer when the policy does.
-            self.policy.update_normalization_op = tf.group(
-                [self.policy.update_normalization_op, target_update_norm]
-            )
+        # if self.policy.normalize:
+        #     target_update_norm = self.target_network.copy_normalization(
+        #         self.policy.running_mean,
+        #         self.policy.running_variance,
+        #         self.policy.normalization_steps,
+        #     )
+        #     # Update the normalization of the optimizer when the policy does.
+        #     self.policy.update_normalization_op = tf.group(
+        #         [self.policy.update_normalization_op, target_update_norm]
+        #     )
        self.policy.initialize_or_load()

"""
self.vector_in = self.policy.vector_in
self.visual_in = self.policy.visual_in
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
self.next_vector_in = self.policy.vector_next
self.next_visual_in = self.policy.visual_next
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:
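
Because the target network no longer owns observation placeholders, next-step observations are fed through the policy's vector_next / visual_next inputs. A sketch of the corresponding feed-dict wiring, assuming the usual ml-agents buffer keys (the helper name and keys are illustrative):

def next_obs_feed(policy, batch):
    # Next-step observations go to the policy's own placeholders rather
    # than to placeholders owned by the target network.
    return {
        policy.vector_in: batch["vector_obs"],
        policy.vector_next: batch["next_vector_in"],
    }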

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (941 changed lines)
Diff too large to display.

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (18 changed lines)


        forward_layers=0,
        encoder_layers=2,
        action_layers=1,
-        use_bisim=False,
+        use_bisim=True,
    )
    config = attr.evolve(
        config, hyperparameters=new_hyperparams, max_steps=500000, summary_freq=5000

    for seed in range(5):
        if seed > -1:
-            for obs in ["long-n", "longpre-n"]:
+            for obs in ["normal"]:
            # for obs in ["long-n", "longpre-n"]:
            #     test_2d_transfer(
            #         seed=seed,
            #         obs_spec_type=obs,
            #         transfer_from="./transfer_results/model_normal_s" + str(seed) + "/Simple",
            #         run_id="normal_transfer_bisim_to_" + obs,
            #     )
            for obs in ["long-n", "longpre-n"]:
                test_2d_transfer(
                    seed=seed,
                    obs_spec_type=obs,
                    transfer_from="./transfer_results/model_bisim_normal_s" + str(seed) + "/Simple",
                    run_id="normal_transfer_bisim_to_" + obs,
                )
    # # test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
    # for obs in ["normal", "rich2"]:
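
The updated driver trains on the "normal" observation spec with use_bisim=True and then transfers each seed's bisimulation model to the "long-n" and "longpre-n" specs; condensed, the sweep looks like this (test_2d_transfer is defined earlier in test_simple_transfer.py):

for seed in range(5):
    for obs in ["long-n", "longpre-n"]:
        test_2d_transfer(
            seed=seed,
            obs_spec_type=obs,
            transfer_from="./transfer_results/model_bisim_normal_s" + str(seed) + "/Simple",
            run_id="normal_transfer_bisim_to_" + obs,
        )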
