
target encoders and new forward loss

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
b74294bf
4 files changed, 264 insertions and 95 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (330 changed lines)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (14 changed lines)
  3. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (3 changed lines)
  4. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (12 changed lines)

ml-agents/mlagents/trainers/policy/transfer_policy.py (330 changed lines)


self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.trainable_variables: List[tf.Variable] = []
self.next_visual_in: List[tf.Tensor] = []
self.encoder = None
self.encoder_distribution = None
self.targ_encoder = None

# already populated, don't create more tensors.
return
self.create_input_placeholders()
self.create_next_inputs()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)],
dtype=tf.float32,

shape=[None], dtype=tf.float32, name="current_reward"
)
self.next_visual_in: List[tf.Tensor] = []
self.encoder = self._create_encoder_general(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
scope="encoding",
)
self.next_encoder = self._create_encoder_general(
self.visual_next,
self.processed_vector_next,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
scope="encoding",
reuse=True,
)
self.targ_encoder = self._create_encoder_general(
self.visual_in,
self.processed_vector_in,
self.h_size,

scope="target_enc",
stop_gradient=True,
)
self.next_targ_encoder = self._create_encoder_general(
self.visual_next,
self.processed_vector_next,

scope="target_enc",
reuse=True,
stop_gradient=True,
)
self._create_hard_copy()
self._create_soft_copy()
# self.encoder = self._create_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type,
# )
# self.targ_encoder = self._create_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type,
# reuse_encoder,
# )
self.action_encoder = self._create_action_encoder(
self.current_action,
self.h_size,

if not reuse_encoder:
self.targ_encoder = tf.stop_gradient(self.targ_encoder)
self._create_hard_copy()
self._create_soft_copy()
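For orientation, _create_hard_copy and _create_soft_copy themselves are not shown in this hunk; a soft copy in this setup is typically a Polyak update that blends the "target_enc" variables toward the "encoding" variables. A minimal sketch under that assumption (the scope names come from the calls above, while the helper name and the coefficient tau are illustrative):

import tensorflow as tf

def make_soft_copy_op(tau=0.005):
    # Online ("encoding") and target ("target_enc") encoder variables, in creation order.
    online_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
    # target <- (1 - tau) * target + tau * online, applied variable by variable.
    updates = [
        tf.assign(t, (1.0 - tau) * t + tau * o)
        for o, t in zip(online_vars, target_vars)
    ]
    return tf.group(*updates)

A hard copy is the tau = 1.0 special case: the target encoder becomes an exact snapshot of the online one.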
# if self.inverse_model:
# with tf.variable_scope("inverse"):
# self.create_inverse_model(
# self.encoder, self.targ_encoder, inverse_layers
# )
if self.inverse_model:
with tf.variable_scope("inverse"):
self.create_inverse_model(
self.encoder, self.targ_encoder, inverse_layers
)
with tf.variable_scope("predict"):
with tf.variable_scope("predict"):
self.create_forward_model(
self.predict, self.predict_distribution = self.create_forward_model(
forward_layers,
var_predict=var_predict,
)
self.targ_predict, self.targ_predict_distribution = self.create_forward_model(
self.action_encoder,
reuse=True
self.create_forward_loss(self.reuse_encoder, self.transfer)
if predict_return:
with tf.variable_scope("reward"):

run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
reuse_encoder: bool,
) -> tf.Tensor:
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
# def _create_target_encoder(
# self,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
# reuse_encoder: bool,
# ) -> tf.Tensor:
# if reuse_encoder:
# next_encoder_scope = "encoding"
# else:
# next_encoder_scope = "target_enc"
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
self.vn_update_normalization_op = vn_normalization_tensors.update_op
self.vn_normalization_steps = vn_normalization_tensors.steps
self.vn_running_mean = vn_normalization_tensors.running_mean
self.vn_running_variance = vn_normalization_tensors.running_variance
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.vn_running_mean,
self.vn_running_variance,
self.vn_normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
self.vp_update_normalization_op = None
# self.visual_next = ModelUtils.create_visual_input_placeholders(
# self.brain.camera_resolutions
# )
# self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
# if self.normalize:
# vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
# self.vn_update_normalization_op = vn_normalization_tensors.update_op
# self.vn_normalization_steps = vn_normalization_tensors.steps
# self.vn_running_mean = vn_normalization_tensors.running_mean
# self.vn_running_variance = vn_normalization_tensors.running_variance
# self.processed_vector_next = ModelUtils.normalize_vector_obs(
# self.vector_next,
# self.vn_running_mean,
# self.vn_running_variance,
# self.vn_normalization_steps,
# )
# else:
# self.processed_vector_next = self.vector_next
# self.vp_update_normalization_op = None
with tf.variable_scope(next_encoder_scope):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
reuse=reuse_encoder,
)[0]
# with tf.variable_scope(next_encoder_scope):
# hidden_stream_targ = ModelUtils.create_observation_streams(
# self.visual_next,
# self.processed_vector_next,
# 1,
# h_size,
# num_layers,
# vis_encode_type,
# reuse=reuse_encoder,
# )[0]
latent_targ = tf.layers.dense(
hidden_stream_targ,
feature_size,
name="latent",
reuse=reuse_encoder,
activation=tf.tanh, # ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent_targ
# latent_targ = tf.layers.dense(
# hidden_stream_targ,
# feature_size,
# name="latent",
# reuse=reuse_encoder,
# activation=tf.tanh, # ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
# )
# return latent_targ
# def _create_encoder(
# self,
# visual_in: List[tf.Tensor],
# vector_in: tf.Tensor,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
# ) -> tf.Tensor:
# """
# Creates an encoder for visual and vector observations.
# :param h_size: Size of hidden linear layers.
# :param num_layers: Number of hidden linear layers.
# :param vis_encode_type: Type of visual encoder to use if visual input.
# :return: The hidden layer (tf.Tensor) after the encoder.
# """
# with tf.variable_scope("encoding"):
# hidden_stream = ModelUtils.create_observation_streams(
# visual_in, vector_in, 1, h_size, num_layers, vis_encode_type,
# )[0]
# latent = tf.layers.dense(
# hidden_stream,
# feature_size,
# name="latent",
# activation=tf.tanh, # ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
# )
# return latent
def _create_encoder_general(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,

vis_encode_type: EncoderType,
scope: str,
reuse: bool=False,
stop_gradient: bool=False
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.

:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("encoding"):
with tf.variable_scope(scope):
visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
visual_in, vector_in, 1, h_size, num_layers, vis_encode_type, reuse=reuse
)[0]
latent = tf.layers.dense(

activation=tf.tanh, # ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
reuse=reuse
if stop_gradient:
latent = tf.stop_gradient(latent)
return latent
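Taken together, scope, reuse and stop_gradient let this single helper build every encoding used at the top of the file. A usage sketch mirroring those calls (policy stands in for the TransferPolicy instance and encoder_layers for the layer count the trainer passes in; both are placeholders here):

# Online encoder over the current observation (trainable, scope "encoding").
encoder = policy._create_encoder_general(
    policy.visual_in, policy.processed_vector_in,
    policy.h_size, policy.feature_size, encoder_layers,
    policy.vis_encode_type, scope="encoding",
)
# The same weights applied to the next observation (reuse=True, still trainable).
next_encoder = policy._create_encoder_general(
    policy.visual_next, policy.processed_vector_next,
    policy.h_size, policy.feature_size, encoder_layers,
    policy.vis_encode_type, scope="encoding", reuse=True,
)
# A separate target encoder; stop_gradient=True keeps it out of every loss gradient,
# so it only moves through the hard/soft copy ops.
targ_encoder = policy._create_encoder_general(
    policy.visual_in, policy.processed_vector_in,
    policy.h_size, policy.feature_size, encoder_layers,
    policy.vis_encode_type, scope="target_enc", stop_gradient=True,
)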
def _create_action_encoder(

initializer=tf.zeros_initializer(),
)
running_mean = tf.get_variable(
prefix + "_running_mean",
[vec_obs_size],
trainable=False,
dtype=tf.float32,

running_variance = tf.get_variable(
prefix + "_running_variance",
[vec_obs_size],
trainable=False,
dtype=tf.float32,
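The target normalizer's variables are now named with a caller-supplied prefix, so several independent normalizers can coexist in one graph: the next-observation normalizer keeps its "vn" names through the default, and create_reward_model below passes prefix="bi" for the bisimulation inputs. A minimal sketch of the naming scheme, assuming a plain tuple return (the helper name and the steps-variable name are illustrative, not this file's exact code):

import tensorflow as tf

def make_prefixed_normalizer(prefix, vec_obs_size):
    # prefix="vn" gives vn_running_mean / vn_running_variance,
    # prefix="bi" gives bi_running_mean / bi_running_variance.
    steps = tf.get_variable(
        prefix + "_normalization_steps", [], trainable=False,
        dtype=tf.int32, initializer=tf.zeros_initializer(),
    )
    running_mean = tf.get_variable(
        prefix + "_running_mean", [vec_obs_size], trainable=False,
        dtype=tf.float32, initializer=tf.zeros_initializer(),
    )
    running_variance = tf.get_variable(
        prefix + "_running_variance", [vec_obs_size], trainable=False,
        dtype=tf.float32, initializer=tf.ones_initializer(),
    )
    return steps, running_mean, running_variance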

self,
encoded_state: tf.Tensor,
encoded_action: tf.Tensor,
encoded_next_state: tf.Tensor,
reuse: bool = False,
separate_train: bool = False,
) -> None:
"""

"""
combined_input = tf.concat([encoded_state, encoded_action], axis=1)
hidden = combined_input
# if self.transfer:
# hidden = tf.stop_gradient(hidden)
# * (self.vis_obs_size + int(self.vec_obs_size > 0)),
reuse=reuse
predict_distribution = GaussianEncoderDistribution(
hidden, self.feature_size, reuse=reuse
)
predict = predict_distribution.sample()

predict = tf.layers.dense(

reuse=reuse
)
predict_distribution = None
return predict, predict_distribution
# if not self.transfer:
# encoded_next_state = tf.stop_gradient(encoded_next_state)
# squared_difference = 0.5 * tf.reduce_sum(
# tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
# )
# # self.forward_loss = tf.reduce_mean(squared_difference)
# self.next_state = encoded_next_state
# self.forward_loss = tf.reduce_mean(
# tf.dynamic_partition(squared_difference, self.mask, 2)[1]
# )
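The refactor above turns create_forward_model into a builder that returns its prediction and optional distribution instead of assigning self.predict directly, which is what allows the second, target-side call with reuse=True. A self-contained sketch of that shape, with a plain two-headed Gaussian standing in for the project's GaussianEncoderDistribution; the layer width and layer names are guesses:

import tensorflow as tf

def build_forward_model(encoded_state, encoded_action, feature_size,
                        forward_layers, var_predict=False, reuse=False):
    # Predict the next latent state from the current latent state and action.
    hidden = tf.concat([encoded_state, encoded_action], axis=1)
    with tf.variable_scope("predict", reuse=reuse):
        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden, 256, activation=tf.nn.relu, name="hidden_%d" % i, reuse=reuse
            )
        if var_predict:
            # Variational head: sample the prediction from a learned Gaussian.
            mu = tf.layers.dense(hidden, feature_size, name="mu", reuse=reuse)
            log_sigma = tf.layers.dense(hidden, feature_size, name="log_sigma", reuse=reuse)
            predict = mu + tf.exp(log_sigma) * tf.random.normal(tf.shape(mu))
            distribution = (mu, log_sigma)
        else:
            predict = tf.layers.dense(hidden, feature_size, name="latent", reuse=reuse)
            distribution = None
    # Returning tensors rather than setting attributes lets the caller build both
    # self.predict and self.targ_predict from the same function.
    return predict, distribution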
def create_forward_loss(self, reuse: bool, transfer: bool):
if not transfer:
if reuse:
encoded_next_state = tf.stop_gradient(self.next_encoder)
else:
encoded_next_state = self.next_targ_encoder # gradient of target encode is already stopped
self.next_state = encoded_next_state
squared_difference = 0.5 * tf.reduce_sum(
tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
)
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
if reuse:
squared_difference_1 = 0.5 * tf.reduce_sum(
tf.squared_difference(tf.tanh(self.predict), tf.stop_gradient(self.next_encoder)),
axis=1
)
squared_difference_2 = 0.5 * tf.reduce_sum(
tf.squared_difference(tf.tanh(tf.stop_gradient(self.predict)), self.next_encoder),
axis=1
)
else:
squared_difference_1 = 0.5 * tf.reduce_sum(
tf.squared_difference(tf.tanh(self.predict), self.next_targ_encoder),
axis=1
)
squared_difference_2 = 0.5 * tf.reduce_sum(
tf.squared_difference(tf.tanh(self.targ_predict), self.next_encoder),
axis=1
)
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(0.5 * squared_difference_1 + 0.5 * squared_difference_2, self.mask, 2)[1]
)
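Condensed, the new loss reads as follows (a restatement of the branches above with shortened names; mask is the trainer's int32 validity mask). In transfer mode the loss is symmetric: one term fits the forward prediction to a gradient-stopped next encoding, the other pulls the trainable next encoding toward a gradient-stopped or target-side prediction, so the dynamics model and the encoder supervise each other without a single unchecked gradient path.

import tensorflow as tf

def forward_loss_sketch(predict, targ_predict, next_encoder, next_targ_encoder,
                        mask, reuse, transfer):
    def half_sq_diff(a, b):
        # 0.5 * || tanh(a) - b ||^2, summed over the feature dimension.
        return 0.5 * tf.reduce_sum(tf.squared_difference(tf.tanh(a), b), axis=1)

    if not transfer:
        target = tf.stop_gradient(next_encoder) if reuse else next_targ_encoder
        per_sample = half_sq_diff(predict, target)
    else:
        if reuse:
            term_1 = half_sq_diff(predict, tf.stop_gradient(next_encoder))
            term_2 = half_sq_diff(tf.stop_gradient(predict), next_encoder)
        else:
            # Target-side encoding supervises the prediction, and vice versa.
            term_1 = half_sq_diff(predict, next_targ_encoder)
            term_2 = half_sq_diff(targ_predict, next_encoder)
        per_sample = 0.5 * term_1 + 0.5 * term_2

    # Average only over the valid (unmasked) samples, as in the code above.
    return tf.reduce_mean(tf.dynamic_partition(per_sample, mask, 2)[1])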
def create_reward_model(
self,

self.vector_bisim = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
bi_normalization_tensors = self.create_target_normalizer(
self.vector_bisim, prefix="bi"
)
self.bi_update_normalization_op = bi_normalization_tensors.update_op
self.bi_normalization_steps = bi_normalization_tensors.steps

# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
def create_next_inputs(self):
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
self.vn_update_normalization_op = vn_normalization_tensors.update_op
self.vn_normalization_steps = vn_normalization_tensors.steps
self.vn_running_mean = vn_normalization_tensors.running_mean
self.vn_running_variance = vn_normalization_tensors.running_variance
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.vn_running_mean,
self.vn_running_variance,
self.vn_normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
self.vp_update_normalization_op = None
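create_next_inputs mirrors the current-observation path: placeholders for the next observation plus a prefixed running normalizer. ModelUtils.normalize_vector_obs is the project's own helper; a rough stand-alone equivalent of running-statistics normalization, for reference only (the exact clipping range and step handling there may differ):

import tensorflow as tf

def normalize_vector_obs_sketch(vector_obs, running_mean, running_variance, steps):
    # Standardize with the running statistics and clip the result.
    normalized = (vector_obs - running_mean) / tf.sqrt(
        running_variance / (tf.cast(steps, tf.float32) + 1.0)
    )
    return tf.clip_by_value(normalized, -5.0, 5.0)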

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (14 changed lines)


self._init_alter_update()
self.policy.initialize_or_load()
if self.use_transfer:
self.policy.load_graph_partial(
self.transfer_path,

hyperparameters.load_encoder,
hyperparameters.load_action,
)
self.policy.run_hard_copy()
# self.policy.get_encoder_weights()
# self.policy.get_policy_weights()
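load_graph_partial (gated by the load_encoder / load_action flags) is the policy's own helper and its body is not part of this diff; in TF1 a partial restore of this kind is usually a tf.train.Saver scoped to the variable subsets being carried over. A generic sketch under that assumption (the function name and the "action_enc" scope are illustrative):

import tensorflow as tf

def restore_partial(sess, transfer_path, load_encoder=True, load_action=True):
    # Collect only the variable groups to carry over from the source run.
    var_list = []
    if load_encoder:
        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
    if load_action:
        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="action_enc")
    saver = tf.train.Saver(var_list=var_list)
    saver.restore(sess, tf.train.latest_checkpoint(transfer_path))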

update_vals = self._execute_model(feed_dict, self.update_dict)
# update target encoder
if not self.reuse_encoder:  # and self.num_updates % self.copy_every == 0:
self.policy.run_soft_copy()
# print("copy")
# self.policy.get_encoder_weights()
for stat_name, update_name in stats_needed.items():
# if update_name in update_vals.keys():

update_vals = self._execute_model(feed_dict, self.model_only_update_dict)
# update target encoder
if not self.reuse_encoder:  # and self.num_updates % self.copy_every == 0:
self.policy.run_soft_copy()
# print("copy")
# self.policy.get_encoder_weights()
for stat_name, update_name in stats_needed.items():
if update_name in update_vals.keys():
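Net effect in the PPO transfer optimizer: each gradient step, whether the main update or the model-only update, is followed by a soft copy of the online encoder into the target encoder whenever the encoder is not reused. An abridged sketch of that call order (method and attribute names are taken from this diff; the surrounding update method is otherwise omitted):

def update_sketch(optimizer, feed_dict, stats_needed):
    # Run the gradient step.
    update_vals = optimizer._execute_model(feed_dict, optimizer.update_dict)
    # Keep the target encoder trailing the online encoder.
    if not optimizer.reuse_encoder:
        optimizer.policy.run_soft_copy()
    # Collect whatever stats this update produced.
    return {
        stat_name: update_vals[update_name]
        for stat_name, update_name in stats_needed.items()
        if update_name in update_vals
    }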

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (3 changed lines)


self.use_inverse_model,
self.reuse_encoder,
self.use_bisim,
hyperparameters.tau
)
with policy.graph.as_default():

update_stats[stat_name] = update_vals[update_name]
# Update target network. By default, target update happens at every policy update.
self.sess.run(self.target_update_op)
if not self.reuse_encoder:
self.policy.run_soft_copy()
return update_stats
def update_reward_signals(

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (12 changed lines)


obs.append(
np.ones((1, self.vec_obs_size), dtype=np.float32) * (2 * i - j)
)
elif self.obs_spec_type == "long":
for name in self.names:
for i in self.positions[name]:
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * i)
for _ in range(self.extra_obs_size):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32))
elif self.obs_spec_type == "longpre":
for name in self.names:
for _ in range(self.extra_obs_size):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32))
for i in self.positions[name]:
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * i)
elif self.obs_spec_type == "long-n":
for name in self.names:
for i in self.positions[name]:

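For the test environment, the "long" and "longpre" observation specs differ only in where the constant padding observations sit relative to the position observations: "long" appends extra_obs_size all-ones vectors after the positions, while "longpre" prepends them. A stand-alone sketch of the two layouts (the argument names mirror the attributes used above):

import numpy as np

def build_obs(positions, vec_obs_size, extra_obs_size, spec_type):
    obs = []
    if spec_type == "long":
        # Position observations first, then constant padding.
        obs += [np.ones((1, vec_obs_size), dtype=np.float32) * i for i in positions]
        obs += [np.ones((1, vec_obs_size), dtype=np.float32) for _ in range(extra_obs_size)]
    elif spec_type == "longpre":
        # Constant padding first, then position observations.
        obs += [np.ones((1, vec_obs_size), dtype=np.float32) for _ in range(extra_obs_size)]
        obs += [np.ones((1, vec_obs_size), dtype=np.float32) * i for i in positions]
    return obs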