
add tanh activ

/develop/bisim-review
Andrew Cohen, 4 years ago
commit 1b17ae56
4 changed files, with 152 additions and 161 deletions
1. ml-agents/mlagents/trainers/policy/transfer_policy.py (219 lines changed)
2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (23 lines changed)
3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (8 lines changed)
4. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (63 lines changed)

ml-agents/mlagents/trainers/policy/transfer_policy.py (219 lines changed)


feature_size,
name="latent",
reuse=reuse_encoder,
# activation=ModelUtils.swish,
activation=tf.tanh,#ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent_targ

hidden_stream,
feature_size,
name="latent",
# activation=ModelUtils.swish,
activation=tf.tanh,#ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent
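
For context on the commit title: both latent heads above now use a tanh activation instead of swish, so every component of the learned feature vector is squashed into [-1, 1]. A minimal standalone sketch of that pattern (TF1-style graph mode as in this file; the tensor names and sizes here are illustrative, not the file's own):

    import numpy as np
    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    # Toy encoder: a hidden stream followed by a tanh-bounded latent layer.
    obs = tf.placeholder(tf.float32, [None, 8], name="obs")
    hidden_stream = tf.layers.dense(obs, 32, activation=tf.nn.relu, name="hidden")
    latent = tf.layers.dense(
        hidden_stream,
        4,  # feature_size
        name="latent",
        activation=tf.tanh,  # bounded output, previously swish
        kernel_initializer=tf.initializers.variance_scaling(1.0),
    )

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(latent, feed_dict={obs: np.random.randn(3, 8)})
        assert np.all(np.abs(out) <= 1.0)  # tanh keeps every component in [-1, 1]
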

return latent_targ_distribution, latent_targ
def _create_var_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates a variational encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
#def _create_var_encoder(
# self,
# visual_in: List[tf.Tensor],
# vector_in: tf.Tensor,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
#) -> tf.Tensor:
# """
# Creates a variational encoder for visual and vector observations.
# :param h_size: Size of hidden linear layers.
# :param num_layers: Number of hidden linear layers.
# :param vis_encode_type: Type of visual encoder to use if visual input.
# :return: The hidden layer (tf.Tensor) after the encoder.
# """
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
)[0]
# with tf.variable_scope("encoding"):
# hidden_stream = ModelUtils.create_observation_streams(
# visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
# )[0]
with tf.variable_scope("latent"):
latent_distribution = GaussianEncoderDistribution(
hidden_stream, feature_size
)
# with tf.variable_scope("latent"):
# latent_distribution = GaussianEncoderDistribution(
# hidden_stream, feature_size
# )
latent = latent_distribution.sample()
# latent = latent_distribution.sample()
return latent_distribution, latent
# return latent_distribution, latent
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")

def run_hard_copy(self):
self.sess.run(self.target_replace_op)
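
The hunk above shows only the ends of the two hard-copy helpers: the target-encoder variables are collected, and target_replace_op is run. The usual TF1 pattern for such a hard target-network copy is to pair variables by scope and group one assign op per pair; a generic sketch (the helper and scope names are illustrative, not this file's exact code):

    import tensorflow.compat.v1 as tf

    def build_hard_copy_op(source_scope: str, target_scope: str) -> tf.Operation:
        """Overwrite every target-scope variable with its source-scope counterpart
        (a hard copy, as opposed to a soft/Polyak-averaged update)."""
        s_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=source_scope)
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
        # Assumes the two collections line up one-to-one in creation order.
        return tf.group(*[t.assign(s) for s, t in zip(s_params, t_params)])

    # e.g. target_replace_op = build_hard_copy_op("encoding", "target_enc")
    #      sess.run(target_replace_op)
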
def _create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
with tf.variable_scope("inverse"):
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(
combined_input, self.h_size, activation=ModelUtils.swish
)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.current_action, axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
#def _create_inverse_model(
# self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
#) -> None:
# """
# Creates inverse model TensorFlow ops for Curiosity module.
# Predicts action taken given current and future encoded states.
# :param encoded_state: Tensor corresponding to encoded current state.
# :param encoded_next_state: Tensor corresponding to encoded next state.
# """
# with tf.variable_scope("inverse"):
# combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
# hidden = tf.layers.dense(
# combined_input, self.h_size, activation=ModelUtils.swish
# )
# if self.brain.vector_action_space_type == "continuous":
# pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
# squared_difference = tf.reduce_sum(
# tf.squared_difference(pred_action, self.current_action), axis=1
# )
# self.inverse_loss = tf.reduce_mean(
# tf.dynamic_partition(squared_difference, self.mask, 2)[1]
# )
# else:
# pred_action = tf.concat(
# [
# tf.layers.dense(
# hidden, self.act_size[i], activation=tf.nn.softmax
# )
# for i in range(len(self.act_size))
# ],
# axis=1,
# )
# cross_entropy = tf.reduce_sum(
# -tf.log(pred_action + 1e-10) * self.current_action, axis=1
# )
# self.inverse_loss = tf.reduce_mean(
# tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
# )
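
One detail of the inverse-model loss above that is easy to miss: tf.dynamic_partition(x, self.mask, 2)[1] keeps only the entries whose mask equals 1, so padded steps do not dilute the mean. A small self-contained example of that masking idiom (the values are illustrative):

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    losses = tf.constant([0.5, 2.0, 1.5, 4.0])
    mask = tf.constant([0, 1, 1, 0])                      # 1 = real step, 0 = padding
    active = tf.dynamic_partition(losses, mask, 2)[1]     # -> [2.0, 1.5]
    masked_mean = tf.reduce_mean(active)                  # -> 1.75

    with tf.Session() as sess:
        print(sess.run([active, masked_mean]))
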
def _create_cc_actor(
self,

:param steps: The number of steps the model was trained for
:return:
"""
self.get_policy_weights()
#self.get_policy_weights()
with self.graph.as_default():
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)

self.h_size,
# * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
# activation=ModelUtils.swish,
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)

)
self.predict = self.predict_distribution.sample()
self.predict = tf.tanh(self.predict_distribution.sample())
# activation=ModelUtils.swish,
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)

hidden,
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
# activation=ModelUtils.swish,
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.pred_reward = tf.layers.dense(

# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.reward_loss = tf.clip_by_value(
tf.reduce_mean(
self.reward_loss = tf.reduce_mean(
),
1e-10,
1.0,
)
)
#self.reward_loss = tf.clip_by_value(
# tf.reduce_mean(
# tf.squared_difference(self.pred_reward, self.current_reward)
# ),
# 1e-10,
# 1.0,
#)
def create_bisim_model(
self,

hidden,
self.h_size,
name="hidden_{}".format(i),
reuse=True
# activation=ModelUtils.swish,
reuse=True,
activation=ModelUtils.swish,
if var_predict:
self.bisim_predict_distribution = GaussianEncoderDistribution(
hidden, self.feature_size, reuse=True
)
self.bisim_predict = self.predict_distribution.sample()
else:
self.bisim_predict = tf.layers.dense(
hidden,
self.feature_size,
name="latent",
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
if predict_return:
with tf.variable_scope("reward"):
hidden = combined_input
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.bisim_pred_reward = tf.layers.dense(
hidden,
1,
name="reward",
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.bisim_predict_distribution = GaussianEncoderDistribution(
hidden, self.feature_size, reuse=True
)
self.bisim_predict = tf.tanh(self.predict_distribution.sample())
with tf.variable_scope("reward"):
hidden = combined_input
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
reuse=True
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.bisim_pred_reward = tf.layers.dense(
hidden,
1,
name="reward",
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
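
Throughout create_bisim_model the forward and reward heads are rebuilt with reuse=True, so the bisim branch pushes a second input through the same weights instead of creating new variables. A minimal illustration of that TF1 variable-sharing behaviour (the scope and layer names here are illustrative):

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    x1 = tf.placeholder(tf.float32, [None, 4])
    x2 = tf.placeholder(tf.float32, [None, 4])

    with tf.variable_scope("reward"):
        r1 = tf.layers.dense(x1, 1, name="reward")
    with tf.variable_scope("reward"):
        # reuse=True picks up the kernel/bias created above instead of
        # raising a "variable already exists" error or duplicating them.
        r2 = tf.layers.dense(x2, 1, name="reward", reuse=True)

    reward_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="reward")
    print([v.name for v in reward_vars])  # one shared kernel and bias for r1 and r2
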

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (23 lines changed)


hyperparameters.load_policy,
hyperparameters.load_value,
)
self.policy.get_encoder_weights()
self.policy.get_policy_weights()
#self.policy.get_encoder_weights()
#self.policy.get_policy_weights()
# slim.model_analyzer.analyze_vars(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), print_info=True)

if self.use_bisim:
if self.use_var_predict:
predict_diff = tf.reduce_mean(self.policy.predict_distribution.w_distance(
predict_diff = self.policy.predict_distribution.w_distance(
))
)
else:
predict_diff = tf.reduce_mean(
tf.reduce_sum(

)
if self.predict_return:
reward_diff = tf.reduce_mean(
tf.abs(
reward_diff = tf.reduce_sum(tf.abs(
)
)
), axis=1)
encode_dist = tf.reduce_mean(tf.reduce_sum(
# tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1
))
encode_dist = tf.reduce_sum(tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1)
self.bisim_loss = tf.squared_difference(encode_dist, predict_diff)
self.bisim_loss = tf.reduce_mean(tf.squared_difference(encode_dist, predict_diff))
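
For readers new to the bisimulation objective driving this hunk (in the spirit of deep bisimulation for control): the distance between two states' encodings is regressed toward a target distance built from how differently the states are rewarded and how far apart their predicted next-latent distributions are. A hedged, self-contained sketch of that loss shape (the placeholder tensors and the discount weighting are illustrative, not this optimizer's exact terms):

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()

    # z1, z2 stand in for the encodings of two sampled states; r_diff and dyn_dist
    # stand in for the reward difference and the distance between the predicted
    # next-latent distributions (e.g. a Wasserstein distance between Gaussians).
    z1 = tf.placeholder(tf.float32, [None, 16])
    z2 = tf.placeholder(tf.float32, [None, 16])
    r_diff = tf.placeholder(tf.float32, [None])
    dyn_dist = tf.placeholder(tf.float32, [None])

    encode_dist = tf.reduce_sum(tf.abs(z1 - z2), axis=1)   # per-pair L1 distance
    target_dist = r_diff + 0.99 * dyn_dist                 # assumed discount weight
    bisim_loss = tf.reduce_mean(tf.squared_difference(encode_dist, target_dist))
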
self.loss = (
self.policy_loss

train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
print("trainable", train_vars)
#print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (8 lines changed)


num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
#self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
max_num_batch = buffer_length // batch_size
max_num_batch = 20 #buffer_length // batch_size
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
buffer.sample_mini_batch(batch_size, self.policy.sequence_length), n_sequences, "model"
#buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
#)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
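
The trainer hunk above replaces sequential slicing of the off-policy buffer (make_mini_batch over [i, i + batch_size)) with random sampling and caps the number of model-update batches at 20 per epoch. The behavioural difference, sketched with plain arrays rather than the AgentBuffer API:

    import numpy as np

    rng = np.random.default_rng(0)
    buffer_data = np.arange(100)        # stand-in for the off-policy buffer contents
    batch_size, max_num_batch = 16, 20

    # Old behaviour: walk the buffer in order, one contiguous slice per update,
    # so the number of updates is tied to the buffer length.
    sequential = [buffer_data[i:i + batch_size]
                  for i in range(0, len(buffer_data) - batch_size + 1, batch_size)]

    # New behaviour: draw a fixed number of random batches, decoupling the
    # update count from the buffer length.
    sampled = [rng.choice(buffer_data, size=batch_size, replace=False)
               for _ in range(max_num_batch)]
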

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (63 lines changed)


batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=2, hidden_units=32),
network_settings=NetworkSettings(num_layers=2, hidden_units=64),
summary_freq=500,
max_steps=3000,
threaded=False,

# separate_value_train=True
# separate_value_net=True,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
network_settings=NetworkSettings(num_layers=1, hidden_units=64),
summary_freq=500,
max_steps=3000,
threaded=False,

batch_size=1200,
buffer_size=12000,
learning_rate=5.0e-3,
use_bisim=True,
use_bisim=False,
predict_return=True,
reuse_encoder=True,
separate_value_train=True,

use_op_buffer=False,
in_epoch_alter=False,
in_batch_alter=True,
policy_layers=0,
value_layers=2,
forward_layers=2,
policy_layers=1,
value_layers=1,
forward_layers=1,
feature_size=16,
feature_size=32,
config, hyperparameters=new_hyperparams, max_steps=500000, summary_freq=5000
config, hyperparameters=new_hyperparams, max_steps=350000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed

train_model=False,
separate_value_train=True,
separate_policy_train=False,
feature_size=16,
feature_size=32,
policy_layers=0,
forward_layers=2,
value_layers=2,
policy_layers=1,
forward_layers=1,
value_layers=1,
use_bisim=True,
use_bisim=False,
config, hyperparameters=new_hyperparams, max_steps=500000, summary_freq=5000
config, hyperparameters=new_hyperparams, max_steps=350000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed

if __name__ == "__main__":
for seed in range(5):
for obs in ["normal", "rich1", "rich2"]:
test_2d_model(seed=seed, obs_spec_type=obs, run_id="tmodel_" + obs)
if seed > -1:
for obs in ["normal", "rich1", "rich2"]:
test_2d_model(seed=seed, obs_spec_type=obs, run_id="model_" + obs)
# for obs in ["normal", "rich2"]:
# test_2d_transfer(
# seed=seed,
# obs_spec_type="rich1",
# transfer_from="./transfer_results/model_" + obs + "_s" + str(seed) + "/Simple",
# run_id=obs + "transfer_to_rich1",
# )
#
# for obs in ["normal", "rich1"]:
# test_2d_transfer(
# seed=seed,
# obs_spec_type="rich2",
# transfer_from="./transfer_results/model_" + obs + "_s" + str(seed) + "/Simple",
# run_id=obs + "transfer_to_rich2",
# )
for obs in ["normal", "rich2"]:
test_2d_transfer(
seed=seed,
obs_spec_type="rich1",
transfer_from="./transfer_results/model_" + obs + "_s" + str(seed) + "/Simple",
run_id=obs + "transfer_to_rich1",
)
for obs in ["normal", "rich1"]:
test_2d_transfer(
seed=seed,
obs_spec_type="rich2",
transfer_from="./transfer_results/model_" + obs + "_s" + str(seed) + "/Simple",
run_id=obs + "transfer_to_rich2",
)
# for obs in ["normal"]:
